summaryrefslogtreecommitdiff
path: root/bloom/bf.c (plain)
blob: 728bb6d954985e1967e8fd0a9cce9cdcd6fe47fd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
 
int n=16;
int verbose=0;
char *ifile=NULL;
char *tfile=NULL;
 
void usage(char *prog) {
  fprintf(stderr, "usage: %s [-n <2|4|8|...|32>] [-v] <source.txt> <test.txt>\n", prog);
  exit(-1);
}

unsigned hash_ber(char *in, size_t len) {
  unsigned hashv = 0;
  while (len--)  hashv = ((hashv) * 33) + *in++;
  return hashv;
}
unsigned hash_fnv(char *in, size_t len) {
  unsigned hashv = 2166136261UL;
  while(len--) hashv = (hashv * 16777619) ^ *in++;
  return hashv;
}
#define MASK(u,n) ( u & ((1UL << n) - 1))
#define NUM_HASHES 2
void get_hashv(char *in, size_t len, unsigned *out) {
  assert(NUM_HASHES==2);
  out[0] = MASK(hash_ber(in,len),n);
  out[1] = MASK(hash_fnv(in,len),n);
}

#define BIT_TEST(c,i) (c[i/8] & (1 << (i % 8)))
#define BIT_SET(c,i) (c[i/8] |= (1 << (i % 8)))
#define byte_len(n) (((1UL << n) / 8) + (((1UL << n) % 8) ? 1 : 0))
#define num_bits(n) (1UL << n)
char *bf_new(unsigned n) {
  char *bf = calloc(1,byte_len(n));
  return bf;
}
void bf_add(char *bf, char *line) {
  unsigned i, hashv[NUM_HASHES];
  get_hashv(line,strlen(line),hashv);
  for(i=0;i<NUM_HASHES;i++) BIT_SET(bf,hashv[i]);
}
void bf_info(char *bf, FILE *f) {
  unsigned i, on=0;
  for(i=0; i<num_bits(n); i++) 
    if (BIT_TEST(bf,i)) on++;

  fprintf(f, "%.2f%% saturation (%lu bits)\n", on*100.0/num_bits(n), num_bits(n));
}
int bf_hit(char *bf, char *line) {
  unsigned i, hashv[NUM_HASHES];
  get_hashv(line,strlen(line),hashv);
  for(i=0;i<NUM_HASHES;i++) {
    if (BIT_TEST(bf,hashv[i])==0) return 0;
  }
  return 1;
}
 
int main(int argc, char * argv[]) {
  int opt;
  FILE *ifilef=stdin,*tfilef=NULL;
  char line[100];
 
  while ( (opt = getopt(argc, argv, "n:v+")) != -1) {
    switch (opt) {
      case 'n':
        n = atoi(optarg);
        break;
      case 'v':
        verbose++;
        break;
      default:
        usage(argv[0]);
        break;
    }
  }
 
  if (optind < argc) ifile=argv[optind++];
  if (optind < argc) tfile=argv[optind++];
  if (!ifile || !tfile) usage(argv[0]);

  /* open files */
  if ( (ifilef = fopen(ifile,"r")) == NULL) {
    fprintf(stderr,"can't open %s: %s\n", ifile, strerror(errno));
    exit(-1);
  }
  if ( (tfilef = fopen(tfile,"r")) == NULL) {
    fprintf(stderr,"can't open %s: %s\n", tfile, strerror(errno));
    exit(-1);
  }

  /* make the bloom filter */
  char *bf= bf_new(n);
 
  /* loop over the source file */
  while (fgets(line,sizeof(line),ifilef) != NULL) bf_add(bf,line);

  /* print saturation etc */
  if (verbose) bf_info(bf,stderr); 

  /* now loop over the test file */
  int hit=0, miss=0;
  while (fgets(line,sizeof(line),tfilef) != NULL) {
    if (bf_hit(bf,line)) {
      hit++;
      if (verbose>1) fprintf(stderr,"hit on %s\n", line);
    } else {
      miss++;
    }
  }

  double hitrate = (hit+miss > 0) ? hit*100.0/(hit+miss) : 0;
  printf("%.2f%% hit rate (%u/%u) n=%u\n", hitrate, hit, hit+miss, n);
}