require 'bitset' # gem
require 'digest' # stdlib
require 'zlib' # stdlib
43
54class BloomFilter
# Return an array of bit indices ("on bits") for str by hashing it
# num_hashes times and reducing every hash value modulo num_bits.
#
# Rounds 0-7 each use a different algorithm, starting with the
# fastest/cheapest.  Every round beyond that performs cyclic "hashing":
# the previous CRC32 value is fed back in as the starting checksum.
#
# str        - the String to hash
# num_hashes - number of rounds, i.e. the length of the returned Array
# num_bits   - modulus applied to every hash value
#
# NOTE: round 0 relies on String#hash, which Ruby does not promise to be
# identical across separate interpreter runs, so index 0 is only stable
# within a single process.
def self.hash_bits(str, num_hashes:, num_bits:)
  val = 0 # running checksum for the cyclic CRC32 rounds
  Array.new(num_hashes) do |i|
    case i
    when 0 then str.hash
    when 1 then Zlib.crc32(str)
    when 2 then Digest::MD5.hexdigest(str).to_i(16)
    when 3 then Digest::SHA1.hexdigest(str).to_i(16)
    when 4 then Digest::SHA256.hexdigest(str).to_i(16)
    when 5 then Digest::SHA384.hexdigest(str).to_i(16)
    when 6 then Digest::SHA512.hexdigest(str).to_i(16)
    when 7 then Digest::RMD160.hexdigest(str).to_i(16)
    else # cyclic hashing with CRC32
      val = Zlib.crc32(str, val)
    end % num_bits
  end
end
# Upper bound on the filter size: CRC32 yields 32-bit values, so bit
# indices at or beyond 2**32 could never be produced.
MAX_BITS = 2**32
266
# Read-only access to the filter size in bits, the number of hashing
# rounds ("aspects") per string, and the bitmap itself.
attr_reader :bits, :aspects, :bitmap
288
# Build an empty filter.
#
# The default values require 8 kilobytes of storage and recognize
# < 7000 strings at 1% False Positive Rate (4k @ 0.1%) (10k @ 5%).
# FPR goes up as more strings are added.
#
# bits    - size of the bitmap, in bits; must not exceed MAX_BITS
# aspects - number of hashing rounds (bit indices) used per string
#
# Raises RuntimeError when bits is larger than MAX_BITS.
def initialize(bits: 2**16, aspects: 5)
  @bits = bits
  raise "bits: #{@bits}" if @bits > MAX_BITS

  @aspects = aspects
  @bitmap = Bitset.new(@bits)
end
3918
# Return an array of bit indices ("on bits") corresponding to
# multiple rounds of string hashing (CRC32 is fast and ~fine~).
#
# Each round feeds the previous CRC32 value back in as the starting
# checksum, so the @aspects rounds produce a chain of values; every
# value is reduced modulo @bits to get an index into the bitmap.
#
# str - the String to hash
#
# Returns an Array of @aspects Integers, each in 0...@bits.
def aspect_bits(str)
  val = 0 # running CRC32, carried from one round to the next
  Array.new(@aspects) { (val = Zlib.crc32(str, val)) % @bits }
end
4325
# Record str in the filter by turning on every bit index that
# aspect_bits yields for it.  Also available as <<.
def add(str)
  @bitmap.set(*aspect_bits(str))
end
alias << add
4830
# Ask the bitmap whether the bit indices that aspect_bits yields for
# str are set.
#
# true or false; a `true` result may be a "false positive"
def include?(str)
  @bitmap.set?(*aspect_bits(str))
end
5235
# Estimate how likely it is that str was really added.
#
# Returns 0 when include? says str is absent; otherwise 1.0 minus the
# current false positive rate, i.e. a number like 0.95036573.
# Also available as [].
def likelihood(str)
  include?(str) ? 1.0 - fpr : 0
end
alias [] likelihood
5741
# Fraction of the bitmap that is currently set, as a Float: the number
# of entries in the bitmap's array form divided by the total bit count.
#
# relatively expensive (converts the bitmap to an array on every call);
# don't test against this in a loop
def percent_full
  @bitmap.to_a.count.to_f / @bits
end
6146
# Estimated false positive rate: percent_full raised to the power of
# the number of aspects (hashing rounds per string).
def fpr
  percent_full**@aspects
end
6550
# One-line summary: size in bits and kB, number of aspects, how full
# the bitmap is, and the estimated false positive rate (both shown as
# percentages).  Also used for inspect.
def to_s
  # 2**13 bits == 8192 bits == 1 kB
  format('%i bits (%.1f kB, %i aspects) %i%% full; FPR: %.3f%%',
         @bits, @bits.to_f / 2**13, @aspects,
         percent_full * 100, fpr * 100)
end
alias inspect to_s
@@ -76,7 +61,7 @@ def to_s
7661 puts "Two empty lines to quit"
7762 puts
7863
79- bf = BloomFilter . new ( num_bits : 512 , num_hashes : 5 )
64+ bf = BloomFilter . new ( bits : 512 , aspects : 5 )
8065 num = 0
8166 last = ''
8267
0 commit comments