I thought I'd try implementing a Bloom filter in Python, just for fun, since Python does have a nice abstraction for large bit vectors: the long int, or bignum. Turns out it's grossly inefficient to use a single long int for a Bloom filter because there's no way to set or clear bits in it.
I wrote this on Beatrice's Mac with Python 2.3, which sucked, but was better than not programming. #!/usr/bin/python """Bloom filters in Python, using SHA-1 and Python longs. My first attempt stored the whole filter in a single arbitrary-size integer, but for some reason that was 100x slower than storing it in a bunch of 256-bit integers. """ import sha def nbits_required(n): """Bits required to represent any integer in [0, n).""" n -= 1 rv = 0 while n: n >>= 1 rv += 1 return rv class Bloom: """Bloom filter: compact hash table for membership tests with false pos.""" # default bits per bucket is 256 to cut down on pickle overhead def __init__(self, size, nhashes, bucketbits=256): """size: number of bits. Should be a power of 2. nhashes: number of separate hashes to use. Making nhashes larger will make it slower. There are also tradeoffs between size, performance, and false-positive rate, which you can look up elsewhere.""" self.bucketbits = bucketbits self.filter = [0L] * int((size + bucketbits - 1) / bucketbits) self.size = size self.nhashes = nhashes self.hashbits = nbits_required(size) assert self.hashbits * nhashes <= 160 # 160's all we get with SHA1 def add(self, astr): """Add a string to the membership of the filter.""" for offset in self._hashes(astr): bucket, bit = divmod(offset, self.bucketbits) self.filter[bucket] |= (1L << bit) def __contains__(self, astr): """Returns true if the string is in the filter or it feels like it.""" for offset in self._hashes(astr): bucket, bit = divmod(offset, self.bucketbits) if not self.filter[bucket] & (1L << bit): return 0 return 1 def _hashes(self, astr): """The hashes of a particular string.""" digest = sha.sha(astr).digest() # is there no better way to convert a byte string into a long?! hashlong = 0L for ch in digest: hashlong = (hashlong << 8) | ord(ch) rv = [] mask = (1L << self.hashbits) - 1 for ii in range(self.nhashes): # Note that this will give substantially nonuniform results if # self.size is not a power of 2, in order to avoid wasting hash # bits or doing long division: rv.append((hashlong & mask) % self.size) hashlong >>= self.hashbits return rv def test_bloom(): """Very basic sanity test for Bloom filter implementation.""" def ok(a, b): assert a == b, (a, b) ok(map(nbits_required, range(1, 10)), [0, 1, 2, 2, 3, 3, 3, 3, 4]) ok(nbits_required(131072), 17) ok(nbits_required(131073), 18) b = Bloom(1024, 5) assert 'asdf' not in b assert 'fdsa' not in b b.add('asdf') assert 'asdf' in b assert 'fdsa' not in b # false positives (depends on hash function): x = Bloom(8, 3) x.add('asdf') # about a 5% chance of false positives assert 'asdf' in x assert 'fdsa' not in x ok(filter(x.__contains__, ['foo%d' % ii for ii in range(25)]), ['foo22']) test_bloom() def misspellings(infile): """Demo: spell check.""" import re, cPickle, sys try: bf = cPickle.load(file('dict.pck', 'rb')) except IOError: # /usr/share/dict/words has 234936 words on this Mac and is 2.4 megs sys.stderr.write("reading dictionary...\n") words = file('/usr/share/dict/words') # 2^21 bits, 8.9 per word, would give us 1.5% false positives with 5 # hashes or 1.7% with 6, so we use 4194304 = 2^22 bits, or 17.8 per # word, for 0.09% false positives; that's still only half a mebibyte, # although pickle overhead pushes it up to 559K, 22% of the dictionary. bf = Bloom(4194304, 5) lineno = 0 for line in words: bf.add(line[:-1].lower()) lineno += 1 if not lineno % 100: sys.stderr.write('%s%s\r' % (lineno, ' ' * 40)) sys.stderr.flush() sys.stderr.write("done reading dictionary;\n") try: cPickle.dump(bf, file('dict.pck', 'wb'), 2) except: pass def candidates(word): """Words you might find in the dictionary in English.""" yield word for suffix in ['s', 'ing', 'ed', 'es', 'er', 's', 'ly']: if word.endswith(suffix): yield word[:-len(suffix)] for suffix, repl in [('ed', 'e'), ('er', 'e'), ('ing', 'e'), ('ies', 'y'), ('ied', 'y')]: if word.endswith(suffix): yield word[:-len(suffix)] + repl for line in infile: for word in re.findall(r"['\w]+", line): # we drop the "'" because our dictionary has "didnt" but not # "didn't" for chance in candidates(word.replace("'", '').lower()): if chance in bf: break else: sys.stdout.write(word + ' ') sys.stdout.flush() # to prevent repeating it... bf.add(word) sys.stdout.write('\n') if __name__ == '__main__': import sys misspellings(file(sys.argv[1]))