I occasionally run across the problem of matching one of a number of fixed strings in text. A very efficient way to do this is to build a deterministic finite automaton which will halt with success upon recognizing any of the strings, or halt with failure upon recognizing a string that is not a prefix of any of the strings.
One low-hassle way to do this is to build a regular expression containing a prefix tree of the fixed strings, and then use an existing regular expression engine to do the matching for you. Here's Python code to do that. #!/usr/bin/python -w import re, string class node: def __init__(self, mystr): self.tails = {} self.eos = 0 # can it be end of string? self.maxlen = 0 self.add_tail(mystr) def as_re(self): if self.maxlen == 1 and len(self.tails) != 1: # char class optimization; dunno if this really buys us much rv = [ '[' + string.join(self.tails.keys(), '') + ']' ] else: # normal case rv = [] for char, tail in self.tails.items(): rv.append(re.escape(char) + tail.as_re()) if len(rv) == 0: assert self.eos return '' if len(rv) == 1 and not self.eos: return rv[0] if self.maxlen == 1 and len(rv) == 1: stringrv = rv[0] else: # we need grouping parens if there are alternatives or # if the string is longer than one and self.eos stringrv = '(?:' + string.join(rv, '|') + ')' # this in itself doesn't buy us much, but in the case where there's # only one non-empty alternative with a maxlength of 1, it eliminates # the grouping parens, often transforming (?:s|) into s? or # (?:[sd]|) into [sd]?. if self.eos: stringrv = stringrv + '?' return stringrv def add_tail(self, mystr): if len(mystr) > self.maxlen: self.maxlen = len(mystr) if len(mystr) == 0: self.eos = 1 return if self.tails.has_key(mystr[0]): self.tails[mystr[0]].add_tail(mystr[1:]) else: self.tails[mystr[0]] = self.__class__(mystr[1:]) def prefixtree(strings): "Produce a deterministic re that will match exactly some set of strings." mynode = node(strings[0]) # will fail if strings is empty, but there's no RE to match the empty set of strings for mystr in strings[1:]: mynode.add_tail(mystr) return mynode.as_re() def prefixtree_fromfile(filename, maxlen): f = open(filename) w = f.readline() if w.endswith('\n'): w = w[:-1] n = node(w) ii = 0 while ii < maxlen: w = f.readline() if w.endswith('\n'): w = w[:-1] n.add_tail(w) ii += 1 return n.as_re()