I occasionally run across the problem of matching one of a number of
fixed strings in text.  A very efficient way to do this is to build a
deterministic finite automaton which will halt with success upon
recognizing any of the strings, or halt with failure upon recognizing
a string that is not a prefix of any of the strings.

One low-hassle way to do this is to build a regular expression
containing a prefix tree of the fixed strings, and then use an
existing regular expression engine to do the matching for you.  Here's
Python code to do that.

#!/usr/bin/python -w

import re, string

class node:
    def __init__(self, mystr):
        self.tails = {}
        self.eos = 0  # can it be end of string?
        self.maxlen = 0
        self.add_tail(mystr)
    def as_re(self):
        if self.maxlen == 1 and len(self.tails) != 1:
            # char class optimization; dunno if this really buys us much
            rv = [ '[' + string.join(self.tails.keys(), '') + ']' ]
        else:
            # normal case
            rv = []
            for char, tail in self.tails.items():
                rv.append(re.escape(char) + tail.as_re())
        if len(rv) == 0:
            assert self.eos
            return ''
        if len(rv) == 1 and not self.eos:
            return rv[0]
        if self.maxlen == 1 and len(rv) == 1:
            stringrv = rv[0]
        else:
            # we need grouping parens if there are alternatives or
            # if the string is longer than one and self.eos
            stringrv = '(?:' + string.join(rv, '|') + ')'
        # this in itself doesn't buy us much, but in the case where there's
        # only one non-empty alternative with a maxlength of 1, it eliminates
        # the grouping parens, often transforming (?:s|) into s? or
        # (?:[sd]|) into [sd]?.
        if self.eos: stringrv = stringrv + '?'
        return stringrv
    def add_tail(self, mystr):
        if len(mystr) > self.maxlen:
            self.maxlen = len(mystr)
        if len(mystr) == 0:
            self.eos = 1
            return
        if self.tails.has_key(mystr[0]):
            self.tails[mystr[0]].add_tail(mystr[1:])
        else:
            self.tails[mystr[0]] = self.__class__(mystr[1:])

def prefixtree(strings):
    "Produce a deterministic re that will match exactly some set of strings."

    mynode = node(strings[0])  # will fail if strings is empty, but there's no RE to 
match the empty set of strings
    for mystr in strings[1:]:
        mynode.add_tail(mystr)
    return mynode.as_re()

def prefixtree_fromfile(filename, maxlen):
    f = open(filename)
    w = f.readline()
    if w.endswith('\n'): w = w[:-1]
    n = node(w)
    ii = 0
    while ii < maxlen:
        w = f.readline()
        if w.endswith('\n'): w = w[:-1]
        n.add_tail(w)
        ii += 1
    return n.as_re()

Reply via email to