Hi,

I have the following code:

from __future__ import nested_scopes
import re
from UserDict import UserDict


class Replacer(UserDict):
    """
    An all-in-one multiple string substitution class. This class was contributed by Xavier
    Defrang to the ASPN Python Cookbook (http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/81330)
    and [EMAIL PROTECTED]

    The instance is a dictionary of replacement rules (string to be replaced ->
    string to replace with). The rules are compiled into one regular expression
    when the instance is constructed; if the rules are modified afterwards,
    call _make_regex() again before substituting.

    Copyright: The methods _make_regex(), __call__() and substitute() were the work of Xavier Defrang,
    __init__() was the work of [EMAIL PROTECTED], all others were the work of Maurice Ling
    """

    # Matches '<word> (<same word>)'; compiled once instead of on every call.
    _BRACKET_DUPLICATE = re.compile(r'(\w+)\s*(\(\1\))')

    def __init__(self, dict=None, file=None):
        """Constructor. It calls for the compilation of regular expressions from either
        a dictionary object or a replacement rule file.

        The parameter names shadow builtins; they are kept because callers may
        pass them by keyword.

        @param dict: dictionary object containing replacement rules with the string to be
        replaced as keys. Ignored when file is given.
        @param file: file name of replacement rule file
        """
        self.re = None
        self.regex = None
        # Initialised here so the counter exists before the first substitute().
        self.count = 0
        if file is None:
            UserDict.__init__(self, dict)
        else:
            UserDict.__init__(self, self.readDictionaryFile(file))
        self._make_regex()

    def cleanDictionaryFile(self, file):
        """
        Method to clean up the replacement rule dictionary file and write the cleaned
        file as the same name as the original file.

        Blank and malformed lines are dropped and duplicate keys collapse to the
        last rule read; rules are written back sorted by key, one per line.

        @param file: file name of replacement rule file
        """
        rules = self.readDictionaryFile(file)
        # Build the whole content first so the file is only truncated once the
        # rules have been read successfully.
        lines = ['%s=%s\n' % (key, rules[key]) for key in sorted(rules.keys())]
        f = open(file, 'w')
        try:
            # '\n' rather than os.linesep: text mode already translates newlines,
            # so writing os.linesep would yield '\r\r\n' on Windows.
            f.writelines(lines)
        finally:
            f.close()

    def readDictionaryFile(self, file):
        """
        Method to parse a replacement rule file (file) into a dictionary for regular
        expression processing. Each rule in the rule file is in the form:
        <string to be replaced>=<string to replace with>

        Only the first '=' separates the two parts, so the replacement text may
        itself contain '='. Lines without '=' or with an empty left-hand side
        are skipped.

        @param file: file name of replacement rule file
        @return: dictionary mapping each string to be replaced to its replacement
        """
        f = open(file, 'r')
        try:
            lines = f.readlines()
        finally:
            f.close()
        rules = {}
        for line in lines:
            # Strip any line terminator, including a stray '\r' from files
            # written on another platform.
            line = line.rstrip('\r\n')
            if '=' not in line:
                continue
            key, value = line.split('=', 1)
            if not key:
                # An empty key would match at every position of the text.
                continue
            rules[key] = value
        print('%s replacement rule(s) read from %s' % (str(len(rules)), str(file)))
        return rules

    def _make_regex(self):
        """ Build a regular expression object based on the keys of the current dictionary

        Keys are tried longest first so that a key which is a prefix of another
        key (e.g. 'cat' and 'catalog') cannot shadow the longer one. With no
        usable keys, self.re and self.regex are set to None and substitute()
        leaves text unchanged.

        NOTE(review): the author reports "Internal error in regular expression
        engine" once the rule set grows to 1800+ entries; that looks like a
        pattern-size limit of the interpreter in use and is not addressed
        here -- confirm on the target Python version.
        """
        # Empty keys are ignored: an empty alternative matches everywhere.
        keys = sorted([key for key in self.keys() if key], key=len, reverse=True)
        if not keys:
            self.re = None
            self.regex = None
            return
        self.re = "(%s)" % "|".join(map(re.escape, keys))
        self.regex = re.compile(self.re)

    def __call__(self, mo):
        """ This handler will be invoked for each regex match

        @param mo: match object for one occurrence of a key
        @return: the replacement string registered for the matched text
        """
        # Count substitutions
        self.count += 1
        # Look-up string
        return self[mo.group(0)]

    def substitute(self, text):
        """ Translate text, returns the modified text.

        The number of substitutions made is left in self.count.

        @param text: text to translate
        @return: text with every occurrence of a key replaced by its value
        """
        # Reset substitution counter
        self.count = 0
        if self.regex is None:
            # No rules: nothing to replace.
            return text
        # Process text; the instance itself is the replacement callback.
        return self.regex.sub(self, text)

    def rmBracketDuplicate(self, text):
        """Removes the bracketed text in occurrences of '<text-x> (<text-x>)'

        @param text: text to clean
        @return: text with each '<text-x> (<text-x>)' reduced to '<text-x>'
        """
        return self._BRACKET_DUPLICATE.sub(r'\1', text)

    def substituteMultiple(self, text, max_passes=10):
        """Similar to substitute() method except that this method loops round the same text
        multiple times until no more substitutions can be made or when it had looped
        max_passes times. This is to pre-empt cases of recursive abbreviations.

        @param text: text to translate
        @param max_passes: upper bound on the number of passes (default 10)
        @return: the text after the final pass
        """
        passes = 0   # counter for number of runs thru the text
        changed = 1  # non-zero to get into the loop
        while changed > 0 and passes < max_passes:
            text = self.rmBracketDuplicate(self.substitute(text))
            # Only substitute() replacements are counted, not bracket removals.
            changed = self.count
            passes += 1
            print("Pass %d: Changed %d things(s)" % (passes, changed))
        return text





Normally I will use the following to instantiate my module:

# The first argument (dict) is not used when a rule file name is given as the second.
replace = Replacer('', 'rule.mdf')

rule.mdf is in the format of "<string to be replaced>=<string to replace with>\n"

I then call replace.substituteMultiple('<my text>') to carry out multiple replacements.

It all works well for rule counts up to 800+, but when my replacement rules swell to 1800+, it gives me a runtime error that says "Internal error in regular expression engine", traceable to "return self.regex.sub(self, text)" in the substitute() method.


Any ideas or workarounds?

Thanks in advance.

Cheers,
Maurice
--
http://mail.python.org/mailman/listinfo/python-list

Reply via email to