Thanks for the help. I converted everything into the StringIO() format. Memory is still getting chewed up. I will look at ElementTree later but for now I believe the speed issue must be related to the amount of memory that is getting used. It is causing all of windows to slow to a crawl. gc.collect() still reports the same quantity as before. Don't know what to try next. Updated program is below:
from xml.dom import minidom import os from cStringIO import StringIO class xmlProcessing: """ General class for XML processing""" def process(self, filename="", xmlString=""): if xmlString: pass elif filename: xmldoc = minidom.parse(filename) self.parse( xmldoc.documentElement ) def parseBranch(self, parentNode): """ Process an XML branch """ for node in parentNode.childNodes: try: parseMethod = getattr(self, "parse_%s" % node.__class__.__name__) except AttributeError: continue if parseMethod(node): continue self.parseBranch(node) del node def parse_Document(self, node): pass def parse_Text(self, node): pass def parse_Comment(self, node): pass def parse_Element(self, node): try: handlerMethod = getattr(self, "do_%s" % node.tagName) except AttributeError: return False handlerMethod(node) return True class reptorParsing(xmlProcessing): """ Specific class for generating a SQLalchemy program to create tables and populate them with data""" def __init__(self): self.schemaPreface = StringIO() self.schemaPreface.write("""from sqlalchemy import * from sqlalchemy.ext.declarative import declarative_base engine = create_engine('sqlite:///tutorial.db', echo=False) metadata = MetaData() Base = declarative_base()""") self.schemaTables = StringIO() self.schemaFields = StringIO() self.dataUpdate = StringIO() self.tableDict = {} self.tableName = StringIO() self.tables = StringIO() def parse(self, parentNode): """Main entry point to begin processing a XML document""" self.parseBranch(parentNode) # Properties such as schemaTables and .tables are populated by the various methods below fupdate=open(os.path.join(os.getcwd(), "update.py"), 'w') if self.schemaTables: fupdate.write("import schema\n") f=open(os.path.join(os.getcwd(), "schema.py"), 'w') f.write(self.schemaPreface+"\n"+self.schemaTables+ '\n' + "metadata.create_all(engine)\n"+ "print 'hello 2'") f.close() if self.tables: fupdate.write(self.tables) fupdate.close() def do_TABLES(self, tableNode): """Process schema for tables""" for node in tableNode.childNodes: self.tableName = node.tagName # Define a declaritive mapping class self.schemaTables.write("""\nclass %s(Base): __tablename__ = '%s' """ % (self.tableName, self.tableName)) self.schemaFields = StringIO() # allow for userA = users("Billy","Bob") via a __init__() self.schemaInitPreface = StringIO() self.schemaInitPreface.write(" def __init__(self") self.schemaInitBody = StringIO() self.parseBranch(node) self.schemaInitPreface.write("):\n") self.schemaTables.write(self.schemaFields.read() + "\n" + \ self.schemaInitPreface.read() + \ self.schemaInitBody.read() + "\n") def do_FIELDS(self, fieldsNode): """Process schema for fields within tables""" for node in fieldsNode.childNodes: if self.schemaFields: self.schemaFields.write("\n") cType = "" # The attribute type holds the type of field crType = node.attributes["type"].value if crType==u"C": cType = "String(length=%s)" % node.attributes["len"].value elif crType==u"N" and node.attributes["dec"].value==u'0': cType = "Integer" elif crType==u"N": cType = "Numeric(precision=%s, scale=%s)" % (node.attributes["len"].value,node.attributes["dec"].value) elif crType==u"L": cType = "Boolean" elif crType==u"T": cType = "DateTime" elif crType==u"D": cType = "Date" elif crType==u"M" or crType==u"G": cType = "Text" if node.attributes.getNamedItem("primary"): cType += ", primary_key=True" self.schemaFields.write(" %s = Column(%s)" % (node.tagName, cType)) self.schemaInitPreface.write(", \\\n %s" % (node.tagName)) self.schemaInitBody.write(" self.%s = %s\n" % (node.tagName, node.tagName)) self.tableDict[self.tableName + "." + node.tagName] = crType def do_DATA(self, dataNode): """This is for processing actual data to be pushed into the tables Layout is DATA -> TABLE_NAME key='primary_field' -> TUPLE -> FIELD_NAME -> VALUE""" for node in dataNode.childNodes: self.tableName = node.tagName self.dataUpdate=open(os.path.join(os.getcwd(), self.tableName + "_update.py"), 'w') self.dataUpdate.write(""" import time from datetime import * from sqlalchemy import * from sqlalchemy.orm import * engine = create_engine('sqlite:///tutorial.db', echo=False) Session = sessionmaker() Session.configure(bind=engine) session = Session() """) self.keyValue = "" self.keyField = node.attributes["key"].value self.parseBranch(node) self.tables.write("\nimport %s_update.py" % (self.tableName)) # f.write(self.dataUpdate) self.dataUpdate.close() def do_TUPLE(self, tupleNode): """ A TUPLE is what the XML file refers to a table row Sits below a DATA child""" self.dataUpdate.write(""" entry = %s() session.add(entry) """ % (self.tableName)) for node in tupleNode.childNodes: for dataNode in node.childNodes: crType = self.tableDict[self.tableName + "." + node.tagName] if crType==u"C" or crType==u"M": cValue = u'"""%s"""' % dataNode.data elif crType==u"T": cValue = 'datetime.strptime("'+dataNode.data+'", "%Y-%m-%d %H:%M")' elif crType==u"D": cValue = 'datetime.strptime("'+dataNode.data+'", "%Y-%m-%d")' else: cValue = dataNode.data self.dataUpdate.write(u"\nentry."+node.tagName+ u" = " + cValue) self.dataUpdate.write("\nsession.commit()") if __name__ == '__main__': replicate = reptorParsing() replicate.process(filename=os.path.join(os.getcwd(), "request.xml")) import update -- http://mail.python.org/mailman/listinfo/python-list