Hello!
Currently I'm trying to write a small XML-RPC server for HTML data processing.
Processing is done by the HTML Tidy library, but the problem is that it has a
massive memory leak.
As processing is a blocking operation, I'm running it in a thread, but after
some time, and after processing huge HTML documents, the daemon eats all the memory.
I'm wondering if it's possible to load utidylib in a thread, do the processing,
and after this kill the thread and release the memory? Or maybe something like deferToProcess?
Thanks in advance!
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# NOTE(review): the service code in this file calls ``tidy.parseString``;
# uTidylib is normally imported as ``tidy`` -- confirm that a module named
# ``utidylib`` is actually importable in this environment.
import utidylib

# The epoll reactor has to be installed before ``twisted.internet.reactor``
# is imported for the first time, so the order of the next three statements
# must not be changed.
from twisted.internet import epollreactor
epollreactor.install()
from twisted.internet import protocol, defer, threads, reactor
from twisted.web import xmlrpc, server
from twisted.python import log, threadpool
import sys

# Python 2 only: reload() makes sys.setdefaultencoding available again so
# that implicit str/unicode conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')
log.startLogging(sys.stdout)
import codecs
import gc

gc.enable()
# gc.DEBUG_LEAK includes gc.DEBUG_SAVEALL, which appends every unreachable
# object the collector finds to gc.garbage instead of freeing it.  In a
# long-running daemon that setting retains memory by itself, so only report
# objects that are genuinely uncollectable.
gc.set_debug(gc.DEBUG_UNCOLLECTABLE)
# NOTE(review): a generation-0 threshold of 1 makes the collector run almost
# continuously; it is kept as a debugging aid but is costly in production.
gc.set_threshold(1)
class TidyProtocol(xmlrpc.XMLRPC):
    """XML-RPC resource that cleans up HTML with HTML Tidy.

    The blocking Tidy call runs in the reactor's thread pool; the result is
    handed back to the XML-RPC layer through a Deferred.
    """

    # Tidy settings used for every request.  They never change, so the dict
    # is built once instead of on each call.
    _TIDY_OPTIONS = {
        'drop-proprietary-attributes': '1',
        'output-xhtml': '1',
        'wrap': '0',
        'bare': '0',
        'clean': '1',
        'doctype': 'omit',
        'show-body-only': '1',
        'word-2000': '0',
        'escape-cdata': '0',
        'hide-comments': '1',
        'force-output': '1',
        'alt-text': '',
        'show-errors': '0',
        'show-warnings': '0',
        'tidy-mark': '0',
        'char-encoding': 'utf8',
    }

    def xmlrpc_tidify(self, data):
        """Handle the ``tidify`` XML-RPC call.

        :param data: mapping sent by the client; its ``'html'`` entry holds
            the markup to clean.
        :return: a Deferred that fires with the value produced by
            :meth:`tidyParse` after it passed through :meth:`returnToClient`.
        """
        deferred = threads.deferToThread(self.tidyParse, data)
        deferred.addCallback(self.returnToClient)
        return deferred

    def tidyParse(self, data):
        """Run HTML Tidy over ``data['html']`` (called in a worker thread).

        :param data: mapping with an ``'html'`` entry (text or ``None``).
        :return: the tidied markup as a string, or ``None`` when
            ``data['html']`` is ``None``.
        :raises KeyError: if ``data`` has no ``'html'`` entry.
        """
        # The module's import block binds ``utidylib``, not ``tidy``; import
        # the name that is actually used here so the call cannot NameError.
        import tidy

        html = data['html']
        if html is None:
            # NOTE(review): None can only be sent back to the client if the
            # resource was created with allowNone=True -- confirm.
            return None

        print("Tidy start")
        document = tidy.parseString(html.encode('utf-8'), **self._TIDY_OPTIONS)
        # Serialize right away and return only the string: the XML-RPC layer
        # needs plain markup, and not passing the document object on means
        # nothing outside this function keeps it alive.
        return str(document)

    def returnToClient(self, data):
        """Deferred callback: force a GC pass, log, and pass ``data`` through.

        :param data: result of :meth:`tidyParse`.
        :return: ``data`` unchanged.
        """
        gc.collect()
        print("Tidy end, retunring result")
        return data
if __name__ == '__main__':
    # Publish the XML-RPC resource on TCP port 1100, then hand control to
    # the reactor until the process is stopped.
    r = TidyProtocol()
    site = server.Site(r)
    reactor.listenTCP(1100, site)
    reactor.run()
_______________________________________________
Twisted-Python mailing list
[email protected]
http://twistedmatrix.com/cgi-bin/mailman/listinfo/twisted-python