I'm tasked with writing a 'simple' ElementTree based parser with support for
unknown entities eg &foo;.
This code derived from FL's old documentation fails in both python 2 and 3.
########################
import xml.etree.ElementTree as ET
try:
ascii
except:
from future_builtins import ascii
class EchoTarget:
def start(self, tag, attrib):
print("start %s %s"%(tag, ascii(attrib)))
def end(self, tag):
print("end %s"%tag)
def data(self, data):
print("data %s" % ascii(data))
def close(self):
print("close")
def __getattr__(self,a):
print('target attempting to get attribute %s' % a)
target = EchoTarget()
parser = ET.XMLParser(target=target)
parser.entity['foo'] = b'AAAA&fum;BBBB'
parser.entity['fum'] = b'CCCC'
print("parser.entity=%s" % ascii(parser.entity))
parser.feed("<element>some text &foo;</element>")
parser.feed("")
parser.close()
########################
The entity value doesn't seem to get referenced.
I tried this derived from
http://stackoverflow.com/questions/7237466/python-elementtree-support-for-parsing-unknown-xml-entities
########################
__all__=tuple(filter(None,'''
Xml2TT
EntityMap
'''.split()))
import xml.etree.ElementTree as ET
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
class EntityMap(dict):
def __getitem__(self,key):
try:
r = dict.__getitem__(self,key)
except:
r = '&' + key +';'
return r
class Xml2TT:
'''
create a callable object that can turns xml into a tupletree
if mutable is set to True then it's really a list tree
'''
def __init__(self,mutable=False,entityMap=None):
self._mutable = mutable
self._parser = parser = ET.XMLParser()
parser.parser.UseForeignDTD(True)
parser.entity = self._entityMap = entityMap
def __call__(self,xml):
r = self._mtt(ET.ElementTree().parse(StringIO(xml.strip()),
parser=self._parser))
return r[0]
def _mtt(self,node):
t = [node.text] if node.text else []
e = t.extend
for c in node:
e(self._mtt(c))
t = (node.tag,node.attrib,t,None)
if self._mutable:
t = list(t)
return [t,node.tail] if node.tail else [t]
if __name__=='__main__':
print(repr(Xml2TT()('<a>aaaaa<b>bbbb<c ca="123"/>22</b></a>')))
print(repr(Xml2TT()('<a>aaaaa=&=bbbbb< ></a>')))
print(repr(Xml2TT(entityMap=EntityMap({'mu': '…','foo':
'AAA&fum;BBB','fum':'CCC'}))('<a>amp=& moo=&moo; lt=< gt=> mu=μ
foo=&foo;</a>')))
########################
and it sort of works in python2, fails in python3 with
AttributeError: 'xml.etree.ElementTree.XMLParser' object has no attribute
'parser'
Even in python 2 there's a subtle bug as the output is
('a', {}, ['aaaaa', ('b', {}, ['bbbb', ('c', {'ca': '123'}, [], None), '22'],
None)], None)
('a', {}, ['aaaaa=&=bbbbb< >'], None)
('a', {}, [u'amp=& moo=&moo; lt=< gt=> mu=… foo=AAA&fum;BBB'], None)
ie the result of the &foo; lookup is not re-parsed so &fum; is not translated.
Is there a way to get a simple ElementTree based parser that can do what I want?
I have several hundred entities and the size of the DTD would probably be larger
than 99% of the strings I need to parse. I think I can live with the
non-reparsing of the map output, but can I get Python 3 to do the UseForeignDTD
thing?
--
Robin Becker
--
https://mail.python.org/mailman/listinfo/python-list