I'm tasked with writing a 'simple' ElementTree based parser with support for unknown entities eg &foo;.

This code derived from FL's old documentation fails in both python 2 and 3.
########################
import xml.etree.ElementTree as ET
try:
    ascii
except:
    from future_builtins import ascii

class EchoTarget:
    def start(self, tag, attrib):
        print("start %s %s"%(tag, ascii(attrib)))
    def end(self, tag):
        print("end %s"%tag)
    def data(self, data):
        print("data %s" % ascii(data))
    def close(self):
        print("close")

    def __getattr__(self,a):
        print('target attempting to get attribute %s' % a)

target = EchoTarget()
parser = ET.XMLParser(target=target)
parser.entity['foo'] = b'AAAA&fum;BBBB'
parser.entity['fum'] = b'CCCC'
print("parser.entity=%s" % ascii(parser.entity))
parser.feed("<element>some text &foo;</element>")
parser.feed("")
parser.close()
########################

The entity value doesn't seem to get referenced.



I tried this derived from
http://stackoverflow.com/questions/7237466/python-elementtree-support-for-parsing-unknown-xml-entities

########################
__all__=tuple(filter(None,'''
        Xml2TT
        EntityMap
        '''.split()))
import xml.etree.ElementTree as ET
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

class EntityMap(dict):
    def __getitem__(self,key):
        try:
            r = dict.__getitem__(self,key)
        except:
            r = '&amp;' + key +';'
        return r

class Xml2TT:
    '''
    create a callable object that can turns xml into a tupletree
    if mutable is set to True then it's really a list tree
    '''
    def __init__(self,mutable=False,entityMap=None):
        self._mutable = mutable
        self._parser = parser = ET.XMLParser()
        parser.parser.UseForeignDTD(True)
        parser.entity = self._entityMap = entityMap

    def __call__(self,xml):
r = self._mtt(ET.ElementTree().parse(StringIO(xml.strip()), parser=self._parser))
        return r[0]

    def _mtt(self,node):
        t = [node.text] if node.text else []
        e = t.extend
        for c in node:
            e(self._mtt(c))
        t = (node.tag,node.attrib,t,None)
        if self._mutable:
            t = list(t)
        return [t,node.tail] if node.tail else [t]

if __name__=='__main__':
    print(repr(Xml2TT()('<a>aaaaa<b>bbbb<c ca="123"/>22</b></a>')))
    print(repr(Xml2TT()('<a>aaaaa=&amp;=bbbbb&lt; &gt;</a>')))
print(repr(Xml2TT(entityMap=EntityMap({'mu': '&#x85;','foo': 'AAA&fum;BBB','fum':'CCC'}))('<a>amp=&amp; moo=&moo; lt=&lt; gt=&gt; mu=&mu; foo=&foo;</a>')))
########################

and it sort of works in python2, fails in python3 with

AttributeError: 'xml.etree.ElementTree.XMLParser' object has no attribute
'parser'

Even in python 2 there's a subtle bug as the output is

('a', {}, ['aaaaa', ('b', {}, ['bbbb', ('c', {'ca': '123'}, [], None), '22'], None)], None)
('a', {}, ['aaaaa=&=bbbbb< >'], None)
('a', {}, [u'amp=& moo=&amp;moo; lt=< gt=> mu=&#x85; foo=AAA&fum;BBB'], None)

ie the result of the &foo; lookup is not re-parsed so &fum; is not translated.

Is there a way to get a simple ElementTree based parser that can do what I want? I have several hundred entities and the size of the DTD would probably be larger than 99% of the strings I need to parse. I think I can live with the non-reparsing of the map output, but can I get Python 3 to do the UseForeignDTD thing?
--
Robin Becker

--
https://mail.python.org/mailman/listinfo/python-list

Reply via email to