On Jun 3, 4:13 pm, bfrederi <brfrederi...@gmail.com> wrote: > On Jun 3, 3:59 pm, Chris Rebert <c...@rebertia.com> wrote: > > > > > On Thu, Jun 3, 2010 at 1:44 PM, bfrederi <brfrederi...@gmail.com> wrote: > > > I am using lxml iterparse and running into a very obscure error. When > > > I run iterparse on a file, it will occasionally return an element that > > > has a element.text == None when the element clearly has text in it. > > > > I copy and pasted the problem xml into a python string, used StringIO > > > to create a file-like object out of it, and ran a test using iterparse > > > with expected output, and it ran perfectly fine. So it only happens > > > when I try to run iterparse on the actual file. > > > > So then I tried opening the file, reading the data, turning that data > > > into a file-like object using StringIO, then running iterparse on it, > > > and the same problem (element.text == None) occurred. > > > > I even tried this: > > > f = codecs.open(abbyy_filename, 'r', encoding='utf-8') > > > file_data = f.read() > > > file_like_object = StringIO.StringIO(file_data) > > > for event, element in iterparse(file_like_object, events=("start", > > > "end")): > > > IIRC, XML parsers operate on bytes directly (since they have to > > determine the encoding themselves anyway), not pre-decoded Unicode > > characters, so I think your manual UTF-8 decoding could be the > > problem. > > Have you tried simply: > > > f = open(abbyy_filename, 'r') > > for event, element in iterparse(f, events=("start", "end")): > > #whatever > > > ? > > > Apologies if you already have, but since you didn't include the > > original, albeit probably trivial, error-causing code, this relatively > > simple error couldn't be ruled out. > > > Cheers, > > Chris > > --http://blog.rebertia.com > > Sorry for not mentioning it, but I tried that as well and it failed. > Here is the relevant class. AbbyyLine and Abbyyword just take the > element's text and writes it to a file/file-like object. parse_doc is > where I use iterparse. The relevant part is very minimal and there is > a lot of fluff to ignore, so I didn't initially post it: > > class AbbyyDocParse(object): > > """Takes an abbyy filename and parses the contents""" > def __init__(self, abbyy_filename, extension=DEFAULT_ABBYY_EXT, > format_list=OUTPUT_TYPES, string_only=False): > self.extension = extension > self.format_list = format_list > #Create the file handles for the output files > self.create_filehandles(abbyy_filename, string_only) > #Parse the document > self.parse_doc(abbyy_filename) > #Close the output filehandles > self.close_filehandles(abbyy_filename, string_only) > > def create_filehandles(self, abbyy_filename, string_only): > """Create output filehandles""" > #if output goes to a file > if not string_only: > #Make sure the file is an abbyy file > if not abbyy_filename.endswith(self.extension): > raise ParserException, "Bad abbyy filename given: %s" > \ > % (abbyy_filename) > #get the base path and filename for output files > filename = abbyy_filename.replace(self.extension, '') > #Loop through the different formats > for format_type in self.format_list: > #if output goes to a file > if not string_only: > #Create output filename > out_file = "%s%s" % (filename, > OUTPUT_EXTENSIONS.get(format_type)) > #Opens the format type filehandle > try: > setattr(self, "%s_handle" % (format_type), > open(out_file,'w')) > except: > raise IOError, "Could not open file: %s" % > (out_file) > #if output goes to a string > else: > #Opens the format type StringIO > try: > setattr(self, "%s_handle" % (format_type), > StringIO.StringIO()) > except: > raise IOError, "Could not open string output: %s" > % (out_file) > > def parse_doc(self, abbyy_filename): > """Parses the abbyy document""" > #Write the first line of the xml doc, if specified > if getattr(self, 'xml_handle', None): > self.xml_handle.write('<?xml version="1.0" > encoding="utf-8"?>\n') > #Memory efficient iterparse opens file and loops through > content > for event, element in iterparse(abbyy_filename, > events=("start", "end")): > #ignore the namespace, if it has one > if NAMESPACE_REGEX.search(element.tag, 0): > element_tag = NAMESPACE_REGEX.search(element.tag, > 0).group(1) > else: > element_tag = element.tag > #if this is the page element > if element_tag == 'page': > self.write_page(event, element) > #If at the beginning of the line > elif element_tag == 'line' and event == 'start': > #Create the line > line = AbbyyLine(element) > #Instantiate first word > word = AbbyyWord(line) > #If at the end of the line, and an output text file exists > if element_tag == 'line' and event == 'end' and \ > getattr(self, 'text_handle', None): > #output line data to text file > line.write_line(self.text_handle) > #If at the end of the line, and an output text file exists > if element_tag == 'line' and event == 'end' and \ > getattr(self, 'xml_handle', None): > #output line data to text file > word.write_word(self.xml_handle) > #if outputting to an xml file, create word data > if getattr(self, 'xml_handle', None) and \ > element_tag == 'charParams' and event == 'start': > #Insert character into word > word.insert_char(element, self.xml_handle) > #if outputting to a text file, create line data > if getattr(self, 'text_handle', None) and \ > element_tag == 'charParams' and event == 'start': > #Insert character into line > line.insert_char(element) > > def write_page(self, event, element): > """Parse the page contents""" > #page open tag event > if event == 'start': > #Write page info to xml file > if getattr(self, 'xml_handle', None): > #Get the page info > x_dim = element.get('width') > y_dim = element.get('height') > resolution = element.get('resolution') > #Write the page info to the file > self.xml_handle.write('<page>\n') > self.xml_handle.write('<filename/>\n') > self.xml_handle.write('<confidence/>\n') > self.xml_handle.write("<xDim>%s</xDim>\n" % (x_dim)) > self.xml_handle.write("<yDim>%s</yDim>\n" % (y_dim)) > self.xml_handle.write("<resolution>%s</resolution>\n" > % (resolution)) > self.xml_handle.write('<zone/>\n') > self.xml_handle.write('<wordsboundingboxes>\n') > #page close tag event > elif event == 'end': > #Write page info to xml file > if getattr(self, 'xml_handle', None): > #Write closing tags to file > self.xml_handle.write('</wordsboundingboxes>\n') > self.xml_handle.write('</page>') > > def write_line(self, event, element): > """Parse the line contents""" > #line open tag event > if event == 'start': > pass > #page close tag event > elif event == 'end': > pass > > def write_word(self, event, element): > """Parse the charParams contents""" > pass > > def close_filehandles(self, abbyy_filename, string_only): > """Close the open filehandles""" > #if the files exist > if not string_only: > #Loop through the different formats > for format_type in self.format_list: > #Opens the format type filehandle > try: > getattr(self, "%s_handle" % (format_type)).close() > except: > raise IOError, "Could not close format type: %s > for file: %s" \ > % (format_type, abbyy_filename)
I think this is a bug with iterparse. I switched to using regular parse for the parse_doc function, and it worked just fine: def parse_doc(self, abbyy_filename): """Parses the abbyy document""" #Write the first line of the xml doc, if specified if getattr(self, 'xml_handle', None): self.xml_handle.write('<?xml version="1.0" encoding="utf-8"?> \n') #Try to open the abbyy file try: f = open(abbyy_filename, "r") #abbyy_filename is already and instance of a file-like object except: #parse the abbyy file tree = parse(abbyy_filename) #parse the open abbyyfile else: tree = parse(f) f.close() root = tree.getroot() line = None for element in root.iter("*"): #ignore the namespace, if it has one if NAMESPACE_REGEX.search(element.tag, 0): element_tag = NAMESPACE_REGEX.search(element.tag, 0).group(1) else: element_tag = element.tag #if this is the page element if element_tag == 'page': self.write_page('start', element) #If at the beginning of the new line elif element_tag == 'line': #if a line already existed, and there is an output text file if line != None: if getattr(self, 'text_handle', None): #output line data to text file line.write_line(self.text_handle) elif getattr(self, 'xml_handle', None): #output line data to xml file word.write_word(self.xml_handle) #Create the line line = AbbyyLine(element) #Instantiate first word word = AbbyyWord(line) #if outputting to an xml file, create word data if getattr(self, 'xml_handle', None) and element_tag == 'charParams': #Insert character into word word.insert_char(element, self.xml_handle) #if outputting to a text file, create line data if getattr(self, 'text_handle', None) and element_tag == 'charParams': #Insert character into line line.insert_char(element) #if a line already existed, and there is an output text file if line != None: if getattr(self, 'text_handle', None): #output line data to text file line.write_line(self.text_handle) elif getattr(self, 'xml_handle', None): #output line data to xml file word.write_word(self.xml_handle) self.write_page('end', element) -- http://mail.python.org/mailman/listinfo/python-list