On May 25, 6:51 am, Jia Lu <[EMAIL PROTECTED]> wrote: > Hi all > > I'm trying to parse HTML with the re module. > > html = """ > <TABLE BORDER=1 cellspacing=0 cellpadding=2> > <TR> > > <TH nowrap>DATA1</TH><TH nowrap>DATA2</HT><TH nowrap>DATA3</ > HT><TH>DATA4</TH> > </TR> > > <TR><TD>DATA5</TD><TD>DATA6</TD><TD>DATA7</TD><TD>DATA8</TD></TR> > > </TABLE> > """ > > I want to get DATA1-8 from that string. (The data may not be English words.) > Can anyone tell me how to do it with regular expressions in Python? > > Thank you very much.
# example1.py
# This example will print out more than what's in the HTML table. It would also print
# out text between <body></body> tags, and so on.
try:
    import HTMLParser  # Python 2 module name
except ImportError:
    import html.parser as HTMLParser  # Python 3 location of the HTMLParser class


class DataParser(HTMLParser.HTMLParser):
    """Print every non-blank run of text that appears between tags."""

    def handle_data(self, data):
        """Print *data* with surrounding whitespace stripped; skip blank runs."""
        data = data.strip()
        if data:
            # A parenthesised single argument prints the same on Python 2 and 3.
            print(data)


html = '''
<TABLE BORDER=1 cellspacing=0 cellpadding=2>
<TR>

<TH nowrap>DATA1</TH><TH nowrap>DATA2</HT><TH nowrap>DATA3</ HT><TH>DATA4</TH>
</TR>

<TR><TD>DATA5</TD><TD>DATA6</TD><TD>DATA7</TD><TD>DATA8</TD></TR>

</TABLE>
'''

parser = DataParser()
parser.feed(html)
parser.close()

# example1.py output:
# $ python example1.py
# DATA1
# DATA2
# DATA3
# DATA4
# DATA5
# DATA6
# DATA7
# DATA8


# example2.py
# This example uses the re module to pull out only the table portions of HTML. This
# should only print out data between <table></table> tags. Notice that there is some
# data between the <body></body> tags that is not present in the output.
try:
    import HTMLParser  # Python 2 module name
except ImportError:
    import html.parser as HTMLParser  # Python 3 location of the HTMLParser class
import re


class DataParser(HTMLParser.HTMLParser):
    """Print every non-blank run of text that appears between tags."""

    def handle_data(self, data):
        """Print *data* with surrounding whitespace stripped; skip blank runs."""
        data = data.strip()
        if data:
            print(data)


html = '''
<html>
<head></head>
<body>
body data 1
<table>
<tr><td>table 1 data 1</td></tr>
<tr><td>table 1 data 2</td></tr>
</table>
<table>
<tr><td>table 2 data 1</td></tr>
<tr><td>table 2 data 2</td></tr>
</table>
body data 2
</body>
</html>
'''

# The opening tag may carry attributes (e.g. <TABLE BORDER=1 cellspacing=0>), so
# allow anything up to the closing '>' instead of matching a bare "<table>" only.
# "\b" keeps look-alike tag names such as <tablet> from matching.
# The match is non-greedy, so each table is returned separately; a table nested
# inside another table is NOT handled by this pattern (see example3.py for that).
TABLE_PATTERN = r'<table\b[^>]*>.*?</table\s*>'

tables_list = re.findall(TABLE_PATTERN, html, re.DOTALL | re.IGNORECASE)
tables_html = ' '.join(tables_list)

parser = DataParser()
parser.feed(tables_html)
parser.close()

# example2.py output:
# $ python example2.py
# table 1 data 1
# table 1 data 2
# table 2 data 1
# table 2 data 2


# example3.py
# This example does basically the same thing as example2.py, but it uses HTMLParser
# to keep track of whether the data is between <table></table> tags.
try:
    import HTMLParser  # Python 2 module name
except ImportError:
    import html.parser as HTMLParser  # Python 3 location of the HTMLParser class


class DataParser(HTMLParser.HTMLParser):
    """Print only the text that appears inside <table>...</table> elements.

    ``table_count`` is the number of <table> elements currently open, so text
    inside nested tables is printed as well.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.table_count = 0

    def handle_starttag(self, tag, attrs):
        """Record that a table was opened when *tag* is ``'table'``."""
        if tag == 'table':
            self.table_count += 1

    def handle_endtag(self, tag):
        """Record that a table was closed when *tag* is ``'table'``."""
        # Ignore a stray </table> that has no matching opener: letting the
        # counter go negative would hide the text of the next real table.
        if tag == 'table' and self.table_count > 0:
            self.table_count -= 1

    def handle_data(self, data):
        """Print stripped *data* if it is non-blank and inside a table."""
        data = data.strip()
        if data and self.table_count > 0:
            # A parenthesised single argument prints the same on Python 2 and 3.
            print(data)


html = '''
<html>
<head></head>
<body>
body data 1
<table>
<tr><td>table 1 data 1</td></tr>
<tr><td>table 1 data 2</td></tr>
</table>
<table>
<tr><td>table 2 data 1</td></tr>
<tr><td>table 2 data 2</td></tr>
</table>
body data 2
</body>
</html>
'''

parser = DataParser()
parser.feed(html)
parser.close()

# example3.py output:
# $ python example3.py
# table 1 data 1
# table 1 data 2
# table 2 data 1
# table 2 data 2

# --
# http://mail.python.org/mailman/listinfo/python-list