Re: Searching a string and extract all occurrences of a substring

Nico Grubert Thu, 31 Aug 2006 09:27:26 -0700

> Try Beautiful Soup, or if your input is simple enough, the re module.


Hi Gabriel,

I first tried "HTMLParser" and wrote this short script:

from HTMLParser import HTMLParser
from htmlentitydefs import entitydefs

class MyDocParser(HTMLParser):
    """Collect the text found inside <parameter> elements.

    Every piece of text reported while a <parameter> element is open is
    appended to ``self.paths`` as its own entry; text reported outside
    such an element is discarded.  Entity and character references that
    arrive through ``handle_entityref`` / ``handle_charref`` are turned
    into text and routed through ``handle_data`` as well.
    """

    def __init__(self):
        self.paths = []           # collected text fragments, in arrival order
        self.readingpaths = 0     # flag: 1 while inside a <parameter> element
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Start collecting text when a <parameter> tag opens."""
        if tag == 'parameter':
            self.readingpaths = 1

    def handle_endtag(self, tag):
        """Stop collecting text when a <parameter> tag closes."""
        if tag == 'parameter':
            self.readingpaths = 0

    def handle_data(self, data):
        """Record *data* if it was seen inside a <parameter> element."""
        if self.readingpaths:
            self.paths.append(data)

    def handle_entityref(self, name):
        """Handle values like 'Home &amp; Products'.

        A known entity is replaced by its ``entitydefs`` value; an
        unknown one is passed on verbatim as ``&name;``.
        """
        # ``in`` instead of dict.has_key(): has_key() does not exist on
        # Python 3 dictionaries, ``in`` works everywhere.
        if name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            self.handle_data('&' + name + ';')

    def handle_charref(self, name):
        """Handle values like 'Home &amp; Products&#174;'.

        *name* is the reference without the ``&#`` / ``;`` delimiters,
        either decimal (``'174'``) or hexadecimal (``'xAE'``).  Invalid
        references, and code points outside 1..255, are ignored.
        """
        try:
            if name[:1] in ('x', 'X'):
                charnum = int(name[1:], 16)
            else:
                charnum = int(name)
        except ValueError:
            return

        if charnum < 1 or charnum > 255:
            return

        # Previously the validated code point was dropped here, so the
        # referenced character never reached the collected text.
        self.handle_data(chr(charnum))

    def get_paths(self):
        """Return the list of text fragments collected so far."""
        return self.paths


def parse_content(content):
    """Run *content* through a fresh MyDocParser and return its paths.

    The markup is fed to the parser in a single call; the return value
    is whatever the parser's ``get_paths()`` reports afterwards.
    """
    doc_parser = MyDocParser()
    doc_parser.feed(content)
    return doc_parser.get_paths()

# /end

This works as long as there are no other <parameter> tags in the content 
that I parse.


Nico
-- 
http://mail.python.org/mailman/listinfo/python-list

Re: Searching a string and extract all occurrences of a substring

Reply via email to