On May 22, 2:40 am, alex23 <[EMAIL PROTECTED]> wrote: > On May 22, 8:18 am, [EMAIL PROTECTED] wrote: > > > Sorry, I'm new to both python and newsgroups, this is all pretty > > confusing. So I need a line in my __init__ function of my class? The > > spider class I made inherits from HTMLParser. It's just using the > > feed() function that produces errors though, the rest seems to work > > fine. > > Let me repeat: it would make this a lot easier if you would paste > actual code. > > As you say, your Spider class inherits from HTMLParser, so you need to > make sure that you set it up correctly so that the HTMLParser > functionality you've inherited will work correctly (or work as you > want it to work). If you've added your own __init__ to Spider, then > the __init__ on HTMLParser is no longer called unless you *explicitly* > call it yourself. > > Unfortunately, my earlier advice wasn't totally correct... HTMLParser > is an old-style object, whereas super() only works for new-style > objects, I believe. (If you don't know about old- v new-style objects, > see http://docs.python.org/ref/node33.html). 
So there are a couple of > approaches that should work for you: > > class SpiderBroken(HTMLParser): > def __init__(self): > pass # don't do any ancestral setup > > class SpiderOldStyle(HTMLParser): > def __init__(self): > HTMLParser.__init__(self) > > class SpiderNewStyle(HTMLParser, object): > def __init__(self): > super(SpiderNewStyle, self).__init__() > > Python 2.5.1 (r251:54863, May 1 2007, 17:47:05) [MSC v.1310 32 bit > (Intel)] on win32 > Type "help", "copyright", "credits" or "license" for more information. > >>> html = open('temp.html','r').read() > >>> from spider import * > >>> sb = SpiderBroken() > >>> sb.feed(html) > > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "C:\Python25\lib\HTMLParser.py", line 107, in feed > self.rawdata = self.rawdata + data > AttributeError: SpiderBroken instance has no attribute 'rawdata' > > >>> so = SpiderOldStyle() > >>> so.feed(html) > >>> sn = SpiderNewStyle() > >>> sn.feed(html) > > The old-style version is probably easiest, so putting this line in > your __init__ should fix your issue: > > HTMLParser.__init__(self) > > If this still isn't clear, please let me know. > > - alex23
OK, here's what I have so far: #!/usr/bin/env python from HTMLParser import HTMLParser from urllib2 import urlopen, HTTPError class Spider(HTMLParser): def __init__(self): HTMLParser.__init__(self) self.found = [] self.queue = [] def handle_starttag(self, tag, attrs): try: if tag == 'a': if attrs[0][0] == 'href': self.queue.append(attrs[0][1]) except HTMLParseError: print 'Error parsing HTML tags' def parse(self, page): try: self.feed(urlopen('http://' + page).read()) except HTTPError: print 'Error getting page source' def crawl(self, site): self.queue.append(site) while 1: try: url = self.queue.pop(0) self.parse(url) except IndexError: break self.found.append(url) return self.found if __name__ == '__main__': s = Spider() site = raw_input("What site would you like to scan? http://") s.crawl(site) Still getting very odd errors though, this being the latest: Traceback (most recent call last): File "spider.py", line 38, in <module> s.crawl(site) File "spider.py", line 30, in crawl self.parse(url) File "spider.py", line 21, in parse self.feed(urlopen('http://' + page).read()) File "/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/urllib2.py", line 124, in urlopen return _opener.open(url, data) File "/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/urllib2.py", line 381, in open response = self._open(req, data) File "/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/urllib2.py", line 399, in _open '_open', req) File "/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/urllib2.py", line 360, in _call_chain result = func(*args) File "/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/urllib2.py", line 1107, in http_open return self.do_open(httplib.HTTPConnection, req) File "/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/urllib2.py", line 1064, in do_open h = http_class(host) # will parse host:port File "/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/httplib.py", 
line 639, in __init__ self._set_hostport(host, port) File "/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/httplib.py", line 651, in _set_hostport raise InvalidURL("nonnumeric port: '%s'" % host[i+1:]) httplib.InvalidURL: nonnumeric port: '' Also could you explain why I needed to add that HTMLParser.__init__(self) line? Does it matter that I have overridden the __init__ function of spider? Thanks -- http://mail.python.org/mailman/listinfo/python-list