Attached is the essence of my crawler. It collects the <a> tags on a
given URL. HTML parsing is not a big deal, as "tidy" does it all for
you: it converts broken HTML into valid XHTML. From that point on there
is a wealth of XML libraries; just write whatever handler you want,
such as one for <a> elements.
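For instance, something along these lines (a minimal sketch; the
tag-soup input is just an illustration) turns broken HTML into
well-formed XHTML:

import tidy

soup = '<title>demo</title><p>hello<p>world'   # unclosed <p> tags
xhtml = str(tidy.parseString(soup, output_xhtml=True, tidy_mark=False))
print xhtml   # well-formed XHTML, ready for any XML parser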
I've extended it with multi-threading, a limit on the number of threads
per web host, more flexible element handling, etc. SQLite is nice for
building the URL db, by the way.
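For the URL db, something like this works (a minimal sketch, not my
actual schema; the table name and columns are just an illustration):

import sqlite3

conn = sqlite3.connect('urls.db')
conn.execute("""CREATE TABLE IF NOT EXISTS urls (
                    url     TEXT PRIMARY KEY,
                    crawled INTEGER DEFAULT 0)""")

def enqueue(urls):
    # INSERT OR IGNORE skips already-seen URLs for free,
    # thanks to the PRIMARY KEY constraint
    conn.executemany('INSERT OR IGNORE INTO urls (url) VALUES (?)',
                     [(u,) for u in urls])
    conn.commit()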
Kenji Noguchi
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2, cookielib
import xml.dom.minidom, tidy
from xml.parsers.expat import ExpatError
from urlparse import urlparse, urljoin
_ua = "Mozilla/5.0 (Windows; U; Windows NT 6.0; ja; rv:1.8.1.12) Gecko/20080201 Firefox/2.0.0.12"
# I'm not sure if CookieJar() is thread safe
cj = cookielib.CookieJar()
class SingleCrawler:
    def __init__(self, seed_url=None):
        self.seed_url = seed_url
        self.urls = {}    # collected hrefs, used as a set

    # static
    def _convert(self, html):
        # run tidy over (possibly broken) HTML, returning strict XHTML
        if isinstance(html, unicode):
            html = html.encode('utf-8')
        options = dict(
            doctype='strict',
            drop_proprietary_attributes=True,
            enclose_text=True,
            output_xhtml=True,
            wrap=0,
            char_encoding='utf8',
            newline='LF',
            tidy_mark=False,
        )
        return str(tidy.parseString(html, **options))

    def _collect_urls(self, node, nest=0):
        # walk the DOM, recording the href of every <a> element
        if node.nodeType == node.ELEMENT_NODE and node.nodeName == 'a':
            href = node.getAttribute('href')
            if not href.startswith('#'):    # skip in-page fragments
                p = urlparse(href)
                if p.scheme in ('', 'http', 'https'):
                    self.urls[href] = True
                else:
                    # mailto, javascript, ...
                    print p.scheme
        for i in node.childNodes:
            self._collect_urls(i, nest + 1)

    def canonicalize(self):
        # resolve relative URLs against the seed URL
        d = {}
        for url in self.urls:
            d[urljoin(self.seed_url, url).encode('ascii')] = True
        self.urls = d

    def crawl(self):
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        opener.addheaders = [('User-agent', _ua)]
        try:
            html = opener.open(self.seed_url).read()
        except urllib2.HTTPError:
            return None
        except urllib2.URLError:
            print "URL Error:", self.seed_url
            return None
        if html.startswith('<?xml'):
            # strip the XML declaration (destroy xhtml ;-) so tidy
            # treats the input as plain HTML
            html = html[html.index('?>') + 2:]
        html = self._convert(html)
        try:
            dom = xml.dom.minidom.parseString(html)
        except ExpatError:
            print "ExpatError:", html
            return None
        self._collect_urls(dom.documentElement)    # the <html> element
        self.canonicalize()
        return self.urls.keys()
if __name__ == '__main__':
    crawler = SingleCrawler()
    crawler.seed_url = 'http://www.python.org'
    next_urls = crawler.crawl()
    print next_urls
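To drive it, feed the returned URLs back into fresh crawlers, e.g.
(a minimal sketch; the two-level depth limit is arbitrary):

seen = {'http://www.python.org': True}
frontier = ['http://www.python.org']
for depth in range(2):
    next_frontier = []
    for url in frontier:
        for u in (SingleCrawler(url).crawl() or []):
            if u not in seen:   # dedupe across the whole crawl
                seen[u] = True
                next_frontier.append(u)
    frontier = next_frontier
print len(seen), 'unique URLs seen'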