Hi,

I wonder if there is a safe way to download a page with urllib2. I've
constructed the following method to catch all possible exceptions.

def retrieve(url):
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        handler = urllib2.urlopen(request)
        data = handler.read()
        handler.close()
    except urllib2.HTTPError, e:
        log.warning("Server couldn't fulfill the request: %s, %s" % \
            (url, e.code))
        return None
    except urllib2.URLError, e:
        log.warning("Failed to reach a server: %s, %s" % (url,
e.reason))
        return None
    except HTTPException, e:
        log.warning("HTTP exception: %s, %s" % (url,
e.__class__.__name__))
        return None
    except socket.timeout:
        log.warning("Timeout expired: %s" % (url))
        return None
    return data


But suddenly I got the following traceback:

Traceback (most recent call last):
  File "/usr/lib/python2.5/threading.py", line 486, in
__bootstrap_inner
    self.run()
  File "/home/light/prj/ym-crawl/shops/dispatcher.py", line 122, in
run
    self.task(self.queue, item)
  File "scrawler.py", line 24, in spider
    data = retrieve(url)
  File "scrawler.py", line 44, in retrieve
    data = handler.read()
  File "/usr/lib/python2.5/socket.py", line 291, in read
    data = self._sock.recv(recv_size)
  File "/usr/lib/python2.5/httplib.py", line 509, in read
    return self._read_chunked(amt)
  File "/usr/lib/python2.5/httplib.py", line 563, in _read_chunked
    value += self._safe_read(chunk_left)
  File "/usr/lib/python2.5/httplib.py", line 602, in _safe_read
    chunk = self.fp.read(min(amt, MAXAMOUNT))
  File "/usr/lib/python2.5/socket.py", line 309, in read
    data = self._sock.recv(recv_size)
error: (104, 'Connection reset by peer')

What did I miss? I don't really want to catch all errors. Thanks!
--
http://mail.python.org/mailman/listinfo/python-list

Reply via email to