Hi, I wonder if there is a safe way to download a page with urllib2. I've constructed the following method to catch all possible exceptions.
def retrieve(url):
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent':user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        handler = urllib2.urlopen(request)
        data = handler.read()
        handler.close()
    except urllib2.HTTPError, e:
        log.warning("Server couldn't fulfill the request: %s, %s" % \
                    (url, e.code))
        return None
    except urllib2.URLError, e:
        log.warning("Failed to reach a server: %s, %s" % (url, e.reason))
        return None
    except HTTPException, e:
        log.warning("HTTP exception: %s, %s" % (url, e.__class__.__name__))
        return None
    except socket.timeout:
        log.warning("Timeout expired: %s" % (url))
        return None
    return data

But suddenly I've got the following traceback:

Traceback (most recent call last):
  File "/usr/lib/python2.5/threading.py", line 486, in __bootstrap_inner
    self.run()
  File "/home/light/prj/ym-crawl/shops/dispatcher.py", line 122, in run
    self.task(self.queue, item)
  File "scrawler.py", line 24, in spider
    data = retrieve(url)
  File "scrawler.py", line 44, in retrieve
    data = handler.read()
  File "/usr/lib/python2.5/socket.py", line 291, in read
    data = self._sock.recv(recv_size)
  File "/usr/lib/python2.5/httplib.py", line 509, in read
    return self._read_chunked(amt)
  File "/usr/lib/python2.5/httplib.py", line 563, in _read_chunked
    value += self._safe_read(chunk_left)
  File "/usr/lib/python2.5/httplib.py", line 602, in _safe_read
    chunk = self.fp.read(min(amt, MAXAMOUNT))
  File "/usr/lib/python2.5/socket.py", line 309, in read
    data = self._sock.recv(recv_size)
error: (104, 'Connection reset by peer')

What did I miss? I don't really want to catch all errors. Thanks!
-- 
http://mail.python.org/mailman/listinfo/python-list