Hi all,
At first I thought the error below was related to some "global pattern", a "regex", to be configured within Trollius.
But Victor Stinner (Trollius) told me it is not a Trollius matter, and suggested that I ask here for some advice.
I have attached the file which, when run, produces the following output:
time ./crawl_requests.py http://www.ilsole24ore.com/english-version/front-page.shtml?refresh_ce
DEBUG:trollius:Using selector: EpollSelector
('url to do = ', 'http://www.ilsole24ore.com/english-version/front-page.shtml?refresh_ce')
('processing:', 'http://www.ilsole24ore.com/english-version/front-page.shtml?refresh_ce')
('...', 'http://www.ilsole24ore.com/english-version/front-page.shtml?refresh_ce', 'has error', '"global name \'s\' is not defined"')
('done:', 1, '; ok:', 0)
Can you please give me some hints?
Looking forward to your kind help.
Kind regards,
Marco
#!/usr/bin/python
import logging
import re
import signal
import sys
#import asyncio
import trollius as asyncio
from trollius import From
from trollius import Return
import urllib
import urlparse
#import aiohttp
import requests
logging.basicConfig(level=logging.DEBUG)
class Crawler:

    def __init__(self, rooturl, loop, maxtasks=4):
        self.rooturl = rooturl
        self.loop = loop
        self.todo = set()
        self.busy = set()
        self.done = {}
        self.tasks = set()
        self.sem = asyncio.Semaphore(maxtasks)
        # connector in aiohttp stores cookies between requests and uses connection pool
        #self.connector = aiohttp.TCPConnector(share_cookies=True, loop=loop)
        # http://docs.python-requests.org/en/latest/user/advanced/#advanced
        # The Session object allows you to persist certain parameters across requests.
        # It also persists cookies across all requests made from the Session instance.
        # A Session object has all the methods of the main Requests API.
        s = requests.Session()
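        # (For reference, the Session API quoted above is used roughly like this;
        #  the response data lives on the object returned by get(), not on the
        #  Session itself. The URL is only illustrative:
        #      s = requests.Session()
        #      r = s.get('http://example.com')
        #      r.status_code, r.headers.get('content-type'), r.text
        # )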
    @asyncio.coroutine
    def run(self):
        asyncio.Task(self.addurls([(self.rooturl, '')]))  # Set initial work.
        yield From(asyncio.sleep(1))
        while self.busy:
            yield From(asyncio.sleep(1))
        #self.connector.close()  # TCP connection is closed
        self.loop.stop()
    # function which parses urls
    @asyncio.coroutine
    def addurls(self, urls):
        for url, parenturl in urls:
            #url = urllib.parse.urljoin(parenturl, url)
            #url, frag = urllib.parse.urldefrag(url)
            url = urlparse.urljoin(parenturl, url)
            #print "riga_54-url = ", url
            url, frag = urlparse.urldefrag(url)
            #print
            #print "riga_57-url = ", url
            if (url.startswith(self.rooturl) and
                    url not in self.busy and
                    url not in self.done and
                    url not in self.todo):
                #if (url not in self.busy and url not in self.done and url not in self.todo):
                print('url to do = ', url)
                self.todo.add(url)
                yield From(self.sem.acquire())
                task = asyncio.Task(self.process(url))
                task.add_done_callback(lambda t: self.sem.release())
                task.add_done_callback(self.tasks.remove)
                self.tasks.add(task)
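                # (The semaphore acquired above caps the number of in-flight
                #  process() tasks at 'maxtasks'; each done-callback releases
                #  one slot again.)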
    @asyncio.coroutine
    def process(self, url):
        print('processing:', url)
        # the url is removed from the 'todo' set and added to the 'busy' set
        self.todo.remove(url)
        self.busy.add(url)
        try:
            #resp = yield From(aiohttp.request(
            #    'get', url, connector=self.connector))
            resp = yield From(s.get(url))
        except Exception as exc:
            print('...', url, 'has error', repr(str(exc)))
            self.done[url] = False
        else:  # if no exceptions occur
            #if (resp.status == 200 and
            #        ('text/html' in resp.headers.get('content-type'))):
            #    data = (yield From(resp.read()).decode('utf-8', 'replace'))
            if (s.status_code == 200 and ('text/html' in s.headers.get('content-type'))):
                data = yield From(s.read()).decode('utf-8', 'replace')  # TO BE VERIFIED!!
                urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
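                # (Example of what the pattern above extracts, on illustrative input:
                #  re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', '<a HREF="/page.html">x</a>')
                #  -> ['/page.html'])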
                asyncio.Task(self.addurls([(u, url) for u in urls]))
            s.close()
            print("url = ", url)
            # if no exceptions occur, the url is added to the 'done' dictionary
            self.done[url] = True
        # after passing through 'try-except-else' flow, the url is removed from the 'busy' set
        self.busy.remove(url)
        #print(len(self.done), 'completed tasks,', len(self.tasks),
        #      'still pending, todo', len(self.todo))
        #print("riga_85-self.done= ", self.done)
        raise Return(self.done)
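        # (In Trollius, 'raise Return(value)' is the Python 2 equivalent of
        #  'return value' inside a coroutine.)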
def main():
    loop = asyncio.get_event_loop()
    #loop = asyncio.get_event_loop.set_debug()
    c = Crawler(sys.argv[1], loop)
    asyncio.Task(c.run())  # start the crawl
    loop.run_forever()     # Crawler.run() stops the loop once nothing is busy
    print('done:', len(c.done), '; ok:', sum(c.done.values()))


if __name__ == '__main__':
    main()