Hi all, I thought it was something related to some "global pattern", a "regex", to configure within Trollius. But Victor Stinner (Trollius) told me it's not a Trollius matter, and suggested me to ask here for some advice.
I attached the file which, when run, produces the following output: time ./crawl_requests.py http://www.ilsole24ore.com/english-version/front-page.shtml?refresh_ce DEBUG:trollius:Using selector: EpollSelector ('url to do = ', ' http://www.ilsole24ore.com/english-version/front-page.shtml?refresh_ce') ('processing:', ' http://www.ilsole24ore.com/english-version/front-page.shtml?refresh_ce') ('...', ' http://www.ilsole24ore.com/english-version/front-page.shtml?refresh_ce', 'has error', '"global name \'s\' is not defined"') ('done:', 1, '; ok:', 0) Can you please give me some hints? Looking forward to your kind help. Kind regards. Marco
#!/usr/bin/python import logging import re import signal import sys #import asyncio import trollius as asyncio from trollius import From from trollius import Return import urllib import urlparse #import aiohttp import requests logging.basicConfig(level=logging.DEBUG) class Crawler: def __init__(self, rooturl, loop, maxtasks=4): self.rooturl = rooturl self.loop = loop self.todo = set() self.busy = set() self.done = {} self.tasks = set() self.sem = asyncio.Semaphore(maxtasks) # connector in aiohttp stores cookies between requests and uses connection pool #self.connector = aiohttp.TCPConnector(share_cookies=True, loop=loop) # http://docs.python-requests.org/en/latest/user/advanced/#advanced # The Session object allows you to persist certain parameters across requests. # It also persists cookies across all requests made from the Session instance. # A Session object has all the methods of the main Requests API. s = requests.Session() @asyncio.coroutine def run(self): asyncio.Task(self.addurls([(self.rooturl, '')])) # Set initial work. yield From(asyncio.sleep(1)) while self.busy: yield From(asyncio.sleep(1)) #self.connector.close() # TCP connection is closed self.loop.stop() # function which parses urls @asyncio.coroutine def addurls(self, urls): for url, parenturl in urls: #url = urllib.parse.urljoin(parenturl, url) #url, frag = urllib.parse.urldefrag(url) url = urlparse.urljoin(parenturl, url) #print "riga_54-url = ", url url, frag = urlparse.urldefrag(url) #print #print "riga_57-url = ", url if (url.startswith(self.rooturl) and url not in self.busy and url not in self.done and url not in self.todo): #if (url not in self.busy and url not in self.done and url not in self.todo): print('url to do = ', url) self.todo.add(url) yield From(self.sem.acquire()) task = asyncio.Task(self.process(url)) task.add_done_callback(lambda t: self.sem.release()) task.add_done_callback(self.tasks.remove) self.tasks.add(task) @asyncio.coroutine def process(self, url): print('processing:', url) # the url is removed from the 'todo' set and added to the 'busy' set self.todo.remove(url) self.busy.add(url) try: #resp = yield From(aiohttp.request( # 'get', url, connector=self.connector)) resp = yield From(s.get(url)) except Exception as exc: print('...', url, 'has error', repr(str(exc))) self.done[url] = False else: # if no exceptions occur #if (resp.status == 200 and # ('text/html' in resp.headers.get('content-type'))): # data = (yield From(resp.read()).decode('utf-8', 'replace')) if (s.status_code == 200 and ('text/html' in s.headers.get('content-type'))): data = yield From(s.read()).decode('utf-8', 'replace') # TO BE VERIFIED!!!!!! urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data) asyncio.Task(self.addurls([(u, url) for u in urls])) s.close() print("url = ", url) # if no exceptions occur, the url is added to the 'done' dictionary self.done[url] = True # after passing through 'try-except-else' flow, the url is removed from the 'busy' set self.busy.remove(url) #print(len(self.done), 'completed tasks,', len(self.tasks), # 'still pending, todo', len(self.todo)) #print("riga_85-self.done= ", self.done) raise Return(self.done) def main(): loop = asyncio.get_event_loop() #loop = asyncio.get_event_loop.set_debug() c = Crawler(sys.argv[1], loop) asyncio.Task(c.run()) try: loop.add_signal_handler(signal.SIGINT, loop.stop) except RuntimeError: pass #loop.run_forever() try: loop.run_until_complete(c.run()) #print('todo:', len(c.todo)) #print('busy:', len(c.busy)) print('done:', len(c.done), '; ok:', sum(c.done.values())) #print('tasks:', len(c.tasks)) finally: loop.close() print if __name__ == '__main__': if '--iocp' in sys.argv: from trollius import events, windows_events sys.argv.remove('--iocp') logging.info('using iocp') el = windows_events.ProactorEventLoop() events.set_event_loop(el) main()