Hi all,

I thought it was something related to some "global pattern", a "regex", to 
configure within Trollius.
But Victor Stinner (Trollius) told me it's not a Trollius matter, and 
suggested me to ask here for some advice.

I attached the file which, when run, produces the following output:

time ./crawl_requests.py 
http://www.ilsole24ore.com/english-version/front-page.shtml?refresh_ce 
DEBUG:trollius:Using selector: EpollSelector ('url to do = ', '
http://www.ilsole24ore.com/english-version/front-page.shtml?refresh_ce') 
('processing:', '
http://www.ilsole24ore.com/english-version/front-page.shtml?refresh_ce') 
('...', '
http://www.ilsole24ore.com/english-version/front-page.shtml?refresh_ce', 
'has error', '"global name \'s\' is not defined"') ('done:', 1, '; ok:', 0)

Can you please give me some hints?

Looking forward to your kind help. Kind regards. Marco
#!/usr/bin/python

import logging
import re
import signal
import sys
#import asyncio
import trollius as asyncio
from trollius import From
from trollius import Return
import urllib
import urlparse
#import aiohttp
import requests

logging.basicConfig(level=logging.DEBUG)
class Crawler:

    def __init__(self, rooturl, loop, maxtasks=4):
        self.rooturl = rooturl
        self.loop = loop
        self.todo = set()
        self.busy = set()
        self.done = {}
        self.tasks = set()
        self.sem = asyncio.Semaphore(maxtasks)

        # connector in aiohttp stores cookies between requests and uses connection pool
        #self.connector = aiohttp.TCPConnector(share_cookies=True, loop=loop)

        # http://docs.python-requests.org/en/latest/user/advanced/#advanced
        # The Session object allows you to persist certain parameters across requests. 
        # It also persists cookies across all requests made from the Session instance.
        # A Session object has all the methods of the main Requests API.
        s = requests.Session()

    @asyncio.coroutine
    def run(self):
        asyncio.Task(self.addurls([(self.rooturl, '')]))  # Set initial work.
        yield From(asyncio.sleep(1))
        while self.busy:
            yield From(asyncio.sleep(1))

        #self.connector.close()  # TCP connection is closed
        self.loop.stop()

    # function which parses urls
    @asyncio.coroutine
    def addurls(self, urls):
        for url, parenturl in urls:
            #url = urllib.parse.urljoin(parenturl, url)
            #url, frag = urllib.parse.urldefrag(url)
            url = urlparse.urljoin(parenturl, url)
            #print "riga_54-url = ", url
            url, frag = urlparse.urldefrag(url)
            #print
            #print "riga_57-url = ", url

            if (url.startswith(self.rooturl) and
                    url not in self.busy and
                    url not in self.done and
                    url not in self.todo):
            #if (url not in self.busy and url not in self.done and url not in self.todo):
                print('url to do = ', url)
                self.todo.add(url)
                yield From(self.sem.acquire())
                task = asyncio.Task(self.process(url))
                task.add_done_callback(lambda t: self.sem.release())
                task.add_done_callback(self.tasks.remove)
                self.tasks.add(task)


    @asyncio.coroutine
    def process(self, url):
        print('processing:', url)
        # the url is removed from the 'todo' set and added to the 'busy' set
        self.todo.remove(url)
        self.busy.add(url)
        try:
            #resp = yield From(aiohttp.request(
            #    'get', url, connector=self.connector))
            resp = yield From(s.get(url))

        except Exception as exc:
            print('...', url, 'has error', repr(str(exc)))
            self.done[url] = False
        else:  # if no exceptions occur
            #if (resp.status == 200 and
            #        ('text/html' in resp.headers.get('content-type'))):
            #    data = (yield From(resp.read()).decode('utf-8', 'replace'))
            if (s.status_code == 200 and ('text/html' in s.headers.get('content-type'))):
                data = yield From(s.read()).decode('utf-8', 'replace')  # TO BE VERIFIED!!!!!! 
                urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
                asyncio.Task(self.addurls([(u, url) for u in urls]))

            s.close()
            print("url = ", url)
            # if no exceptions occur, the url is added to the 'done' dictionary
            self.done[url] = True
        # after passing through 'try-except-else' flow, the url is removed from the 'busy' set
        self.busy.remove(url)
        #print(len(self.done), 'completed tasks,', len(self.tasks),
        #      'still pending, todo', len(self.todo))
        #print("riga_85-self.done= ", self.done)
        raise Return(self.done)


def main():
    loop = asyncio.get_event_loop()
    #loop = asyncio.get_event_loop.set_debug()
    c = Crawler(sys.argv[1], loop)
    asyncio.Task(c.run())

    try:
        loop.add_signal_handler(signal.SIGINT, loop.stop)
    except RuntimeError:
        pass
    #loop.run_forever()
    try:
        loop.run_until_complete(c.run())
        #print('todo:', len(c.todo))
        #print('busy:', len(c.busy))
        print('done:', len(c.done), '; ok:', sum(c.done.values()))
        #print('tasks:', len(c.tasks))
    finally:
        loop.close()
        print



if __name__ == '__main__':
    if '--iocp' in sys.argv:
        from trollius import events, windows_events
        sys.argv.remove('--iocp')
        logging.info('using iocp')
        el = windows_events.ProactorEventLoop()
        events.set_event_loop(el)

    main()

Reply via email to