Hi,
I need advice on improving my crawler!
The crawler is meant to crawl the web continuously, without stopping, and find expired domains.
Can someone help me improve the code?
class MyItem(Item):
    """Item carrying the registered domain of a URL whose DNS lookup failed."""

    # Registered domain (e.g. "example.com") extracted via tldextract.
    url = Field()
class HttpbinSpider(CrawlSpider):
    """Crawl the web continuously and report likely-expired domains.

    Every extracted request gets an errback attached; when a request fails
    with a DNS lookup error, the registered domain is logged, stored via
    ``insert_table`` and yielded as a :class:`MyItem`.
    """

    name = "expired"
    start_urls = ['http://www.website.com']

    rules = (
        Rule(
            LinkExtractor(
                # LinkExtractor patterns are regexes: an unescaped '.'
                # matches ANY character (so '.com' would also match 'xcom').
                # The dots are escaped to match the literal TLD substring.
                allow=(r'\.com', r'\.fr', r'\.net', r'\.org', r'\.info',
                       r'\.casino'),
                # Skip huge well-known sites that will never be expired.
                deny=('facebook', 'amazon', 'wordpress', 'blogspot', 'free',
                      'reddit', 'fnac', 'tumblr', 'videos', 'youtube',
                      'google', 'doubleclick', 'microsoft', 'yahoo', 'bing',
                      'znet', 'stackexchang', 'twitter', 'wikipedia',
                      'creativecommons', 'mediawiki', 'wikidata'),
            ),
            process_request='add_errback',
            follow=True,
        ),
    )

    custom_settings = {
        'RETRY_ENABLED': True,
        # DEPTH_LIMIT 0 = unlimited depth; DEPTH_PRIORITY 1 biases toward
        # breadth-first crawling so the frontier spreads across many sites.
        'DEPTH_LIMIT': 0,
        'DEPTH_PRIORITY': 1,
        'LOG_ENABLED': False,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 32,
        'CONCURRENT_REQUESTS': 64,
    }

    def add_errback(self, request):
        """Attach the DNS-failure errback to each request produced by the rule.

        :param request: the Request built by the LinkExtractor rule.
        :returns: a copy of the request with ``errback`` set.
        """
        return request.replace(errback=self.errback_httpbin)

    def errback_httpbin(self, failure):
        """Handle download failures; emit an item on DNS lookup errors.

        Only :class:`DNSLookupError` failures are treated as expired
        domains; every other failure type is ignored.

        :param failure: the Twisted Failure passed to the errback.
        :yields: a :class:`MyItem` with the registered domain.
        """
        if failure.check(DNSLookupError):
            request = failure.request
            self.logger.info('## Domain Expired : %s', request.url)
            # tldextract reduces e.g. "sub.example.co.uk" -> "example.co.uk".
            ext = tldextract.extract(request.url)
            # Persist first (NOTE(review): insert_table is defined elsewhere
            # in the project; presumably writes the domain to a database).
            insert_table(ext.registered_domain)
            item = MyItem()
            item['url'] = ext.registered_domain
            yield item
--
You received this message because you are subscribed to the Google Groups
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To post to this group, send email to [email protected].
Visit this group at https://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.