Hi,
I need advice on improving my crawler!
The crawler is meant to crawl the web continuously, without stopping, and find expired domains.
Can someone help me improve the code?
class MyItem(Item):
    """Item carrying the registered domain of a URL whose DNS lookup failed."""

    # Registered domain (e.g. "example.com") extracted via tldextract.
    url = Field()
class HttpbinSpider(CrawlSpider):
    """Crawl the web continuously and report likely-expired domains.

    Every extracted request gets an errback attached; when a request fails
    with a DNS lookup error, the registered domain is logged, stored via
    ``insert_table`` and yielded as a :class:`MyItem`.
    """

    name = "expired"
    start_urls = ['http://www.website.com']

    rules = (
        Rule(
            LinkExtractor(
                # LinkExtractor patterns are regexes: an unescaped '.'
                # matches ANY character (so '.com' would also match 'xcom').
                # The dots are escaped to match the literal TLD substring.
                allow=(r'\.com', r'\.fr', r'\.net', r'\.org', r'\.info',
                       r'\.casino'),
                # Skip huge well-known sites that will never be expired.
                deny=('facebook', 'amazon', 'wordpress', 'blogspot', 'free',
                      'reddit', 'fnac', 'tumblr', 'videos', 'youtube',
                      'google', 'doubleclick', 'microsoft', 'yahoo', 'bing',
                      'znet', 'stackexchang', 'twitter', 'wikipedia',
                      'creativecommons', 'mediawiki', 'wikidata'),
            ),
            process_request='add_errback',
            follow=True,
        ),
    )

    custom_settings = {
        'RETRY_ENABLED': True,
        # DEPTH_LIMIT 0 = unlimited depth; DEPTH_PRIORITY 1 biases toward
        # breadth-first crawling so the frontier spreads across many sites.
        'DEPTH_LIMIT': 0,
        'DEPTH_PRIORITY': 1,
        'LOG_ENABLED': False,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 32,
        'CONCURRENT_REQUESTS': 64,
    }

    def add_errback(self, request):
        """Attach the DNS-failure errback to each request produced by the rule.

        :param request: the Request built by the LinkExtractor rule.
        :returns: a copy of the request with ``errback`` set.
        """
        return request.replace(errback=self.errback_httpbin)

    def errback_httpbin(self, failure):
        """Handle download failures; emit an item on DNS lookup errors.

        Only :class:`DNSLookupError` failures are treated as expired
        domains; every other failure type is ignored.

        :param failure: the Twisted Failure passed to the errback.
        :yields: a :class:`MyItem` with the registered domain.
        """
        if failure.check(DNSLookupError):
            request = failure.request
            self.logger.info('## Domain Expired : %s', request.url)
            # tldextract reduces e.g. "sub.example.co.uk" -> "example.co.uk".
            ext = tldextract.extract(request.url)
            # Persist first (NOTE(review): insert_table is defined elsewhere
            # in the project; presumably writes the domain to a database).
            insert_table(ext.registered_domain)
            item = MyItem()
            item['url'] = ext.registered_domain
            yield item
--
You received this message because you are subscribed to the Google Groups
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To post to this group, send email to [email protected].
Visit this group at https://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.