Hi all,
I'm trying to start my first Scrapy project and I've got stuck on a weird
problem.
So, I'm controlling Scrapy from a Python script:
# -*- coding: utf-8 -*-
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings
from govcrawl.spiders.main_spider import DomainSpider
import sys, urlparse, re
from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor

args = sys.argv[1].split('§')
url_id = args[0]
start_url = args[1]
url_parts = urlparse.urlparse(start_url)
allowed_domain = url_parts.netloc
allowed_path = '/'.join(url_parts.path.split('/')[:-1])
cur_state = sys.argv[2]

spider = DomainSpider(
    start_urls=[start_url],
    allowed_domains=[allowed_domain],
    url_id=url_id,
    cur_state=cur_state,
    rules=(
        Rule(
            LxmlLinkExtractor(
                allow=re.compile(r".*%s.*" % re.escape(allowed_path),
                                 re.IGNORECASE),
                allow_domains=[allowed_domain],
                tags=('a', 'area', 'frame'),
                attrs=('href', 'src')
            ),
            callback="parse_items",
            follow=True
        ),
    )
)

settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()
Basically, I'm passing my URLs to this script as command-line arguments (sys.argv).
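For example, an invocation looks roughly like this (the script name, id and
state value here are just placeholders):

python run_crawl.py "17§http://www.example.com/some/path/index.php" MA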
Then, my DomainSpider looks like this:
import re
from govcrawl.items import DomainItem
from scrapy.utils.markup import remove_tags
from scrapy.contrib.spiders import CrawlSpider
from scrapy import log

class DomainSpider(CrawlSpider):
    name = "govcrawl_main"

    def parse_start_url(self, response):
        return self.parse_items(response)

    def parse_items(self, response):
        pages_done = self.crawler.stats.get_value('downloader/response_count')
        pages_todo = self.crawler.stats.get_value('scheduler/enqueued') - pages_done
        log.msg("URL: %s (%s) Crawled %d pages. To Crawl: %d" %
                (self.start_urls[0], self.url_id, pages_done, pages_todo),
                spider=self)
        # collect the absolute links found on the page
        links = []
        for sel in response.xpath('//a'):
            href = sel.xpath('@href').extract()
            if len(href) > 0:
                href = href[0]
                if href.startswith("http"):
                    links.append(href)
        item = DomainItem()
        item["url"] = response.url
        item["text"] = re.sub(r'\s{2,}', ' ', remove_tags(
            ' '.join(response.xpath('//body//text()').extract()))).strip()
        item["links"] = links
        self.crawler.stats.inc_value('pages_crawled')
        yield item
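For completeness, DomainItem is just a plain Item declaring the three fields
used above; this is the gist of my govcrawl/items.py:

from scrapy.item import Item, Field

class DomainItem(Item):
    url = Field()
    text = Field()
    links = Field()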
The problem is that this works perfectly with some websites and not at all
with others.
For example, this one works like a charm:
http://www.attleboroschools.com/schools/studley_elementary/index.php
while this one doesn't follow any link and never enters parse_items:
http://www.mass.gov/eea/agencies/dfg/der/
I checked the second URL in the scrapy shell: running view(response) shows
the page source in the browser, it matches the original page perfectly and
includes all the links that satisfy the Rule, but the crawler still doesn't
follow them...
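In case it helps narrow things down, here is a sketch of the check I'd run
next in the shell, rebuilding the Rule's extractor by hand (the hard-coded
path and domain are what my script computes for this URL):

# inside `scrapy shell http://www.mass.gov/eea/agencies/dfg/der/`
import re
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor

extractor = LxmlLinkExtractor(
    allow=re.compile(r".*%s.*" % re.escape("/eea/agencies/dfg/der"),
                     re.IGNORECASE),
    allow_domains=["www.mass.gov"],
    tags=('a', 'area', 'frame'),
    attrs=('href', 'src')
)
# an empty list here would mean the extractor itself filters the links;
# a non-empty list would point at a later step (e.g. the offsite
# middleware or the dupefilter) dropping the requests
print(extractor.extract_links(response))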
Any idea how to deal with this?
Thanks!
Michele C