Hello,
I have set Rules to get the next pages from the start_url, but it's not
working, it only crawls the start_urls page, and the links in that page
(with parseLinks). It doesn't go to the next page set in Rules.
any help ?
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy import log
from urlparse import urlparse
from urlparse import urljoin
from scrapy.http import Request
class MySpider(CrawlSpider):
name = 'testes2'
allowed_domains = ['hoteis.pt']
start_urls = [
'http://www.hoteis.pt/pesquisa/filtro/?tipo=0&local=0'
]
rules =
(Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]/@href')),
follow=True),)
def parse(self, response):
sel = Selector(response)
urls = sel.xpath('//div[@id="btReserve"]/../@href').extract()
for url in urls:
url = urljoin(response.url, url)
self.log('URLS: %s' % url)
yield Request(url, callback = self.parseLinks)
def parseLinks(self, response):
sel = Selector(response)
titulo = sel.xpath('h1/text()').extract()
morada = sel.xpath('//div[@class="MORADA"]/text()').extract()
email = sel.xpath('//a[@class="sendMail"][1]/text()')[0].extract()
url = sel.xpath('//div[@class="contentContacto
sendUrl"]/a/text()').extract()
telefone =
sel.xpath('//div[@class="telefone"]/div[@class="contentContacto"]/text()').extract()
fax =
sel.xpath('//div[@class="fax"]/div[@class="contentContacto"]/text()').extract()
descricao = sel.xpath('//div[@id="tbDescricao"]/p/text()').extract()
gps = sel.xpath('//td[@class="sendGps"]/@style').extract()
print titulo, email, morada
--
You received this message because you are subscribed to the Google Groups
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To post to this group, send email to [email protected].
Visit this group at http://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/groups/opt_out.