crawl next pages

ajrpc Mon, 13 Jan 2014 03:23:05 -0800

Hello,

I have set Rules to get the next pages from the start_url, but it's not 
working, it only crawls the start_urls page, and the links in that page 
(with parseLinks). It doesn't go to the next page set in Rules.


any help ?

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy import log
from urlparse import urlparse
from urlparse import urljoin
from scrapy.http import Request

class MySpider(CrawlSpider):
    name = 'testes2'
    allowed_domains = ['hoteis.pt']
    start_urls = [
        'http://www.hoteis.pt/pesquisa/filtro/?tipo=0&local=0'
    ]

    rules = 
(Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]/@href')), 
follow=True),)

    def parse(self, response):
         sel = Selector(response)
         urls = sel.xpath('//div[@id="btReserve"]/../@href').extract()
         for url in urls:
          url = urljoin(response.url, url)
          self.log('URLS: %s' % url)
          yield Request(url, callback = self.parseLinks)
        
    def parseLinks(self, response):
    sel = Selector(response)
        titulo = sel.xpath('h1/text()').extract()
        morada = sel.xpath('//div[@class="MORADA"]/text()').extract()
        email = sel.xpath('//a[@class="sendMail"][1]/text()')[0].extract()
        url = sel.xpath('//div[@class="contentContacto 
sendUrl"]/a/text()').extract()
        telefone = 
sel.xpath('//div[@class="telefone"]/div[@class="contentContacto"]/text()').extract()
        fax = 
sel.xpath('//div[@class="fax"]/div[@class="contentContacto"]/text()').extract()
        descricao = sel.xpath('//div[@id="tbDescricao"]/p/text()').extract()
        gps = sel.xpath('//td[@class="sendGps"]/@style').extract()

        print titulo, email, morada

-- 
You received this message because you are subscribed to the Google Groups 
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To post to this group, send email to [email protected].
Visit this group at http://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/groups/opt_out.

crawl next pages

Reply via email to