Thank you very much, Paul.
I've changed the Rule's callback to 'parsePage' and renamed def parse to def
parsePage, but now it doesn't enter parsePage() at all; the spider does nothing.
The code now looks like this:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy import log
from urlparse import urlparse
from urlparse import urljoin
from scrapy.http import Request


class MySpider(CrawlSpider):
    name = 'testes2'
    allowed_domains = ['hoteis.pt']
    start_urls = [
        'http://www.hoteis.pt/pesquisa/filtro/?tipo=0&local=0'
    ]

    rules = (
        Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]/@href')),
             callback='parsePage', follow=True),
    )

    def parsePage(self, response):
        sel = Selector(response)
        urls = sel.xpath('//div[@id="btReserve"]/../@href').extract()
        for url in urls:
            url = urljoin(response.url, url)
            self.log('URLS: %s' % url)
            yield Request(url, callback=self.parseLinks)

    def parseLinks(self, response):
        sel = Selector(response)
        titulo = sel.xpath('h1/text()').extract()
        morada = sel.xpath('//div[@class="MORADA"]/text()').extract()
        email = sel.xpath('//a[@class="sendMail"][1]/text()')[0].extract()
        url = sel.xpath('//div[@class="contentContacto sendUrl"]/a/text()').extract()
        telefone = sel.xpath('//div[@class="telefone"]/div[@class="contentContacto"]/text()').extract()
        fax = sel.xpath('//div[@class="fax"]/div[@class="contentContacto"]/text()').extract()
        descricao = sel.xpath('//div[@id="tbDescricao"]/p/text()').extract()
        gps = sel.xpath('//td[@class="sendGps"]/@style').extract()

        print titulo, email, morada
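
To check whether the rule's link extractor finds anything on the start page, I can test it in scrapy shell. This is only a sketch, assuming the same Scrapy contrib version as the spider above; inside the shell the response object for the fetched URL is already defined:

# In a terminal:
#   scrapy shell 'http://www.hoteis.pt/pesquisa/filtro/?tipo=0&local=0'
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

# The exact extractor used in the rule above; an empty list here would mean
# the rule has no links to follow.
print SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]/@href')).extract_links(response)

# For comparison, restricting to the <a> element itself rather than its @href
# attribute (I'm not sure this is the problem, but restrict_xpaths is documented
# as selecting regions of the page to extract links from):
print SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]',)).extract_links(response)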
On Monday, January 13, 2014 4:56:11 PM UTC, Paul Tremberth wrote:
>
> Hi,
> I just replied to your StackOverflow question also.
> One problem is that you should not override CrawlSpider's parse method;
> otherwise the default behaviour of following the rules will not
> happen.
>
> /Paul.
>
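
(If I understand Paul correctly, the rules are only applied through CrawlSpider's own parse(), so the callback has to use any other name. A minimal sketch of that pattern, where the spider name, link extractor and parse_item callback are just placeholders:)

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class ExampleSpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['hoteis.pt']
    start_urls = ['http://www.hoteis.pt/pesquisa/filtro/?tipo=0&local=0']

    # CrawlSpider defines parse() itself and uses it to apply the rules,
    # so the Rule callback must be named something else.
    rules = (
        Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]',)),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        self.log('visited %s' % response.url)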
> On Monday, January 13, 2014 12:13:24 PM UTC+1, ajrpc wrote:
>>
>> Hello,
>>
>> I have set rules to follow the next pages from the start_url, but it's not
>> working: it only crawls the start_urls page and the links on that page
>> (via parseLinks). It doesn't follow to the next page defined in the rules.
>>
>> Any help?
>>
>> from scrapy.contrib.spiders import CrawlSpider, Rule
>> from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
>> from scrapy.selector import Selector
>> from scrapy import log
>> from urlparse import urlparse
>> from urlparse import urljoin
>> from scrapy.http import Request
>>
>> class MySpider(CrawlSpider):
>>     name = 'testes2'
>>     allowed_domains = ['hoteis.pt']
>>     start_urls = [
>>         'http://www.hoteis.pt/pesquisa/filtro/?tipo=0&local=0'
>>     ]
>>
>>     rules = (
>>         Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]/@href')),
>>              follow=True),
>>     )
>>
>>     def parse(self, response):
>>         sel = Selector(response)
>>         urls = sel.xpath('//div[@id="btReserve"]/../@href').extract()
>>         for url in urls:
>>             url = urljoin(response.url, url)
>>             self.log('URLS: %s' % url)
>>             yield Request(url, callback=self.parseLinks)
>>
>>     def parseLinks(self, response):
>>         sel = Selector(response)
>>         titulo = sel.xpath('h1/text()').extract()
>>         morada = sel.xpath('//div[@class="MORADA"]/text()').extract()
>>         email = sel.xpath('//a[@class="sendMail"][1]/text()')[0].extract()
>>         url = sel.xpath('//div[@class="contentContacto sendUrl"]/a/text()').extract()
>>         telefone = sel.xpath('//div[@class="telefone"]/div[@class="contentContacto"]/text()').extract()
>>         fax = sel.xpath('//div[@class="fax"]/div[@class="contentContacto"]/text()').extract()
>>         descricao = sel.xpath('//div[@id="tbDescricao"]/p/text()').extract()
>>         gps = sel.xpath('//td[@class="sendGps"]/@style').extract()
>>
>>         print titulo, email, morada
>>
>