crawl next pages

ajrpc Mon, 13 Jan 2014 03:26:59 -0800

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy import log
from urlparse import urlparse
from urlparse import urljoin
from scrapy.http import Request


class MySpider(CrawlSpider):
    """Crawl the hoteis.pt search listing and visit each hotel detail page.

    The spider starts on the search-result listing, follows the "next page"
    link (the anchor with id ``seguinte``) through the crawl rule, and for
    every listing page requests each hotel link found around the
    ``btReserve`` element. Detail pages are handled by ``parseLinks``.
    """

    name = 'testes2'
    allowed_domains = ['hoteis.pt']
    start_urls = [
        'http://www.hoteis.pt/pesquisa/filtro/?tipo=0&local=0'
    ]

    # restrict_xpaths must point at the element that contains the link (the
    # <a> itself), not at its @href attribute; the extractor reads the href
    # on its own. The callback makes every followed listing page go through
    # parse_listing, and follow=True keeps paginating from those pages.
    rules = (
        Rule(
            SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]',)),
            callback='parse_listing',
            follow=True,
        ),
    )

    # NOTE: CrawlSpider implements the rule handling inside its own parse()
    # method, so a subclass must not override parse(); doing so disables the
    # rules and the "next page" links are never followed.

    def parse_start_url(self, response):
        """Handle the start URL like any other listing page.

        CrawlSpider calls this hook for responses of ``start_urls``; without
        it the hotels on the first listing page would be skipped.
        """
        return self.parse_listing(response)

    def parse_listing(self, response):
        """Yield one request per hotel detail link found on a listing page.

        :param response: response of a search-result listing page.
        :returns: generator of ``Request`` objects handled by ``parseLinks``.
        """
        sel = Selector(response)
        hrefs = sel.xpath('//div[@id="btReserve"]/../@href').extract()
        for href in hrefs:
            # Links may be relative; resolve them against the page URL.
            url = urljoin(response.url, href)
            self.log('URLS: %s' % url)
            yield Request(url, callback=self.parseLinks)

    def parseLinks(self, response):
        """Extract the contact details from a hotel detail page and print them.

        :param response: response of a hotel detail page.
        """
        sel = Selector(response)

        # '//h1' instead of 'h1': a path relative to the document root only
        # matches when <h1> is the root element, which is never the case for
        # an HTML page.
        titulo = sel.xpath('//h1/text()').extract()
        morada = sel.xpath('//div[@class="MORADA"]/text()').extract()

        # Not every page is guaranteed to have a mail link; indexing the
        # selector list directly would raise IndexError on such pages.
        emails = sel.xpath('//a[@class="sendMail"][1]/text()').extract()
        email = emails[0] if emails else None

        # The fields below are extracted but not emitted yet.
        # TODO: export them (e.g. through an Item) once the output format
        # is decided.
        url = sel.xpath(
            '//div[@class="contentContacto sendUrl"]/a/text()').extract()
        telefone = sel.xpath(
            '//div[@class="telefone"]/div[@class="contentContacto"]/text()'
        ).extract()
        fax = sel.xpath(
            '//div[@class="fax"]/div[@class="contentContacto"]/text()'
        ).extract()
        descricao = sel.xpath('//div[@id="tbDescricao"]/p/text()').extract()
        gps = sel.xpath('//td[@class="sendGps"]/@style').extract()

        # Single-argument form so the same line works as a Python 2 print
        # statement and as a Python 3 print() call.
        print('%s %s %s' % (titulo, email, morada))

-- 
You received this message because you are subscribed to the Google Groups 
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To post to this group, send email to [email protected].
Visit this group at http://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/groups/opt_out.

crawl next pages

Reply via email to