from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy import log
from urlparse import urlparse
from urlparse import urljoin
from scrapy.http import Request
class MySpider(CrawlSpider):
name = 'testes2'
allowed_domains = ['hoteis.pt']
start_urls = [
'http://www.hoteis.pt/pesquisa/filtro/?tipo=0&local=0'
]
rules =
(Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]/@href')),
follow=True),)
def parse(self, response):
sel = Selector(response)
urls = sel.xpath('//div[@id="btReserve"]/../@href').extract()
for url in urls:
url = urljoin(response.url, url)
self.log('URLS: %s' % url)
yield Request(url, callback = self.parseLinks)
def parseLinks(self, response):
sel = Selector(response)
titulo = sel.xpath('h1/text()').extract()
morada = sel.xpath('//div[@class="MORADA"]/text()').extract()
email = sel.xpath('//a[@class="sendMail"][1]/text()')[0].extract()
url = sel.xpath('//div[@class="contentContacto
sendUrl"]/a/text()').extract()
telefone =
sel.xpath('//div[@class="telefone"]/div[@class="contentContacto"]/text()').extract()
fax =
sel.xpath('//div[@class="fax"]/div[@class="contentContacto"]/text()').extract()
descricao = sel.xpath('//div[@id="tbDescricao"]/p/text()').extract()
gps = sel.xpath('//td[@class="sendGps"]/@style').extract()
print titulo, email, morada
--
You received this message because you are subscribed to the Google Groups
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To post to this group, send email to [email protected].
Visit this group at http://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/groups/opt_out.