Hi, I am new to scrapy and trying to learn. I have used my friend Google a lot, but I am currently stuck. The pages I need to scrape require authentication to get hold of price information. I would appreciate all the help I can get on this problem. Best regards, Tobias Gårdner
The login page can be found here: http://www.vortexparts.eu/account/login The output I get looks like this: >>>>> start_request <<<<< 2015-01-15 16:35:14+0100 [vortex3] DEBUG: Crawled (200) <GET http://www.vortexparts.eu/account/login> (referer: None) >>>>> login_parse: http://www.vortexparts.eu/account/login <<<<< >>>>> login token: [u'-pJRTcvZyxvZhljpVTVuMSqJo-klwQHfhDAElxBaGb4'] <<<<< 2015-01-15 16:35:14+0100 [vortex3] DEBUG: Redirecting (302) to <GET http://www.vortexparts.eu/account/login> from <POST http://www.vortexparts.eu/changeUserOption> And my spider and items code can be found below. # File: vortex_spider_public.py from scrapy.contrib.spiders import CrawlSpider, Rule from scrapy.contrib.linkextractors import LinkExtractor from scrapy.selector import Selector from scrapy.http import FormRequest, Request from vortex.items import VortexCategoryItem, VortexProductItem class vortexSpiderPublic(CrawlSpider): name = "vortex3" allowed_domains = ["vortexparts.eu"] login_url = "http://www.vortexparts.eu/account/login" start_urls = ["http://www.vortexparts.eu/category/above-ground-pool-parts"] rules = ( Rule(LinkExtractor(allow=(r"vortexparts.eu/account/login")), callback='login_parse', follow=True), Rule(LinkExtractor(allow=(r"product/"), deny=('/login',)), callback="parse_items", follow= True), ) def start_requests(self): print ">>>>> start_request <<<<<" yield Request(self.login_url, callback=self.login_parse) def login_parse(self, response): print ">>>>> login_parse: %s <<<<<" % response.url print ">>>>> login token: %s <<<<<" % response.xpath('//input[@id="login__token"]/@value').extract() return FormRequest.from_response(response, formdata={'login[_username]': "[email protected]", \ 'login[_password]': "xxx", \ 'login[_token]': response.xpath('//input[@id="login__token"]/@value').extract()}, \ callback=self.after_login) def after_login(self, response): print ">>>>> after_login: %s <<<<<" % response.url if "Bad credentials" in response.body: print ">>>>> 
login failed! <<<<<" return else: if "Tobias" in response.body: print ">>>>> login succeeded! Found first name Tobias in the body! <<<<<" else: print ">>>>> login succeeded! BUT did NOT find first name Tobias in the body! <<<<<" return [Request(url=u) for u in self.start_urls] def parse_items(self, response): productItem = VortexProductItem() productItem ["url"] = response.url productItem ["categories"] = response.xpath('//ul[@id="breadcrumbs"]/li/a/@href').extract() return(productItem) Ange koden här... # file: items.py import scrapy class VortexProductItem(scrapy.Item): url = scrapy.Field() categories = scrapy.Field() pass -- You received this message because you are subscribed to the Google Groups "scrapy-users" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. To post to this group, send email to [email protected]. Visit this group at http://groups.google.com/group/scrapy-users. For more options, visit https://groups.google.com/d/optout.
