Hi,
I have a behavior I need to understand.
My Scrapy script requests 53 URLs (and I checked on the web pages: there are 53
URLs corresponding to my request), but it returns only 43 scraped items.
If my code is:
# Spider configuration: a single allowed domain and one listing page to start from.
allowed_domains = ['vsetkyfirmy.sk']
start_urls = ['https://www.vsetkyfirmy.sk/autokempy/']

# One crawl rule: links are extracted only from elements whose text contains
# "Ďalšie"; each followed response is passed to parse_start_url, and
# follow=True keeps the rule active on those responses as well.
rules = [
    Rule(
        LinkExtractor(
            restrict_xpaths=u'//*[text()[contains(., "Ďalšie")]]',
        ),
        callback='parse_start_url',
        follow=True,
    ),
]

# Class-level counters; neither is referenced by the callback shown in this
# snippet.
page_num = 1
counter = 1
def parse_start_url(self, response):
    """Yield one ``{'link': href}`` item per company detail link on the page.

    A detail link is the ``href`` of an ``<a>`` that is a direct child of a
    ``<td>`` and whose ``id`` attribute contains "detaily".
    """
    detail_hrefs = Selector(response).xpath(
        '//td/a[contains(@id, "detaily")]/@href'
    ).extract()
    for href in detail_hrefs:
        yield {'link': href}
it correctly returns 53 URLs.
But if my code is:
# Spider configuration: a single allowed domain and one listing page to start from.
allowed_domains = ['vsetkyfirmy.sk']
start_urls = ['https://www.vsetkyfirmy.sk/autokempy/']

# One crawl rule: links are extracted only from elements whose text contains
# "Ďalšie"; each followed response is passed to parse_start_url, and
# follow=True keeps the rule active on those responses as well.
rules = [
    Rule(
        LinkExtractor(
            restrict_xpaths=u'//*[text()[contains(., "Ďalšie")]]',
        ),
        callback='parse_start_url',
        follow=True,
    ),
]

# Class-level counters; `counter` is the starting value for the running item
# number written by parse_company, `page_num` is not referenced in this snippet.
page_num = 1
counter = 1
def parse_start_url(self, response):
    """Schedule a request for every company detail link on a listing page.

    Each ``href`` of an ``<a>`` that is a direct child of a ``<td>`` and whose
    ``id`` contains "detaily" becomes a ``scrapy.Request`` whose response is
    handled by ``parse_company``.
    """
    # NOTE(review): requests (unlike plain dict items) go through Scrapy's
    # duplicate filter unless dont_filter=True is passed, so an href that
    # appears more than once would be fetched only once -- presumably the
    # source of the 53 vs 43 difference; confirm with the
    # "dupefilter/filtered" value in the crawl stats.
    detail_hrefs = Selector(response).xpath(
        '//td/a[contains(@id, "detaily")]/@href'
    ).extract()
    for href in detail_hrefs:
        yield scrapy.Request(href, callback=self.parse_company)
def parse_company(self, response):
    """Yield a single item describing one company detail page.

    The item holds the current value of ``self.counter``, the extracted job
    and company-name text nodes (each as a list of strings), and the URL of
    the response. ``self.counter`` is incremented after the item is yielded.
    """
    page = Selector(response)
    # NOTE(review): both paths step through ``tbody``; browsers insert that
    # element into the DOM even when the raw HTML lacks it, so these may
    # match nothing against the downloaded page -- verify in `scrapy shell`.
    job_text = page.xpath(
        '//body/div/table[2]/tbody/tr[3]/td[2]/a/text()'
    ).extract()
    company_text = page.xpath(
        '//body/div/table[1]/tbody/tr[1]/td[1]/h1/span/text()'
    ).extract()

    item = {
        "count": self.counter,
        "job": job_text,
        "company page url": response.url,
        "company": company_text,
    }
    yield item

    # Executed when the generator is resumed after the yield above.
    self.counter += 1
it returns only 43.
Why?
Thanks
--
You received this message because you are subscribed to the Google Groups
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To post to this group, send email to [email protected].
Visit this group at https://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.