Hi everyone, good evening! I used Scrapy to crawl some songs from this site: http://www.vagalume.com.br/ (this is where vagalume.json was generated).
# The idea now is to crawl the same songs I crawled from the site above on
# this other site: https://www.letras.mus.br
# I tried to read the data from vagalume.json and search for each song on that
# site, but the div selected by my XPath comes back empty. I think the reason
# is that the spider finishes reading the search page before the server
# returns the query results. I'm not sure, though. What can I do about it?
#
# Here is the code (the current parse method is the one I was using for
# debugging).

# ---------------------------------------------------------------------------
# Letras/items.py
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MusicItem(scrapy.Item):
    """Container for one scraped song: its name, author and lyrics."""

    name = scrapy.Field()
    author = scrapy.Field()
    lyrics = scrapy.Field()


# ---------------------------------------------------------------------------
# Spider module (its file name is not shown in the original post)
# ---------------------------------------------------------------------------
import json
from urllib.parse import quote

import scrapy

from Letras.items import MusicItem


class LetrasSpider(scrapy.Spider):
    """Search letras.mus.br for songs and scrape name, author and lyrics."""

    name = "letras"
    allowed_domains = ["letras.mus.br"]
    start_urls = [
        "https://www.letras.mus.br/?q=peter%20hollens%20misty%20mountains"
    ]

    def cleanString(self, text):
        """Return *text* reduced to alphanumerics and single-space separators.

        Alphanumeric characters are kept, every whitespace character becomes
        a plain space, and all other characters (punctuation, symbols) are
        dropped.
        """
        return "".join(
            ch if ch.isalnum() else " "
            for ch in text
            if ch.isalnum() or ch.isspace()
        )

    def retrieveLyrics(self, response):
        """Concatenate the cleaned text of every selector in *response*.

        Despite its name, *response* is the iterable of selectors produced by
        the lyrics XPath in ``parseOneMusic``. Each cleaned sentence is
        followed by one space, so a non-empty result ends with a space.
        """
        return "".join(
            self.cleanString(sel.extract()) + " " for sel in response
        )

    def retrieveMusicName(self, response):
        """Return the cleaned text of the first ``h1`` under *response*.

        Raises:
            IndexError: if the relative XPath ``h1/text()`` matches nothing.
        """
        return self.cleanString(response.xpath('h1/text()')[0].extract())

    def retrieveAuthor(self, response):
        """Return the cleaned text of the first ``h2/a`` under *response*.

        Raises:
            IndexError: if the relative XPath ``h2/a/text()`` matches nothing.
        """
        return self.cleanString(response.xpath('h2/a/text()')[0].extract())

    def parseOneMusic(self, response):
        """Scrape a single song page and yield one ``MusicItem``."""
        lyrics = self.retrieveLyrics(response.xpath(
            '//div[@class="g-pr g-sp"]/div[@class="cnt-letra p402_premium"]'
            '/article/p/text()'
        ))
        sel = response.xpath(
            '//div[@class="cnt-head cnt-head--l"]/div[@class="cnt-head_title"]'
        )
        name = self.retrieveMusicName(sel)
        author = self.retrieveAuthor(sel)

        item = MusicItem()
        item['name'] = name
        item['author'] = author
        item['lyrics'] = lyrics
        yield item

    def parseOneAuthor(self, response):
        """Follow the first link of every entry in the ``cnt-list`` list."""
        for href in response.xpath('//ul[@class="cnt-list"]/li/a[1]/@href'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parseOneMusic)

    def parseQuery(self, response):
        """Follow the first search result to its song page.

        Returns:
            A ``scrapy.Request`` for the first result link, or ``None`` when
            the selector matches nothing.
        """
        # Select the link's href attribute and take the first match.
        # ``.extract()`` on the bare <a> selector returned a *list* of
        # serialized elements, which is not a valid URL for scrapy.Request.
        #
        # NOTE(review): the gsc-* classes here and the "cse-search-results"
        # id used in parse() look like Google Custom Search markup, which is
        # normally injected by JavaScript in the browser. Scrapy does not run
        # JavaScript, so this selector probably matches nothing in the
        # downloaded HTML -- this is the likely cause of the empty result,
        # not a timing problem. "tbody" is also often added by the browser
        # and absent from the raw page. Confirm with `scrapy shell <url>` and
        # `view(response)`.
        href = response.css(
            '.gsc-expansionArea > div:nth-child(1) > div:nth-child(1) '
            '> table:nth-child(3) > tbody:nth-child(1) > tr:nth-child(1) '
            '> td:nth-child(2) > div:nth-child(1) > a:nth-child(1)'
            '::attr(href)'
        ).extract_first()
        self.logger.info("ParseQuery url = {0}".format(href))
        if not href:
            # Nothing to follow; Scrapy treats a None result as "no output".
            return None
        return scrapy.Request(
            response.urljoin(href), callback=self.parseOneMusic
        )

    def treatAuthorName(self, authorName):
        """Lower-case *authorName* and replace spaces with hyphens."""
        return authorName.lower().replace(" ", "-")

    def parse(self, response):
        """Debug callback: log every child div of the search-results box."""
        for href in response.xpath(
            '//div[@class="wrapper"]/div[@id="all"]/div[@id="cnt_top"]'
            '/div[@id="res_busca"]/div[@id="resultado"]/div[@class="all"]'
            '/div[@id="cse-search-results"]/div'
        ):
            self.logger.info("LoggingParse href = {0}\n".format(href))

    def parse2(self, response):
        """Yield one search request per song listed in ``vagalume.json``.

        NOTE(review): nothing in the code shown here calls this method; it
        appears to be the intended ``parse`` that was renamed while debugging.
        """
        with open('vagalume.json') as vagalume_file:
            vagalumeJson = json.load(vagalume_file)

        for vagalumeItem in vagalumeJson:
            query = "{0} {1}".format(
                vagalumeItem["author"], vagalumeItem["name"])
            # Percent-encode the whole query so characters such as "&", "#"
            # or "?" in a name cannot truncate or corrupt the "q" parameter.
            url = "https://www.letras.mus.br/?q=" + quote(query, safe="")
            yield scrapy.Request(url, callback=self.parseQuery)


# --
# You received this message because you are subscribed to the Google Groups
# "scrapy-users" group.
# To unsubscribe from this group and stop receiving emails from it, send an
# email to [email protected].
# To post to this group, send email to [email protected].
# Visit this group at https://groups.google.com/group/scrapy-users.
# For more options, visit https://groups.google.com/d/optout.
