Hi there everyone, good evening!

I used Scrapy to crawl some songs from this site:
http://www.vagalume.com.br/ (that crawl generated vagalume.json)

The idea now is to crawl those same songs from this other site:
https://www.letras.mus.br
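
For reference, each entry in vagalume.json should look something like this 
(the same fields as the MusicItem below; actual lyrics elided):

    [
        {
            "name": "Misty Mountains",
            "author": "Peter Hollens",
            "lyrics": "..."
        }
    ]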

I tried to read the data from vagalume.json and search for each song on 
letras.mus.br, but the XPath for the results div returns empty.

I think the reason is that the spider finishes reading the search page 
before the server has returned the query results, i.e. the results are 
filled in by JavaScript after the initial HTML loads. I'm not sure, though. 
What can I do about it?
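
A quick way to check that hypothesis (a sketch, reusing the hard-coded 
search URL from start_urls below) is to look at the raw HTML in scrapy 
shell:

    $ scrapy shell "https://www.letras.mus.br/?q=peter%20hollens%20misty%20mountains"
    >>> response.xpath('//div[@id="cse-search-results"]/div')

If that comes back as an empty list, the results container really is empty 
in the HTML Scrapy sees, and the gsc-* elements targeted in parseQuery 
(which I believe belong to Google Custom Search) only exist after 
JavaScript runs.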

Here is the code (the current parse method is the one I was using for 
debugging):

# Letras/items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MusicItem(scrapy.Item):
    name = scrapy.Field()
    author = scrapy.Field()
    lyrics = scrapy.Field()




# the spider
import scrapy
import json

from Letras.items import MusicItem

class LetrasSpider(scrapy.Spider):
    name = "letras"
    allowed_domains = ["letras.mus.br"]
    start_urls = [
        "https://www.letras.mus.br/?q=peter%20hollens%20misty%20mountains";
    ]

    def cleanString(self, text):
        # Keep alphanumeric characters, turn any whitespace into a
        # plain space, and drop everything else (punctuation etc.).
        txt = ""

        for c in text:
            if c.isalnum():
                txt += c
            elif c.isspace():
                txt += ' '

        return txt

    def retrieveLyrics(self, selectors):
        # selectors is the SelectorList of the lyrics' <p> text nodes.
        lyrics = ""

        for sel in selectors:
            sentence = sel.extract()
            lyrics += self.cleanString(sentence)
            lyrics += ' '

        return lyrics

    def retrieveMusicName(self, response):
        return self.cleanString(response.xpath('h1/text()')[0].extract())

    def retrieveAuthor(self, response):
        return self.cleanString(response.xpath('h2/a/text()')[0].extract())

    def parseOneMusic(self, response):
        lyrics = self.retrieveLyrics(response.xpath(
            '//div[@class="g-pr g-sp"]'
            '/div[@class="cnt-letra p402_premium"]/article/p/text()'))
        sel = response.xpath(
            '//div[@class="cnt-head cnt-head--l"]'
            '/div[@class="cnt-head_title"]')
        name = self.retrieveMusicName(sel)
        author = self.retrieveAuthor(sel)
        item = MusicItem()
        item['name'] = name
        item['author'] = author
        item['lyrics'] = lyrics

        yield item

    def parseOneAuthor(self, response):
        for href in response.xpath('//ul[@class="cnt-list"]/li/a[1]/@href'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parseOneMusic)

    def parseQuery(self, response):
        # .extract() would give a list of HTML strings; we want the single
        # href of the first result, hence ::attr(href) + extract_first().
        url = response.css(
            '.gsc-expansionArea > div:nth-child(1) > div:nth-child(1)'
            ' > table:nth-child(3) > tbody:nth-child(1) > tr:nth-child(1)'
            ' > td:nth-child(2) > div:nth-child(1)'
            ' > a:nth-child(1)::attr(href)').extract_first()
        self.logger.info("ParseQuery url = {0}".format(url))
        return scrapy.Request(url, callback=self.parseOneMusic)

    def treatAuthorName(self, authorName):
        return authorName.lower().replace(" ", "-")

    def parse(self, response):
        # Debug: log whatever actually sits under the results container.
        for href in response.xpath(
                '//div[@class="wrapper"]/div[@id="all"]/div[@id="cnt_top"]'
                '/div[@id="res_busca"]/div[@id="resultado"]/div[@class="all"]'
                '/div[@id="cse-search-results"]/div'):
            self.logger.info("LoggingParse href = {0}\n".format(href))

    def parse2(self, response):
        with open('vagalume.json') as vagalume_file:
            vagalumeJson = json.load(vagalume_file)

            for vagalumeItem in vagalumeJson:
                url = "https://www.letras.mus.br/?q={0} {1}".format(
vagalumeItem["author"], vagalumeItem["name"])
                yield scrapy.Request(url, callback=self.parseQuery)
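
One workaround I was considering, to skip the JavaScript search entirely 
(just a sketch: it assumes letras.mus.br serves artist pages at 
/<artist-slug>/, which I still need to verify) is to build the artist URL 
directly with treatAuthorName and hand it to parseOneAuthor:

    def parse2Direct(self, response):
        # Hypothetical alternative to parse2: go straight to the artist
        # page (assumed URL pattern!) so parseOneAuthor can pick up the
        # song links, bypassing the JS-rendered search results.
        with open('vagalume.json') as vagalume_file:
            for vagalumeItem in json.load(vagalume_file):
                url = "https://www.letras.mus.br/{0}/".format(
                    self.treatAuthorName(vagalumeItem["author"]))
                yield scrapy.Request(url, callback=self.parseOneAuthor)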


