Thanks for your response, but with this pattern I get the error below on this row; do you know why?

    yield Request(url)

exceptions.TypeError: Request url must be str or unicode, got tuple

Thanks in advance.
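For context on that TypeError: MySQLdb's fetchall() returns a sequence of row tuples, one per result row, so with a one-column select each url in the loop is a 1-tuple like ('http://example.com/',) rather than a string (example.com is just a placeholder here). A minimal sketch of the unpacking, against the same query:

[CODE]
# cur.fetchall() yields row tuples such as ('http://example.com/page',)
for row in cur.fetchall():
    yield Request(row[0])  # pass the url string itself, not the whole row tuple
[/CODE]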
On Monday, January 13, 2014 10:23:49 PM UTC+1, Paul Tremberth wrote:
>
> Hi,
>
> you may want to try this pattern, overriding your spider's start_requests method:
> See http://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spider.Spider.start_requests
>
> import MySQLdb
>
>
> class Crawl2Spider(BaseSpider):
>     name = "crawl2"
>
>     def start_requests(self):
>         db = MySQLdb.connect(host="localhost", user="root", passwd="",
>                              db="crawler_engine", charset='utf8', use_unicode=False)
>         cur = db.cursor()
>         cur.execute("select url from urls where num_crawl=1")
>         for url in cur.fetchall():
>             yield Request(url)
>
>     def parse(self, response):
>         hxs = HtmlXPathSelector(response)
>         item = DmozItem()
>         item['link'] = hxs.select('//div/ul/li/a/@href').extract()
>
>         cursor = self.db.cursor()
>         for j in range(len(item['link'])):
>             cursor = self.db.cursor()
>             sql = "insert into urls(url, domain, num_crawl) values ('%s','%s','%s')" % (item['link'][j], 'test', 1)
>             cursor.execute(sql)
>             self.db.commit()
>
>         return item
>
> Hope it helps.
> /Paul.
>
> On Monday, January 13, 2014 6:31:54 PM UTC+1, d4v1d wrote:
>>
>> Hello,
>> I'm working on a Scrapy project and I would like to fill start_urls with a list of URLs stored in a database.
>> I tried the code below, but it only picks up the last URL in the database :-(
>> Please help me; thank you in advance.
>>
>> [CODE]
>>
>> class Crawl2Spider(BaseSpider):
>>     name = "crawl2"
>>     import MySQLdb
>>     db = MySQLdb.connect(host="localhost", user="root", passwd="",
>>                          db="crawler_engine", charset='utf8', use_unicode=False)
>>     cur = db.cursor()
>>     cur.execute("select url from urls where num_crawl=1")
>>     vers = cur.fetchall()
>>     for i in range(cur.rowcount):
>>         start_urls = vers[i]
>>
>>     def parse(self, response):
>>         hxs = HtmlXPathSelector(response)
>>         item = DmozItem()
>>         item['link'] = hxs.select('//div/ul/li/a/@href').extract()
>>         cursor = self.db.cursor()
>>         for j in range(len(item['link'])):
>>             cursor = self.db.cursor()
>>             sql = "insert into urls(url, domain, num_crawl) values ('%s','%s','%s')" % (item['link'][j], 'test', 1)
>>             cursor.execute(sql)
>>             self.db.commit()
>>         return item
>>
>> [/CODE]
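For reference, here is a minimal sketch that puts the fixes together, staying with the BaseSpider/HtmlXPathSelector API and the table schema used in this thread (the myproject.items import path is a placeholder for wherever DmozItem is actually defined). It unpacks the row tuple before building each Request, keeps the connection on self.db so parse() can reuse it, and passes the insert values as query parameters so the driver handles the quoting:

[CODE]
import MySQLdb

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

from myproject.items import DmozItem  # placeholder: import from your project's items module


class Crawl2Spider(BaseSpider):
    name = "crawl2"

    def start_requests(self):
        # Keep the connection on the instance so parse() can reuse it.
        self.db = MySQLdb.connect(host="localhost", user="root", passwd="",
                                  db="crawler_engine", charset='utf8',
                                  use_unicode=False)
        cur = self.db.cursor()
        cur.execute("select url from urls where num_crawl=1")
        for row in cur.fetchall():
            # row is a 1-tuple like ('http://...',); Request needs the string.
            yield Request(row[0])

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        item = DmozItem()
        item['link'] = hxs.select('//div/ul/li/a/@href').extract()

        cursor = self.db.cursor()
        for link in item['link']:
            # Parameterized query: the driver quotes the values itself.
            cursor.execute(
                "insert into urls (url, domain, num_crawl) values (%s, %s, %s)",
                (link, 'test', 1))
        self.db.commit()
        return item
[/CODE]

If you prefer to keep start_urls instead of start_requests, the same unpacking applies: build it as a list, e.g. start_urls = [row[0] for row in cur.fetchall()]. The loop in the original class body rebinds start_urls on every iteration, which is why only the last row survived.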
