I have attached all of my code. I am not able to insert data from Scrapy into MySQL.
With the attached code in place I run the following command:
scrapy crawl amazon
Please help me: how can I insert the scraped items into MySQL?
Thanks
# Define here the models for your scraped items
import scrapy


class AmazonDepartmentItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field(serializer=str)

# import scrapy
# class Example_spider(BaseSpider):
# name = "amazon"
# allowed_domains = ["www.amazon.com"]
# def start_requests(self):
# yield self.make_requests_from_url("http://www.amazon.com/Televisions-Video/b/ref=nav_shopall_tv?ie=UTF8&node=1266092011")
# def parse(self, response):
# hxs = HtmlXPathSelector(response)
# urls = hxs.select('//li[@class="refinementImage"]/a/@href').extract()
# for i in urls:
# yield Request(urljoin("http://www.amazon.com/", i[1:]), callback=self.parse_url)
# def parse_url(self, response):
# hxs = HtmlXPathSelector(response)
# main = hxs.select('//div[@id="bookshelf-bg"]')
# items = []
# for i in main:
# item = Exampleitem()
# item['book_name'] = i.select('div[@class="slickwrap full"]/div[@id="bookstore_detail"]/div[@class="book_listing clearfix"]/div[@class="bookstore_right"]/div[@class="title_and_byline"]/p[@class="book_title"]/text()')[0].extract()
# item['price'] = i.select('div[@id="book-sidebar-modules"]/div[@class="add_to_cart_wrapper slickshadow"]/div[@class="panes"]/div[@class="pane clearfix"]/div[@class="inner"]/div[@class="add_to_cart 0"]/form/div[@class="line-item"]/div[@class="line-item-price"]/text()').extract()
# items.append(item)
# return items
# get the information of all department shops
# spider for all departments
import scrapy
from craigslist_sample.items import AmazonDepartmentItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor


class AmazonAllDepartmentSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ["amazon.com"]
    start_urls = [
        "http://www.amazon.com/gp/site-directory/ref=nav_sad/187-3757581-3331414"
    ]

    def parse(self, response):
        for sel in response.xpath('//ul/li'):
            item = AmazonDepartmentItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            item['desc'] = sel.xpath('text()').extract()
            # yield each item inside the loop; "return item" emits only a single item
            yield item
# get seller info of product category
# spider of seller
# class AmazonSellerSpider(scrapy.Spider):
# name = "amazon"
# allowed_domains = ["amazon.COM"]
# start_urls = [
# "http://www.amazon.com/Televisions-Video/b/ref=sd_allcat_tv?ie=UTF8&node=1266092011"
# ]
# rules = (
# # Extract links matching 'category.php' (but not matching 'subsection.php')
# # and follow links from them (since no callback means follow=True by default).
# Rule(LinkExtractor(allow=('other', ), deny=('/s/ref',))),
# # Extract links matching 'item.php' and parse them with the spider's method parse_item
# Rule(LinkExtractor(allow=('other', )), callback='parse_item'),
# )
# def parse(self, response):
# for sel in response.xpath('//ul[@id="ref_303116011"]/li'):
# item = AmazonDepartmentItem()
# item['title'] = sel.xpath('a/text()').extract()
# item['link'] = sel.xpath('a/@href').extract()
# item['desc'] = sel.xpath('string(.)').extract()
# yield item
# get product info of product category
# spider of product
# class AmazonProductSpider(scrapy.Spider):
# name = "amazon"
# allowed_domains = ["amazon.com"]
# start_urls = [
# "http://www.amazon.com/s/ref=sr_in_-2_p_6_5?fst=as%3Aoff&rh=n%3A172282%2Cn%3A%21493964%2Cn%3A1266092011%2Cp_6%3AATVPDKIKX0DER&bbn=1266092011&ie=UTF8&qid=1416570979&rnid=303116011"
# ]
# def parse(self, response):
# for sel in response.xpath('//div[@class="s-item-container"]'):
# item = AmazonDepartmentItem()
# item['title'] = sel.xpath('a/text()').extract()
# item['link'] = sel.xpath('a/@href').extract()
# item['desc'] = sel.xpath('string(.)').extract(),
# # item['desc'] = sel.xpath('@*').extract(),
# yield item
# jst get all department from web
# class AmazonDepartmentSpider(scrapy.Spider):
# name = "amazon"
# allowed_domains = ["amazon.com"]
# start_urls = [
# "http://www.amazon.com"
# ]
# def parse(self, response):
# for sel in response.xpath('//div[@id=nav-bar-left"]'):
# item = AmazonDepartmentItem()
# item['title'] = sel.xpath('a/text()').extract()
# item['link'] = sel.xpath('a/@href/text()').extract()
# item['desc'] = sel.xpath('text()').extract()
# yield item
# from scrapy.spider import BaseSpider
# from scrapy.selector import HtmlXPathSelector
# from craigslist_sample.items import CraigslistSampleItem
# class MySpider(BaseSpider):
# name = "craig"
# allowed_domains = ["amazon.com"]
# start_urls = ["http://www.amazon.com/Televisions-Video/b/ref=nav_shopall_tv?ie=UTF8&node=1266092011"]
# def parse(self, response):
# hxs = HtmlXPathSelector(response)
# titles = hxs.select("//li[@class='refinementImage']")
# items = []
# for titles in titles:
# item = CraigslistSampleItem()
# item ["title"] = titles.select("a/text()").extract()
# item ["link"] = titles.select("a/@href").extract()
# items.append(item)
# return items
# from scrapy.spider import BaseSpider
# from scrapy.selector import HtmlXPathSelector
# from craigslist_sample.items import CraigslistSampleItem
# class MySpider(BaseSpider):
# name = 'amazon'
# allowed_domains = ['amazon.com']
# start_urls = ['http://www.amazon.com/']
# # rules = (
# # # Extract links matching 'category.php' (but not matching 'subsection.php')
# # # and follow links from them (since no callback means follow=True by default).
# # Rule(LinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
# # # Extract links matching 'item.php' and parse them with the spider's method parse_item
# # Rule(LinkExtractor(allow=('item\.php', )), callback='parse_item'),
# # )
# def parse_item(self, response):
# self.log('Hi, this is an item page! %s' % response.url)
# item = CraigslistSampleItem()
# #item['idS'] = response.xpath('//div [@id="ns_1MWNEJVV8STM0FC8Y4HQ_1948_ItemRow"]/text()').re(r'ID: (\d+)')
# item['name'] = response.xpath('//div[@id="ns_1MWNEJVV8STM0FC8Y4HQ_1948_ItemRow"]/text()').extract()
# item['description'] = response.xpath('//div[@id="ns_1MWNEJVV8STM0FC8Y4HQ_1948_ItemRow"]/text()').extract()
# return item
# from scrapy.spider import BaseSpider
# from scrapy.selector import HtmlXPathSelector
# from craigslist_sample.items import CraigslistSampleItem
# class MySpider(BaseSpider):
# name = "craig"
# allowed_domains = ["amazon.com"]
# start_urls = ["http://www.amazon.com"]
# # def parse(self, response):
# # for sel in response.xpath('//ul/li'):
# # item = CraigslistSampleItem()
# # item['title'] = sel.xpath('a/text()').extract()
# # item['link'] = sel.xpath('a/@href').extract()
# # yield item
# def parse(self, response):
# hxs = HtmlXPathSelector(response)
# titles = hxs.select("//li[@class='nav_first nav_subcat_link nav_pop_li']")
# items = []
# for titles in titles:
# item = CraigslistSampleItem()
# item ["title"] = titles.select("a/text()").extract()
# item ["link"] = titles.select("a/@href").extract()
# items.append(item)
# return items
# from scrapy.contrib.spiders import CrawlSpider, Rule
# from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
# from scrapy.selector import HtmlXPathSelector
# from craigslist_sample.items import CraigslistSampleItem
# class MySpider(CrawlSpider):
# name = "craigs"
# allowed_domains = ["amazon.com"]
# start_urls = ["http://www.amazon.com/s?rh=i%3Aappliances%2Cn%3A2619525011%2Cn%3A!2619526011%2Cn%3A3741361%2Cp_n_feature_three_browse-bin%3A2684328011&bbn=3741361&ie=UTF8&pf_rd_p=1925268562&pf_rd_s=merchandised-search-3&pf_rd_t=101&pf_rd_i=2619525011&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=1T97F5ASKND3S5ZKH46G&ref_=acs_ux_ct_2_a1_refTest"]
# rules = (Rule (SgmlLinkExtractor(allow=("index\d00\.html", ),restrict_xpaths=('//p[@class="nextpage"]',))
# , callback="parse_items", follow= True),
# )
# def parse_items(self, response):
# hxs = HtmlXPathSelector(response)
# titles = hxs.select('//span[@class="pl"]')
# items = []
# for titles in titles:
# item = CraigslistSampleItem()
# item ["title"] = titles.select("a/text()").extract()
# item ["link"] = titles.select("a/@href").extract()
# items.append(item)
# return(items)
# import scrapy
# from craigslist_sample.items import DmozItem
# class DmozSpider(scrapy.Spider):
# name = "dmoz"
# allowed_domains = ["dmoz.org"]
# start_urls = [
# "http://www.amazon.com/Televisions-Video/b/ref=nav_shopall_tv?ie=UTF8&node=1266092011"
# ]
# def parse(self, response):
# for sel in response.xpath('//ul/li'):
# item = DmozItem()
# item['title'] = sel.xpath('a/text()').extract()
# item['link'] = sel.xpath('a/@href').extract()
# # item['desc'] = sel.xpath('text()').extract()
# yield item
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import sys
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request


class MySQLStorePipeline(object):
    host = 'rerhr.com'
    user = 'amazon'
    password = 'sads23'
    db = 'amazon_project'

    def __init__(self):
        self.connection = MySQLdb.connect(self.host, self.user, self.password, self.db)
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        try:
            # extract() returns a list, so join it into a single string before inserting
            link = ''.join(item['link']).encode('utf-8')
            self.cursor.execute(
                """INSERT INTO amazon_project.ProductDepartment (ProductDepartmentLilnk)
                   VALUES (%s)""",
                (link,))
            # commit on self.connection (set in __init__); the original called self.conn,
            # which does not exist and raises AttributeError inside the except-less path
            self.connection.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item