I have attached all of my code. I am not able to insert data from Scrapy into MySQL.
With the attached code in place I run the following command:
scrapy crawl amazon
Please help me: how can I insert the scraped items into MySQL?
Thanks
# Define here the models for your scraped items
import scrapy


class AmazonDepartmentItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field(serializer=str)

# import scrapy
# class Example_spider(BaseSpider):
# name = "amazon"
# allowed_domains = ["www.amazon.com"]
# def start_requests(self):
# yield self.make_requests_from_url("http://www.amazon.com/Televisions-Video/b/ref=nav_shopall_tv?ie=UTF8&node=1266092011")
# def parse(self, response):
# hxs = HtmlXPathSelector(response)
# urls = hxs.select('//li[@class="refinementImage"]/a/@href').extract()
# for i in urls:
# yield Request(urljoin("http://www.amazon.com/", i[1:]), callback=self.parse_url)
# def parse_url(self, response):
# hxs = HtmlXPathSelector(response)
# main = hxs.select('//div[@id="bookshelf-bg"]')
# items = []
# for i in main:
# item = Exampleitem()
# item['book_name'] = i.select('div[@class="slickwrap full"]/div[@id="bookstore_detail"]/div[@class="book_listing clearfix"]/div[@class="bookstore_right"]/div[@class="title_and_byline"]/p[@class="book_title"]/text()')[0].extract()
# item['price'] = i.select('div[@id="book-sidebar-modules"]/div[@class="add_to_cart_wrapper slickshadow"]/div[@class="panes"]/div[@class="pane clearfix"]/div[@class="inner"]/div[@class="add_to_cart 0"]/form/div[@class="line-item"]/div[@class="line-item-price"]/text()').extract()
# items.append(item)
# return items
# get the information of all department shops
# spider for all departments
import scrapy
from craigslist_sample.items import AmazonDepartmentItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor


class AmazonAllDepartmentSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ["amazon.com"]
    start_urls = [
        "http://www.amazon.com/gp/site-directory/ref=nav_sad/187-3757581-3331414"
    ]

    def parse(self, response):
        for sel in response.xpath('//ul/li'):
            item = AmazonDepartmentItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            item['desc'] = sel.xpath('text()').extract()
            # yield each item inside the loop; "return item" emits only a single item
            yield item
# get seller info of product category
# spider of seller
# class AmazonSellerSpider(scrapy.Spider):
# name = "amazon"
# allowed_domains = ["amazon.COM"]
# start_urls = [
# "http://www.amazon.com/Televisions-Video/b/ref=sd_allcat_tv?ie=UTF8&node=1266092011"
# ]
# rules = (
# # Extract links matching 'category.php' (but not matching 'subsection.php')
# # and follow links from them (since no callback means follow=True by default).
# Rule(LinkExtractor(allow=('other', ), deny=('/s/ref',))),
# # Extract links matching 'item.php' and parse them with the spider's method parse_item
# Rule(LinkExtractor(allow=('other', )), callback='parse_item'),
# )
# def parse(self, response):
# for sel in response.xpath('//ul[@id="ref_303116011"]/li'):
# item = AmazonDepartmentItem()
# item['title'] = sel.xpath('a/text()').extract()
# item['link'] = sel.xpath('a/@href').extract()
# item['desc'] = sel.xpath('string(.)').extract()
# yield item
# get product info of product category
# spider of product
# class AmazonProductSpider(scrapy.Spider):
# name = "amazon"
# allowed_domains = ["amazon.com"]
# start_urls = [
# "http://www.amazon.com/s/ref=sr_in_-2_p_6_5?fst=as%3Aoff&rh=n%3A172282%2Cn%3A%21493964%2Cn%3A1266092011%2Cp_6%3AATVPDKIKX0DER&bbn=1266092011&ie=UTF8&qid=1416570979&rnid=303116011"
# ]
# def parse(self, response):
# for sel in response.xpath('//div[@class="s-item-container"]'):
# item = AmazonDepartmentItem()
# item['title'] = sel.xpath('a/text()').extract()
# item['link'] = sel.xpath('a/@href').extract()
# item['desc'] = sel.xpath('string(.)').extract(),
# # item['desc'] = sel.xpath('@*').extract(),
# yield item
# jst get all department from web
# class AmazonDepartmentSpider(scrapy.Spider):
# name = "amazon"
# allowed_domains = ["amazon.com"]
# start_urls = [
# "http://www.amazon.com"
# ]
# def parse(self, response):
# for sel in response.xpath('//div[@id=nav-bar-left"]'):
# item = AmazonDepartmentItem()
# item['title'] = sel.xpath('a/text()').extract()
# item['link'] = sel.xpath('a/@href/text()').extract()
# item['desc'] = sel.xpath('text()').extract()
# yield item
# from scrapy.spider import BaseSpider
# from scrapy.selector import HtmlXPathSelector
# from craigslist_sample.items import CraigslistSampleItem
# class MySpider(BaseSpider):
# name = "craig"
# allowed_domains = ["amazon.com"]
# start_urls = ["http://www.amazon.com/Televisions-Video/b/ref=nav_shopall_tv?ie=UTF8&node=1266092011"]
# def parse(self, response):
# hxs = HtmlXPathSelector(response)
# titles = hxs.select("//li[@class='refinementImage']")
# items = []
# for titles in titles:
# item = CraigslistSampleItem()
# item ["title"] = titles.select("a/text()").extract()
# item ["link"] = titles.select("a/@href").extract()
# items.append(item)
# return items
# from scrapy.spider import BaseSpider
# from scrapy.selector import HtmlXPathSelector
# from craigslist_sample.items import CraigslistSampleItem
# class MySpider(BaseSpider):
# name = 'amazon'
# allowed_domains = ['amazon.com']
# start_urls = ['http://www.amazon.com/']
# # rules = (
# # # Extract links matching 'category.php' (but not matching 'subsection.php')
# # # and follow links from them (since no callback means follow=True by default).
# # Rule(LinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
# # # Extract links matching 'item.php' and parse them with the spider's method parse_item
# # Rule(LinkExtractor(allow=('item\.php', )), callback='parse_item'),
# # )
# def parse_item(self, response):
# self.log('Hi, this is an item page! %s' % response.url)
# item = CraigslistSampleItem()
# #item['idS'] = response.xpath('//div [@id="ns_1MWNEJVV8STM0FC8Y4HQ_1948_ItemRow"]/text()').re(r'ID: (\d+)')
# item['name'] = response.xpath('//div[@id="ns_1MWNEJVV8STM0FC8Y4HQ_1948_ItemRow"]/text()').extract()
# item['description'] = response.xpath('//div[@id="ns_1MWNEJVV8STM0FC8Y4HQ_1948_ItemRow"]/text()').extract()
# return item
# from scrapy.spider import BaseSpider
# from scrapy.selector import HtmlXPathSelector
# from craigslist_sample.items import CraigslistSampleItem
# class MySpider(BaseSpider):
# name = "craig"
# allowed_domains = ["amazon.com"]
# start_urls = ["http://www.amazon.com"]
# # def parse(self, response):
# # for sel in response.xpath('//ul/li'):
# # item = CraigslistSampleItem()
# # item['title'] = sel.xpath('a/text()').extract()
# # item['link'] = sel.xpath('a/@href').extract()
# # yield item
# def parse(self, response):
# hxs = HtmlXPathSelector(response)
# titles = hxs.select("//li[@class='nav_first nav_subcat_link nav_pop_li']")
# items = []
# for titles in titles:
# item = CraigslistSampleItem()
# item ["title"] = titles.select("a/text()").extract()
# item ["link"] = titles.select("a/@href").extract()
# items.append(item)
# return items
# from scrapy.contrib.spiders import CrawlSpider, Rule
# from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
# from scrapy.selector import HtmlXPathSelector
# from craigslist_sample.items import CraigslistSampleItem
# class MySpider(CrawlSpider):
# name = "craigs"
# allowed_domains = ["amazon.com"]
# start_urls = ["http://www.amazon.com/s?rh=i%3Aappliances%2Cn%3A2619525011%2Cn%3A!2619526011%2Cn%3A3741361%2Cp_n_feature_three_browse-bin%3A2684328011&bbn=3741361&ie=UTF8&pf_rd_p=1925268562&pf_rd_s=merchandised-search-3&pf_rd_t=101&pf_rd_i=2619525011&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=1T97F5ASKND3S5ZKH46G&ref_=acs_ux_ct_2_a1_refTest"]
# rules = (Rule (SgmlLinkExtractor(allow=("index\d00\.html", ),restrict_xpaths=('//p[@class="nextpage"]',))
# , callback="parse_items", follow= True),
# )
# def parse_items(self, response):
# hxs = HtmlXPathSelector(response)
# titles = hxs.select('//span[@class="pl"]')
# items = []
# for titles in titles:
# item = CraigslistSampleItem()
# item ["title"] = titles.select("a/text()").extract()
# item ["link"] = titles.select("a/@href").extract()
# items.append(item)
# return(items)
# import scrapy
# from craigslist_sample.items import DmozItem
# class DmozSpider(scrapy.Spider):
# name = "dmoz"
# allowed_domains = ["dmoz.org"]
# start_urls = [
# "http://www.amazon.com/Televisions-Video/b/ref=nav_shopall_tv?ie=UTF8&node=1266092011"
# ]
# def parse(self, response):
# for sel in response.xpath('//ul/li'):
# item = DmozItem()
# item['title'] = sel.xpath('a/text()').extract()
# item['link'] = sel.xpath('a/@href').extract()
# # item['desc'] = sel.xpath('text()').extract()
# yield item
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import sys
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request


class MySQLStorePipeline(object):
    host = 'rerhr.com'
    user = 'amazon'
    password = 'sads23'
    db = 'amazon_project'

    def __init__(self):
        self.connection = MySQLdb.connect(self.host, self.user, self.password, self.db)
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        try:
            # extract() returns a list, so join it into a single string before inserting
            link = ''.join(item['link']).encode('utf-8')
            self.cursor.execute(
                """INSERT INTO amazon_project.ProductDepartment (ProductDepartmentLilnk)
                   VALUES (%s)""",
                (link,))
            # commit on self.connection (set in __init__); the original called self.conn,
            # which does not exist and raises AttributeError inside the except-less path
            self.connection.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item