How to get content from docx file using scrapy with python

james Wed, 10 Sep 2014 02:29:15 -0700

Here I am trying to extract content from a docx file using Scrapy with Python; 
I have used python-docx to extract the docx content.


I am wondering how to get docx content using Scrapy with Python.

Please let me know your views

hope you
Thanks



import StringIO
from functools import partial
from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import Spider
from scrapy.selector import Selector
import urlparse
import sys
from docx import opendocx, getdocumenttext



from scrapy.item import Item, Field

class wordSpiderItem(Item):
    """Item produced by the spider for each document link it follows."""

    # URL of the document (the spider stores the link / response URL here).
    link = Field()
    # NOTE(review): never populated by the spider in this file -- presumably
    # intended for the document title; confirm before relying on it.
    title = Field()
    # Text extracted from the downloaded document.
    Description = Field()

class wordSpider(CrawlSpider):
    """Crawl the careers page and extract the text of every linked docx file.

    ``parse`` collects the document links from the start page and schedules a
    download for each link that points at a docx file; ``parse_data`` turns
    each downloaded file into a ``wordSpiderItem``.

    NOTE(review): Scrapy's documentation advises against overriding
    ``parse`` on a ``CrawlSpider`` because that class uses it to drive its
    rules.  No rules are defined here, so the override is harmless, but a
    plain ``Spider`` base class would express the intent better.
    """

    name = "harrysfresh"

    # Stay within these domains when crawling
    allowed_domains = ["harrysfresh.com"]
    start_urls = ["https://www.harrysfresh.com/about-us/careers/"]

    def parse(self, response):
        """Yield one download request per docx link on the listings page.

        :param response: response for one of ``start_urls``.
        :returns: generator of ``Request`` objects whose callback is
            ``parse_data``; the partially filled item travels in
            ``meta['item']``.
        """
        # Scrape the listings page to collect the listing links.
        links = []
        for listing in response.xpath('//div[@class="entry-content"]'):
            # The leading "." keeps the query relative to this listing; a
            # bare "//" would search the whole document again on every
            # iteration and produce duplicate links.
            links.extend(
                listing.xpath('.//span[@class="news_box"]/a/@href').extract()
            )

        # Follow only the links that point at docx files.
        for link in links:
            if "docx" not in link:
                continue
            item = wordSpiderItem()
            item['link'] = link
            # hrefs may be relative, so resolve them against the page URL.
            yield Request(
                urlparse.urljoin(response.url, link),
                meta={'item': item},
                callback=self.parse_data
            )

    def parse_data(self, response):
        """Build an item holding the text of a downloaded docx file.

        :param response: response whose body is the raw docx file.
        :returns: the ``wordSpiderItem`` passed through ``meta['item']``
            (or a fresh one if none was passed) with ``link`` set to the
            final URL and ``Description`` set to the document text,
            one paragraph per line.
        """
        job = response.meta.get('item')
        if job is None:
            job = wordSpiderItem()
        job['link'] = response.url
        self.log("Extracting docx content from %s" % response.url)

        # A docx file is a zip archive; opendocx() accepts a file-like
        # object, so wrap the in-memory body instead of writing it to disk.
        stream = StringIO.StringIO(response.body)
        document = opendocx(stream)

        # getdocumenttext() returns the document as a list of paragraphs.
        job['Description'] = "\n".join(getdocumenttext(document))
        return job
            

-- 
You received this message because you are subscribed to the Google Groups 
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To post to this group, send email to [email protected].
Visit this group at http://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.

How to get content from docx file using scrapy with python

Reply via email to