I am trying to extract the content of .docx files using Scrapy with Python;
I am using python-docx to extract the document text.
I would like to know how to get the .docx content using Scrapy with Python.
Please let me know your views.
I hope you can help.
Thanks
import StringIO
from functools import partial
from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import Spider
from scrapy.selector import Selector
import urlparse
import sys
from docx import opendocx, getdocumenttext
from scrapy.item import Item, Field
class wordSpiderItem(Item):
    """Item produced by the spider for each .docx document it processes."""

    # URL of the document the item was built from.
    link = Field()
    # Declared for a document title; not populated by the visible spider code.
    title = Field()
    # Text extracted from the document body.
    Description = Field()
class wordSpider(CrawlSpider):
    """Scrape the careers page and extract the text of every linked .docx file.

    ``parse`` collects the document links from the start page and schedules a
    request for each .docx link; ``parse_data`` turns each downloaded document
    into a ``wordSpiderItem``.
    """

    name = "harrysfresh"
    # Stay within these domains when crawling
    allowed_domains = ["harrysfresh.com"]
    start_urls = ["https://www.harrysfresh.com/about-us/careers/"]

    # NOTE(review): Scrapy's documentation advises against overriding
    # ``parse`` on a CrawlSpider, because CrawlSpider uses it internally to
    # apply its rules. No rules are defined here, so consider deriving from
    # ``Spider`` instead -- confirm before changing the base class.

    def parse(self, response):
        """Yield one ``Request`` per .docx link found on the listings page.

        :param response: the downloaded start page.
        :returns: a generator of ``Request`` objects whose callback is
            ``parse_data``; the partially filled item travels in
            ``meta['item']``.
        """
        listings = response.xpath('//div[@class="entry-content"]')
        links = []
        # Scrape the listings page to get the listing links.
        for listing in listings:
            # The leading "." keeps the query relative to this listing; an
            # absolute "//span" would search the whole page on every pass
            # and collect the same links once per listing.
            links.extend(
                listing.xpath('.//span[@class="news_box"]/a/@href').extract()
            )

        # Request each linked document so its content can be parsed.
        for link in links:
            if "docx" not in link:
                continue
            item = wordSpiderItem()
            item['link'] = link
            # ``yield`` and the Request must form a single statement; a bare
            # ``yield`` on its own line emits None and drops the request.
            yield Request(
                urlparse.urljoin(response.url, link),
                meta={'item': item},
                callback=self.parse_data,
            )

    def parse_data(self, response):
        """Build an item holding the text of a downloaded .docx document.

        :param response: response whose body is the raw .docx file.
        :returns: a ``wordSpiderItem`` with ``link`` set to the document URL
            and ``Description`` set to the document text, one paragraph per
            line.
        """
        job = wordSpiderItem()
        job['link'] = response.url
        self.log("Extracting text from %s" % response.url)
        # A .docx file is a zip archive; wrap the body in a file-like object
        # so it can be opened without first being written to disk.
        stream = StringIO.StringIO(response.body)
        # ``opendocx`` parses the archive into a document tree and
        # ``getdocumenttext`` returns that tree's paragraphs as a list of
        # strings. There is no page concept in this API.
        document = opendocx(stream)
        paragraphs = getdocumenttext(document)
        job['Description'] = "\n".join(paragraphs)
        return job
--
You received this message because you are subscribed to the Google Groups
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To post to this group, send email to [email protected].
Visit this group at http://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.