I have the following code, which scrapes the data perfectly:
class DmozSpider(Spider):
    """Scrape the Reuters US archive index pages for a range of days.

    One archive URL is generated per day between ``a`` and ``b``; every page
    is parsed into ``DmozItem`` objects tagged with the date of that page.
    """

    name = "dmoz"
    allowed_domains = ["reuters.com"]

    # First and last archive day to crawl. Written as ``4`` / ``1`` rather
    # than ``04`` / ``01``: leading-zero integer literals are octal on
    # Python 2 and a syntax error on Python 3.
    a = datetime.date(2014, 4, 1)
    b = datetime.date(2014, 4, 2)

    articles = ET.Element("articles")

    # One archive page per day, e.g. .../archive/us/20140401.html
    urls = [
        "http://www.reuters.com/resources/archive/us/"
        + dt.strftime("%Y%m%d")
        + ".html"
        for dt in rrule(DAILY, dtstart=a, until=b)
    ]

    def start_requests(self):
        """Yield one request per archive URL, passing the page date in ``meta``."""
        for url in self.urls:
            # The file name without its ".html" suffix is the date (YYYYMMDD).
            # A fixed slice such as [-10:-4] cuts off the year and keeps the dot.
            page_date = url.rsplit("/", 1)[-1][:-len(".html")]
            # ``yield`` and ``Request`` must be one statement; a bare ``yield``
            # would emit ``None`` and throw the request away.
            yield Request(url=url, meta={'date': page_date}, callback=self.parse)

    def parse(self, response):
        """Build one ``DmozItem`` per matched content node of an archive page.

        Returns a list of items, each holding the extracted titles, links and
        times plus the date string passed along in ``response.meta``.
        """
        sel = Selector(response)
        sites = sel.xpath('//*[@id="content"]/div[2]/div/div/div[1]')
        passed_date = response.meta.get('date')

        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.xpath('.//div/a/text()').extract()
            item['link'] = site.xpath('.//a/@href').extract()
            item['time'] = site.xpath('.//div/text()').extract()
            item['date'] = passed_date
            items.append(item)
        return items
I would like to store the items in an XML file with the following structure:
<root>
<article_date>passed_date
<article_time>item['time']
<article_name>item['title']</article_name>
<article_link>item['link']</article_link>
</article_time>
</article_date></root>
Here is the pipeline I have tried to write (nothing is written to the target file):
import xml.etree.cElementTree as ET
class TutorialPipeline(object):
    """Item pipeline that exports every scraped item to an XML file.

    A file and an ``XmlItemExporter`` are created for each spider when it
    opens, and both are finalised when that spider closes.

    NOTE(review): Scrapy only runs pipelines that are listed in the
    ``ITEM_PIPELINES`` setting -- confirm this class is enabled there.
    """

    def __init__(self):
        # Hook into the spider lifecycle so the output file is opened and
        # closed together with the spider.
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}
        self.exporters = {}

    def spider_opened(self, spider):
        """Open the output file for ``spider`` and start its XML exporter."""
        import os.path

        # ``open`` does not expand "~"; without expanduser the path would be
        # looked up relative to a directory literally named "~".
        path = os.path.expanduser('~/Documents/test.xml')
        xml_file = open(path, 'w+b')
        self.files[spider] = xml_file
        self.exporters[spider] = XmlItemExporter(xml_file)
        self.exporters[spider].start_exporting()

    def spider_closed(self, spider):
        """Finish the XML document for ``spider`` and close its file."""
        exporter = self.exporters.pop(spider)
        xml_file = self.files.pop(spider)
        try:
            exporter.finish_exporting()
        finally:
            # Always release the file handle, even if finishing the export fails.
            xml_file.close()

    def process_item(self, item, spider):
        """Write ``item`` through the spider's exporter and pass it on unchanged."""
        self.exporters[spider].export_item(item)
        return item
--
You received this message because you are subscribed to the Google Groups
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To post to this group, send email to [email protected].
Visit this group at http://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.