from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
class MySpider(CrawlSpider):
    """Scrape title/link pairs from the page listed in ``start_urls``.

    The spider downloads the start URL and hands the response to
    ``parse_items`` (via ``parse_start_url``), which builds one
    ``CraigslistSampleItem`` per node matched by the XPath used there.
    """

    name = "giidli"
    allowed_domains = []
    start_urls = ["http://www.wikipedia.org/"]

    # No crawl rules are active, so only the start URL itself is scraped.
    # The rule below was already disabled in the original code; it is kept
    # (with its XPath quoting corrected) for reference only.
    # NOTE(review): enabling it with follow=True while allowed_domains is
    # empty would presumably let the crawl spread well beyond the start
    # page -- confirm the intended scope before turning it on.
    #
    # Rule(
    #     SgmlLinkExtractor(
    #         allow=(),
    #         restrict_xpaths=(
    #             '//*[@id="www-wikipedia-org"]/div[6]/div[3]/div',
    #         ),
    #     ),
    #     callback="parse_items",
    #     follow=True,
    # ),
    rules = ()

    def parse_start_url(self, response):
        """Route the start-URL response through ``parse_items``.

        ``CrawlSpider`` passes start-URL responses to this hook rather than
        to rule callbacks; without this override ``parse_items`` would never
        run for the start page while ``rules`` is empty.
        """
        return self.parse_items(response)

    def parse_items(self, response):
        """Extract one item per matched block of the response.

        Args:
            response: the downloaded page to scrape.

        Returns:
            A list of ``CraigslistSampleItem`` objects whose ``title`` and
            ``link`` fields each hold the list of strings extracted for
            that block (possibly empty when nothing matches).
        """
        hxs = HtmlXPathSelector(response)
        # The id value must be quoted *inside* the XPath expression. The
        # original doubled single quotes were concatenated away by Python,
        # leaving an unquoted id that matched no element.
        blocks = hxs.xpath('//*[@id="www-wikipedia-org"]/div[6]/div[3]/div')

        items = []
        for block in blocks:
            item = CraigslistSampleItem()
            # Relative XPaths: evaluated against the current block only.
            item["title"] = block.xpath('a/div[2]/span[1]/text()').extract()
            item["link"] = block.xpath('a/@href').extract()
            items.append(item)
        return items
--
You received this message because you are subscribed to the Google Groups
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to scrapy-users+unsubscribe@googlegroups.com.
To post to this group, send email to scrapy-users@googlegroups.com.
Visit this group at https://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.