#Scrapy with python script:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.spider import Spider
from scrapy.selector import Selector
import urlparse
from scrapy.item import Item, Field
class ScrapyDemoSpiderItem(Item):
title = Field()
link = Field()
content = Field()
Companyname = Field()
type = Field()
city = Field()
class ScrapyDemoSpider(BaseSpider):
name = 'hrapply'
allowed_domains = ['hrapply.com']
start_urls =
['https://www.hrapply.com/thecosmopolitanoflasvegas/CompositeSearch.app']
def parse(self,response):
hxs = Selector(response)
listings = hxs.select('.//*[@id="/content/JobListDetail"]')
links = []
#scrap listings page to get listing links
for listing in listings:
link=listing.select('.//*[@id="/content/JobListDetail"]/table/tbody/tr[3]/td[1]/a/@href').extract()
links.extend(link)
#parse listing url to get content of the listing page
for link in links:
item=ScrapyDemoSpiderItem()
item['link']=link
yield Request(urlparse.urljoin(response.url, link),
meta={'item':item},callback=self.parse_listing_page)
#scrap listing page to get content
def parse_listing_page(self,response):
hxs = Selector(response)
item = response.request.meta['item']
item ['link'] = response.url
item['Companyname'] = "Findly"
item['title'] =
hxs.select("//div[@class='labelDef']/text()").extract()
item['content'] =
hxs.select("//div[@id='resumator-job-description']/p/text()").extract()
yield item
# Error :
Microsoft Windows XP [Version 5.1.2600]
(C) Copyright 1985-2001 Microsoft Corp.
C:\Documents and Settings\sureshp>cd D:\Scrapy\New Folder\New Folder\New
Folder\
Scrapy_pro\Scrapy_pro\cosmopolitanlasvegascom
C:\Documents and Settings\sureshp>d:
D:\Scrapy\New Folder\New Folder\New
Folder\Scrapy_pro\Scrapy_pro\cosmopolitanlas
vegascom>scrapy crawl hrapply -o test.json -t json
cosmopolitanlasvegascom\spiders\cosmopolitanlasvegascom.py:23:
ScrapyDeprecation
Warning:
cosmopolitanlasvegascom.spiders.cosmopolitanlasvegascom.ScrapyDemoSpide
r inherits from deprecated class scrapy.spider.BaseSpider, please inherit
from s
crapy.spider.Spider. (warning only on first subclass, there may be others)
class ScrapyDemoSpider(BaseSpider):
2014-09-03 21:06:31+0530 [scrapy] INFO: Scrapy 0.24.2 started (bot:
cosmopolitan
lasvegascom)
2014-09-03 21:06:31+0530 [scrapy] INFO: Optional features available: ssl,
http11
2014-09-03 21:06:31+0530 [scrapy] INFO: Overridden settings:
{'NEWSPIDER_MODULE'
: 'cosmopolitanlasvegascom.spiders', 'FEED_FORMAT': 'json',
'SPIDER_MODULES': ['
cosmopolitanlasvegascom.spiders'], 'FEED_URI': 'test.json', 'BOT_NAME':
'cosmopo
litanlasvegascom'}
2014-09-03 21:06:31+0530 [scrapy] INFO: Enabled extensions: FeedExporter,
LogSta
ts, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2014-09-03 21:06:32+0530 [scrapy] INFO: Enabled downloader middlewares:
HttpAuth
Middleware, DownloadTimeoutMiddleware, UserAgentMiddleware,
RetryMiddleware, Def
aultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware,
Redirec
tMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2014-09-03 21:06:32+0530 [scrapy] INFO: Enabled spider middlewares:
HttpErrorMid
dleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware,
DepthMiddlew
are
2014-09-03 21:06:32+0530 [scrapy] INFO: Enabled item pipelines:
2014-09-03 21:06:32+0530 [hrapply] INFO: Spider opened
2014-09-03 21:06:32+0530 [hrapply] INFO: Crawled 0 pages (at 0 pages/min),
scrap
ed 0 items (at 0 items/min)
2014-09-03 21:06:32+0530 [scrapy] DEBUG: Telnet console listening on
127.0.0.1:6
023
2014-09-03 21:06:32+0530 [scrapy] DEBUG: Web service listening on
127.0.0.1:6080
2014-09-03 21:06:33+0530 [hrapply] DEBUG: Crawled (200) <GET
https://www.hrapply
.com/thecosmopolitanoflasvegas/CompositeSearch.app> (referer: None)
cosmopolitanlasvegascom\spiders\cosmopolitanlasvegascom.py:30:
ScrapyDeprecation
Warning: Call to deprecated function select. Use .xpath() instead.
listings = hxs.select('.//*[@id="/content/JobListDetail"]')
2014-09-03 21:06:33+0530 [hrapply] INFO: Closing spider (finished)
2014-09-03 21:06:33+0530 [hrapply] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 259,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 7253,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2014, 9, 3, 15, 36, 33, 656000),
'log_count/DEBUG': 3,
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2014, 9, 3, 15, 36, 32, 312000)}
2014-09-03 21:06:33+0530 [hrapply] INFO: Spider closed (finished)
D:\Scrapy\New Folder\New Folder\New
Folder\Scrapy_pro\Scrapy_pro\cosmopolitanlas
vegascom>
Please let me know your views
--
You received this message because you are subscribed to the Google Groups
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To post to this group, send email to [email protected].
Visit this group at http://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.