# Scrapy spider script (Python):


from scrapy.spider import BaseSpider 
from scrapy.selector import HtmlXPathSelector 
from scrapy.http.request import Request
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.spider import Spider
from scrapy.selector import Selector
import urlparse 


from scrapy.item import Item, Field
class ScrapyDemoSpiderItem(Item):
    """Container for one scraped job listing.

    Fields are populated across two callbacks: ``link`` is set when the
    listing URL is discovered, the rest on the listing's detail page.
    """

    # Listing metadata
    title = Field()        # job title text node(s)
    content = Field()      # job description paragraph text
    link = Field()         # absolute URL of the listing page

    # Employer / classification metadata
    Companyname = Field()  # hard-coded to "Findly" by the spider
    type = Field()         # NOTE(review): never set by the visible spider code
    city = Field()         # NOTE(review): never set by the visible spider code

class ScrapyDemoSpider(Spider):
    """Crawl hrapply.com job listings for The Cosmopolitan of Las Vegas.

    ``parse`` collects listing links from the search page, then requests
    each link with the partially-filled item passed along in ``meta``;
    ``parse_listing_page`` fills in the remaining fields and yields the item.

    Inherits from ``Spider`` (not the deprecated ``BaseSpider`` — the crawl
    log warned about this) and uses ``.xpath()`` instead of the deprecated
    ``.select()``.
    """
    name = 'hrapply'
    allowed_domains = ['hrapply.com']
    start_urls = ['https://www.hrapply.com/thecosmopolitanoflasvegas/CompositeSearch.app']

    def parse(self, response):
        """Extract listing links from the search page and follow each one.

        Yields a ``Request`` per discovered link, carrying the item in
        ``meta['item']`` so the detail callback can finish populating it.
        """
        hxs = Selector(response)
        listings = hxs.xpath('.//*[@id="/content/JobListDetail"]')

        # Scrape the listings page to collect the listing links.
        # NOTE(review): the crawl log shows 0 items scraped — the
        # "tbody" element in this XPath is often inserted by browser
        # dev-tools but absent from the raw HTML; verify the path
        # against the actual page source.
        links = []
        for listing in listings:
            link = listing.xpath(
                './/*[@id="/content/JobListDetail"]/table/tbody/tr[3]/td[1]/a/@href'
            ).extract()
            links.extend(link)

        # Follow each listing URL to scrape the content of its page.
        for link in links:
            item = ScrapyDemoSpiderItem()
            item['link'] = link
            yield Request(urlparse.urljoin(response.url, link),
                          meta={'item': item},
                          callback=self.parse_listing_page)

    def parse_listing_page(self, response):
        """Populate the item carried in ``meta`` from the detail page and yield it."""
        hxs = Selector(response)
        item = response.request.meta['item']
        item['link'] = response.url          # overwrite with the final (possibly redirected) URL
        item['Companyname'] = "Findly"
        item['title'] = hxs.xpath("//div[@class='labelDef']/text()").extract()
        item['content'] = hxs.xpath("//div[@id='resumator-job-description']/p/text()").extract()
        yield item



# Console output (deprecation warnings shown; crawl finishes with zero items scraped):


Microsoft Windows XP [Version 5.1.2600]
(C) Copyright 1985-2001 Microsoft Corp.

C:\Documents and Settings\sureshp>cd D:\Scrapy\New Folder\New Folder\New 
Folder\
Scrapy_pro\Scrapy_pro\cosmopolitanlasvegascom

C:\Documents and Settings\sureshp>d:

D:\Scrapy\New Folder\New Folder\New 
Folder\Scrapy_pro\Scrapy_pro\cosmopolitanlas
vegascom>scrapy crawl hrapply -o test.json -t json
cosmopolitanlasvegascom\spiders\cosmopolitanlasvegascom.py:23: 
ScrapyDeprecation
Warning: 
cosmopolitanlasvegascom.spiders.cosmopolitanlasvegascom.ScrapyDemoSpide
r inherits from deprecated class scrapy.spider.BaseSpider, please inherit 
from s
crapy.spider.Spider. (warning only on first subclass, there may be others)
  class ScrapyDemoSpider(BaseSpider):
2014-09-03 21:06:31+0530 [scrapy] INFO: Scrapy 0.24.2 started (bot: 
cosmopolitan
lasvegascom)
2014-09-03 21:06:31+0530 [scrapy] INFO: Optional features available: ssl, 
http11

2014-09-03 21:06:31+0530 [scrapy] INFO: Overridden settings: 
{'NEWSPIDER_MODULE'
: 'cosmopolitanlasvegascom.spiders', 'FEED_FORMAT': 'json', 
'SPIDER_MODULES': ['
cosmopolitanlasvegascom.spiders'], 'FEED_URI': 'test.json', 'BOT_NAME': 
'cosmopo
litanlasvegascom'}
2014-09-03 21:06:31+0530 [scrapy] INFO: Enabled extensions: FeedExporter, 
LogSta
ts, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2014-09-03 21:06:32+0530 [scrapy] INFO: Enabled downloader middlewares: 
HttpAuth
Middleware, DownloadTimeoutMiddleware, UserAgentMiddleware, 
RetryMiddleware, Def
aultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, 
Redirec
tMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2014-09-03 21:06:32+0530 [scrapy] INFO: Enabled spider middlewares: 
HttpErrorMid
dleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, 
DepthMiddlew
are
2014-09-03 21:06:32+0530 [scrapy] INFO: Enabled item pipelines:
2014-09-03 21:06:32+0530 [hrapply] INFO: Spider opened
2014-09-03 21:06:32+0530 [hrapply] INFO: Crawled 0 pages (at 0 pages/min), 
scrap
ed 0 items (at 0 items/min)
2014-09-03 21:06:32+0530 [scrapy] DEBUG: Telnet console listening on 
127.0.0.1:6
023
2014-09-03 21:06:32+0530 [scrapy] DEBUG: Web service listening on 
127.0.0.1:6080

2014-09-03 21:06:33+0530 [hrapply] DEBUG: Crawled (200) <GET 
https://www.hrapply
.com/thecosmopolitanoflasvegas/CompositeSearch.app> (referer: None)
cosmopolitanlasvegascom\spiders\cosmopolitanlasvegascom.py:30: 
ScrapyDeprecation
Warning: Call to deprecated function select. Use .xpath() instead.
  listings = hxs.select('.//*[@id="/content/JobListDetail"]')
2014-09-03 21:06:33+0530 [hrapply] INFO: Closing spider (finished)
2014-09-03 21:06:33+0530 [hrapply] INFO: Dumping Scrapy stats:
        {'downloader/request_bytes': 259,
         'downloader/request_count': 1,
         'downloader/request_method_count/GET': 1,
         'downloader/response_bytes': 7253,
         'downloader/response_count': 1,
         'downloader/response_status_count/200': 1,
         'finish_reason': 'finished',
         'finish_time': datetime.datetime(2014, 9, 3, 15, 36, 33, 656000),
         'log_count/DEBUG': 3,
         'log_count/INFO': 7,
         'response_received_count': 1,
         'scheduler/dequeued': 1,
         'scheduler/dequeued/memory': 1,
         'scheduler/enqueued': 1,
         'scheduler/enqueued/memory': 1,
         'start_time': datetime.datetime(2014, 9, 3, 15, 36, 32, 312000)}
2014-09-03 21:06:33+0530 [hrapply] INFO: Spider closed (finished)

D:\Scrapy\New Folder\New Folder\New 
Folder\Scrapy_pro\Scrapy_pro\cosmopolitanlas
vegascom>


Please let me know your views on why no items are being scraped.

-- 
You received this message because you are subscribed to the Google Groups 
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To post to this group, send email to [email protected].
Visit this group at http://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.

Reply via email to