I'm practicing Scrapy and want to ask a question.
Target page: https://eapplicant.northshore.org/psc/psapp/EMPLOYEE/HRMS/c/HRS_HRAM.HRS_CE.GBL

**Please let me know how to pick the CSS selector paths for the title, the description, and the next-page link.**

**Language: Python + Scrapy + ScrapingHub Splash**

Spider code:

```python
import re
import urlparse

from scrapy.http.request import Request
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.item import Item, Field


class NorthshoreSpiderItem(Item):
    Title = Field()
    Link = Field()
    Description = Field()
    Companyname = Field()


class NorthshoreSpider(Spider):
    name = 'northshore'
    # bare domain, so eapplicant.northshore.org is not filtered as offsite
    allowed_domains = ['northshore.org']
    start_urls = ['https://eapplicant.northshore.org/psc/psapp/EMPLOYEE/HRMS/c/HRS_HRAM.HRS_CE.GBL?&']

    def parse(self, response):
        selector = Selector(response)
        links = []
        # this is the selector that fails -- see the SelectorSyntaxError in the log below
        for link in selector.css('div.divgbrHRS_CE_JO_EXT_I$0 tr.trHRS_CE_JO_EXT_I$0_row1 > a.title.heading.win0divPOSTINGTITLE -title::attr(href)').extract():
            yield Request(urlparse.urljoin(response.url, link),
                          callback=self.parse_listing_page,
                          #meta={"use_splash": False}
                          )

        next_page_link = selector.css('div.pages > a:last-child:not(.disabled)')
        if next_page_link:
            # the listing is paginated in steps of 10 via the "st" query parameter
            def increment10(matchobj):
                return "st=" + str(int(matchobj.group("pagenum")) + 10)

            next_page_url = re.sub(r'st=(?P<pagenum>\d+)', increment10, response.url)
            print "next page:", next_page_url
            yield Request(next_page_url, self.parse,
                          #meta={"use_splash": True},
                          dont_filter=True)

    def parse_listing_page(self, response):
        selector = Selector(response)
        item = NorthshoreSpiderItem()
        item['Companyname'] = "Northshore"
        item['Link'] = response.url
        item['Title'] = selector.xpath("//span[@id='HRS_JO_WRK_POSTING_TITLE$0']/text()").extract()
        item['Description'] = selector.xpath("string(//div[@id='win0divHRS_JO_PST_DSCR$0'])").extract()
        yield item
```
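A note on the failing selector: PeopleSoft ids and classes such as `HRS_CE_JO_EXT_I$0` contain `$`, which is not a valid character in a bare `.class`/`#id` CSS token, so cssselect refuses to parse it (that is what the SelectorSyntaxError in the log below complains about). Here is a hedged sketch of two forms that at least parse; the `POSTINGTITLE` id prefix is an assumption lifted from the selector above and should be verified against the real page markup:

```python
# sketch only -- the "POSTINGTITLE" id prefix is an assumption; check the page source

# CSS attribute selectors quote the value, so '$' is allowed inside it:
links = selector.css('a[id^="POSTINGTITLE"]::attr(href)').extract()

# equivalent XPath, matching the style parse_listing_page already uses:
links = selector.xpath('//a[starts-with(@id, "POSTINGTITLE")]/@href').extract()
```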
Here is the Splash middleware code:

```python
import re

from scrapy import log, signals
from scrapy.http import Request
from scrapy.exceptions import NotConfigured
import w3lib.url

_matches = lambda url, regexs: any((r.search(url) for r in regexs))


class SplashMiddleware(object):
    # default Splash render endpoint; _settings reads this back as self.endpoint
    endpoint = 'http://localhost:8050/render.html'
    wait = 0.5
    url_pass = ()
    url_block = ()

    _settings = [
        'endpoint',
        'wait',
        'images',
        'js',
        'filters',
        'viewport',
    ]

    def __init__(self, crawler):
        self.crwlr_settings = crawler.settings
        self.crwlr_stats = crawler.stats

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('SPLASH_ENABLED'):
            raise NotConfigured
        o = cls(crawler)
        crawler.signals.connect(o.open_spider, signal=signals.spider_opened)
        return o

    def open_spider(self, spider):
        for k in self._settings:
            setattr(self, k, self._get_setting_value(spider, self.crwlr_settings, k))

        # check URL filters
        url_pass = self._get_setting_value(spider, self.crwlr_settings, 'url_pass')
        if url_pass:
            self.url_pass = [re.compile(x) for x in url_pass]
        url_block = self._get_setting_value(spider, self.crwlr_settings, 'url_block')
        if url_block:
            self.url_block = [re.compile(x) for x in url_block]

    def _get_setting_value(self, spider, settings, k):
        # spider attribute splash_<k> wins over the SPLASH_<K> setting,
        # which wins over the class-level default
        o = getattr(self, k, None)
        s = settings.get('SPLASH_' + k.upper(), o)
        return getattr(spider, 'splash_' + k, s)

    def _needs_wrapping(self, request):
        # already wrapped
        if request.meta.get("splashed_url", False):
            return False
        # force wrap or not
        use_splash = request.meta.get("use_splash", None)
        if use_splash is not None:
            return use_splash == True
        # check URL regexes
        if not self.url_pass and not self.url_block:
            return False
        if self.url_pass and not _matches(request.url, self.url_pass):
            return False
        if self.url_block and _matches(request.url, self.url_block):
            return False
        return True

    def process_request(self, request, spider):
        if self._needs_wrapping(request):
            self.crwlr_stats.inc_value('splash/wrapped', spider=spider)
            return self._wrap_url(request)

    def process_response(self, request, response, spider):
        if request.meta.get('splashed_url', False):
            self.crwlr_stats.inc_value('splash/unwrapped', spider=spider)
            return self._unwrap_url(request, response)
        else:
            return response

    def _wrap_url(self, request):
        print request.url
        wrapped = w3lib.url.add_or_replace_parameter(self.endpoint, 'url', request.url)
        print wrapped
        print

        # pass options
        wrapped = w3lib.url.add_or_replace_parameter(wrapped, 'wait', self.wait)
        if self.viewport:
            wrapped = w3lib.url.add_or_replace_parameter(wrapped, 'viewport', self.viewport)
        wrapped = w3lib.url.add_or_replace_parameter(wrapped, 'images', 1 if self.images else 0)
        if self.js:
            wrapped = w3lib.url.add_or_replace_parameter(wrapped, 'js', self.js)
        if self.filters:
            wrapped = w3lib.url.add_or_replace_parameter(wrapped, 'filters', self.filters)
        return request.replace(url=wrapped, meta={"splashed_url": request.url})

    def _unwrap_url(self, request, response):
        unwrapped = w3lib.url.url_query_parameter(request.url, 'url')
        response = response.replace(url=unwrapped)
        return response
```
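For context, a minimal sketch of how this middleware would be wired up in settings.py, going by the `SPLASH_*` names the code reads in `from_crawler` and `_get_setting_value`. The middleware module path and priority are assumptions; adjust them to the actual project layout:

```python
# settings.py -- a sketch, not taken from the actual project
SPLASH_ENABLED = True                                   # checked in from_crawler()
SPLASH_ENDPOINT = 'http://localhost:8050/render.html'   # read back as self.endpoint
SPLASH_WAIT = 0.5                                       # read back as self.wait

DOWNLOADER_MIDDLEWARES = {
    # hypothetical module path; point this at wherever SplashMiddleware lives
    'NorthshoreorgDemo.middlewares.SplashMiddleware': 725,
}
```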
I am getting the following error output:

```
[josh@dpitstsvr015 NorthshoreorgDemo]$ scrapy crawl northshore
2014-11-06 15:38:39+0530 [scrapy] INFO: Scrapy 0.24.4 started (bot: NorthshoreorgDemo)
2014-11-06 15:38:39+0530 [scrapy] INFO: Optional features available: ssl, http11
2014-11-06 15:38:39+0530 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'NorthshoreorgDemo.spiders', 'SPIDER_MODULES': ['NorthshoreorgDemo.spiders'], 'BOT_NAME': 'NorthshoreorgDemo'}
2014-11-06 15:38:39+0530 [scrapy] INFO: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2014-11-06 15:38:39+0530 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2014-11-06 15:38:39+0530 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2014-11-06 15:38:39+0530 [scrapy] INFO: Enabled item pipelines:
2014-11-06 15:38:39+0530 [northshore] INFO: Spider opened
2014-11-06 15:38:39+0530 [northshore] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2014-11-06 15:38:39+0530 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2014-11-06 15:38:39+0530 [scrapy] DEBUG: Web service listening on 127.0.0.1:6080
2014-11-06 15:38:40+0530 [northshore] DEBUG: Redirecting (302) to <GET https://eapplicant.northshore.org/psc/psapp/?cmd=login&errorPg=ckreq&languageCd=ENG> from <GET https://eapplicant.northshore.org/psc/psapp/EMPLOYEE/HRMS/c/HRS_HRAM.HRS_CE.GBL?& >
2014-11-06 15:38:41+0530 [northshore] DEBUG: Crawled (200) <GET https://eapplicant.northshore.org/psc/psapp/?cmd=login&errorPg=ckreq&languageCd=ENG> (referer: None)
2014-11-06 15:38:41+0530 [northshore] ERROR: Spider error processing <GET https://eapplicant.northshore.org/psc/psapp/?cmd=login&errorPg=ckreq&languageCd=ENG>
    Traceback (most recent call last):
      File "/usr/lib64/python2.7/site-packages/twisted/internet/base.py", line 824, in runUntilCurrent
        call.func(*call.args, **call.kw)
      File "/usr/lib64/python2.7/site-packages/twisted/internet/task.py", line 638, in _tick
        taskObj._oneWorkUnit()
      File "/usr/lib64/python2.7/site-packages/twisted/internet/task.py", line 484, in _oneWorkUnit
        result = next(self._iterator)
      File "/usr/lib64/python2.7/site-packages/scrapy/utils/defer.py", line 57, in <genexpr>
        work = (callable(elem, *args, **named) for elem in iterable)
    --- <exception caught here> ---
      File "/usr/lib64/python2.7/site-packages/scrapy/utils/defer.py", line 96, in iter_errback
        yield next(it)
      File "/usr/lib64/python2.7/site-packages/scrapy/contrib/spidermiddleware/offsite.py", line 26, in process_spider_output
        for x in result:
      File "/usr/lib64/python2.7/site-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
        return (_set_referer(r) for r in result or ())
      File "/usr/lib64/python2.7/site-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
        return (r for r in result or () if _filter(r))
      File "/usr/lib64/python2.7/site-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
        return (r for r in result or () if _filter(r))
      File "/home/sureshp/Downloads/NorthshoreorgDemo/NorthshoreorgDemo/spiders/northshore.py", line 23, in parse
        for link in selector.css('div.divgbrHRS_CE_JO_EXT_I$0 tr.trHRS_CE_JO_EXT_I$0_row1 > a.title.heading.win0divPOSTINGTITLE -title::attr(href)').extract():
      File "/usr/lib64/python2.7/site-packages/scrapy/selector/unified.py", line 110, in css
        return self.xpath(self._css2xpath(query))
      File "/usr/lib64/python2.7/site-packages/scrapy/selector/unified.py", line 113, in _css2xpath
        return self._csstranslator.css_to_xpath(query)
      File "/usr/lib/python2.7/site-packages/cssselect/xpath.py", line 192, in css_to_xpath
        for selector in parse(css))
      File "/usr/lib/python2.7/site-packages/cssselect/parser.py", line 355, in parse
        return list(parse_selector_group(stream))
      File "/usr/lib/python2.7/site-packages/cssselect/parser.py", line 370, in parse_selector_group
        yield Selector(*parse_selector(stream))
      File "/usr/lib/python2.7/site-packages/cssselect/parser.py", line 378, in parse_selector
        result, pseudo_element = parse_simple_selector(stream)
      File "/usr/lib/python2.7/site-packages/cssselect/parser.py", line 477, in parse_simple_selector
        "Expected selector, got %s" % (peek,))
    cssselect.parser.SelectorSyntaxError: Expected selector, got <DELIM '$' at 25>
2014-11-06 15:38:41+0530 [northshore] INFO: Closing spider (finished)
2014-11-06 15:38:41+0530 [northshore] INFO: Dumping Scrapy stats:
    {'downloader/request_bytes': 629,
     'downloader/request_count': 2,
     'downloader/request_method_count/GET': 2,
     'downloader/response_bytes': 4585,
     'downloader/response_count': 2,
     'downloader/response_status_count/200': 1,
     'downloader/response_status_count/302': 1,
     'finish_reason': 'finished',
     'finish_time': datetime.datetime(2014, 11, 6, 10, 8, 41, 327224),
     'log_count/DEBUG': 4,
     'log_count/ERROR': 1,
     'log_count/INFO': 7,
     'response_received_count': 1,
     'scheduler/dequeued': 2,
     'scheduler/dequeued/memory': 2,
     'scheduler/enqueued': 2,
     'scheduler/enqueued/memory': 2,
     'spider_exceptions/SelectorSyntaxError': 1,
     'start_time': datetime.datetime(2014, 11, 6, 10, 8, 39, 399328)}
2014-11-06 15:38:41+0530 [northshore] INFO: Spider closed (finished)
```
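Two separate problems show up in that log: the 302 redirect to `cmd=login&errorPg=ckreq` (the site bounces the crawler to a cookie-check/login page before serving any listing HTML, which is where Splash may be needed), and the SelectorSyntaxError from the `$` in the CSS selector. The selector part can be checked in isolation; a small sketch, assuming the same cssselect library that appears in the traceback:

```python
# standalone check of the '$' problem, independent of the crawl
from cssselect import GenericTranslator

gt = GenericTranslator()

# the original bare-class form fails exactly as in the traceback:
#   gt.css_to_xpath('div.divgbrHRS_CE_JO_EXT_I$0')  # SelectorSyntaxError: <DELIM '$' ...>

# the quoted attribute-value form parses, because '$' is legal inside a CSS string:
print gt.css_to_xpath('div[id="divgbrHRS_CE_JO_EXT_I$0"]')
```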
