This is very hard to jump into. Can you host the HTML (as returned from Splash) somewhere, so we can run scrapy shell against it? In any case, I'm not sure the CSS selector engine supports :not(). Note that the SelectorSyntaxError in your log is actually raised earlier than that, at the "$" in div.divgbrHRS_CE_JO_EXT_I$0: "$" is not a valid character in a CSS class selector, so cssselect refuses to parse the query. Try XPath instead.
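Something like this should load the Splash-rendered page into scrapy shell for inspection (assuming Splash is listening on localhost:8050, as in your middleware; adjust the endpoint if yours runs elsewhere):

    scrapy shell 'http://localhost:8050/render.html?url=https://eapplicant.northshore.org/psc/psapp/EMPLOYEE/HRMS/c/HRS_HRAM.HRS_CE.GBL&wait=0.5'

And here is a rough, untested sketch of XPath equivalents for your two selectors. It uses contains(@class, ...) so the "$" stays inside a string literal, where XPath is happy with it. I'm guessing at the class names from your snippet, so verify them against the rendered HTML first:

    # Untested sketch: XPath versions of the two CSS selectors from the spider.
    # contains(@class, ...) keeps the '$' inside a string literal, which XPath
    # accepts, unlike the CSS parser.
    links = selector.xpath(
        "//div[contains(@class, 'divgbrHRS_CE_JO_EXT_I$0')]"
        "//tr[contains(@class, 'trHRS_CE_JO_EXT_I$0_row1')]"
        "//a[contains(@class, 'title') and contains(@class, 'heading')]"
        "/@href"
    ).extract()

    # "last <a> in the pager that does not have the 'disabled' class"
    next_page_link = selector.xpath(
        "//div[contains(@class, 'pages')]"
        "/a[last()][not(contains(@class, 'disabled'))]"
    )

One caveat: a[last()] is not a strict equivalent of a:last-child (it takes the last <a> among the <a> siblings rather than requiring the last child of the div to be an <a>), but for a pager it usually amounts to the same thing.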
On Thursday, November 6, 2014 at 11:20:18 UTC-2, Bolt Clock wrote:

I am practicing with Scrapy and want to ask a question about this page:

https://eapplicant.northshore.org/psc/psapp/EMPLOYEE/HRMS/c/HRS_HRAM.HRS_CE.GBL

Please let me know how to write the CSS selectors for the title, the description and the next-page link.

Language: Python + Scrapy + scrapinghub/splash

Spider code:

import re
import urlparse
from scrapy.http.request import Request
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.item import Item, Field


class NorthshoreSpiderItem(Item):
    Title = Field()
    Link = Field()
    Description = Field()
    Companyname = Field()


class NorthshoreSpider(Spider):
    name = 'northshore'
    allowed_domains = ['www.northshore.org']
    start_urls = ['https://eapplicant.northshore.org/psc/psapp/EMPLOYEE/HRMS/c/HRS_HRAM.HRS_CE.GBL?&']

    def parse(self, response):
        selector = Selector(response)
        links = []
        for link in selector.css('div.divgbrHRS_CE_JO_EXT_I$0 tr.trHRS_CE_JO_EXT_I$0_row1 a.title.heading.win0divPOSTINGTITLE-title::attr(href)').extract():
            yield Request(urlparse.urljoin(response.url, link),
                          callback=self.parse_listing_page,
                          #meta={"use_splash": False}
                          )

        next_page_link = selector.css('div.pages a:last-child:not(.disabled)')
        if next_page_link:
            def increment10(matchobj):
                return "st=" + str(int(matchobj.group("pagenum")) + 10)
            next_page_url = re.sub('st=(?P<pagenum>\d+)', increment10, response.url)
            print "next page:", next_page_url
            yield Request(next_page_url, self.parse,
                          #meta={"use_splash": True},
                          dont_filter=True)

    def parse_listing_page(self, response):
        selector = Selector(response)
        item = NorthshoreSpiderItem()
        item['Companyname'] = "Northshore"
        item['Link'] = response.url
        item['Title'] = selector.xpath("//span[@id='HRS_JO_WRK_POSTING_TITLE$0']/text()").extract()
        item['Description'] = selector.xpath("string(//div[@id='win0divHRS_JO_PST_DSCR$0'])").extract()
        yield item

Here is the Splash middleware code:

import re

from scrapy import log, signals
from scrapy.http import Request
from scrapy.exceptions import NotConfigured
import w3lib.url

_matches = lambda url, regexs: any((r.search(url) for r in regexs))


class SplashMiddleware(object):

    url = 'http://localhost:8050/render.html'
    wait = 0.5
    url_pass = ()
    url_block = ()

    _settings = [
        'endpoint',
        'wait',
        'images',
        'js',
        'filters',
        'viewport',
    ]

    def __init__(self, crawler):
        self.crwlr_settings = crawler.settings
        self.crwlr_stats = crawler.stats

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('SPLASH_ENABLED'):
            raise NotConfigured
        o = cls(crawler)
        crawler.signals.connect(o.open_spider, signal=signals.spider_opened)
        return o

    def open_spider(self, spider):
        for k in self._settings:
            setattr(self, k, self._get_setting_value(spider, self.crwlr_settings, k))
        # check URL filters
        url_pass = self._get_setting_value(spider, self.crwlr_settings, 'url_pass')
        if url_pass:
            self.url_pass = [re.compile(x) for x in url_pass]
        url_block = self._get_setting_value(spider, self.crwlr_settings, 'url_block')
        if url_block:
            self.url_block = [re.compile(x) for x in url_block]

    def _get_setting_value(self, spider, settings, k):
        o = getattr(self, k, None)
        s = settings.get('SPLASH_' + k.upper(), o)
        return getattr(spider, 'splash_' + k, s)

    def _needs_wrapping(self, request):
        # already wrapped
        if request.meta.get("splashed_url", False):
            return False

        # force wrap or not
        use_splash = request.meta.get("use_splash", None)
        if use_splash is not None:
            return use_splash == True

        # check URL regexes
        if not self.url_pass and not self.url_block:
            return False
        if self.url_pass and not _matches(request.url, self.url_pass):
            return False
        if self.url_block and _matches(request.url, self.url_block):
            return False

        return True

    def process_request(self, request, spider):
        if self._needs_wrapping(request):
            self.crwlr_stats.inc_value('splash/wrapped', spider=spider)
            return self._wrap_url(request)

    def process_response(self, request, response, spider):
        if request.meta.get('splashed_url', False):
            self.crwlr_stats.inc_value('splash/unwrapped', spider=spider)
            return self._unwrap_url(request, response)
        else:
            return response

    def _wrap_url(self, request):
        print request.url
        wrapped = w3lib.url.add_or_replace_parameter(self.endpoint, 'url', request.url)
        print wrapped
        print

        # pass options
        wrapped = w3lib.url.add_or_replace_parameter(wrapped, 'wait', self.wait)
        if self.viewport:
            wrapped = w3lib.url.add_or_replace_parameter(wrapped, 'viewport', self.viewport)
        wrapped = w3lib.url.add_or_replace_parameter(wrapped, 'images', 1 if self.images else 0)
        if self.js:
            wrapped = w3lib.url.add_or_replace_parameter(wrapped, 'js', self.js)
        if self.filters:
            wrapped = w3lib.url.add_or_replace_parameter(wrapped, 'filters', self.filters)

        return request.replace(url=wrapped, meta={"splashed_url": request.url})

    def _unwrap_url(self, request, response):
        unwrapped = w3lib.url.url_query_parameter(request.url, 'url')
        response = response.replace(url=unwrapped)
        return response

I am getting the following error output:

[josh@dpitstsvr015 NorthshoreorgDemo]$ scrapy crawl northshore
2014-11-06 15:38:39+0530 [scrapy] INFO: Scrapy 0.24.4 started (bot: NorthshoreorgDemo)
2014-11-06 15:38:39+0530 [scrapy] INFO: Optional features available: ssl, http11
2014-11-06 15:38:39+0530 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'NorthshoreorgDemo.spiders', 'SPIDER_MODULES': ['NorthshoreorgDemo.spiders'], 'BOT_NAME': 'NorthshoreorgDemo'}
2014-11-06 15:38:39+0530 [scrapy] INFO: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2014-11-06 15:38:39+0530 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2014-11-06 15:38:39+0530 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2014-11-06 15:38:39+0530 [scrapy] INFO: Enabled item pipelines:
2014-11-06 15:38:39+0530 [northshore] INFO: Spider opened
2014-11-06 15:38:39+0530 [northshore] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2014-11-06 15:38:39+0530 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2014-11-06 15:38:39+0530 [scrapy] DEBUG: Web service listening on 127.0.0.1:6080
2014-11-06 15:38:40+0530 [northshore] DEBUG: Redirecting (302) to <GET https://eapplicant.northshore.org/psc/psapp/?cmd=login&errorPg=ckreq&languageCd=ENG> from <GET https://eapplicant.northshore.org/psc/psapp/EMPLOYEE/HRMS/c/HRS_HRAM.HRS_CE.GBL?&>
2014-11-06 15:38:41+0530 [northshore] DEBUG: Crawled (200) <GET https://eapplicant.northshore.org/psc/psapp/?cmd=login&errorPg=ckreq&languageCd=ENG> (referer: None)
2014-11-06 15:38:41+0530 [northshore] ERROR: Spider error processing <GET https://eapplicant.northshore.org/psc/psapp/?cmd=login&errorPg=ckreq&languageCd=ENG>
	Traceback (most recent call last):
	  File "/usr/lib64/python2.7/site-packages/twisted/internet/base.py", line 824, in runUntilCurrent
	    call.func(*call.args, **call.kw)
	  File "/usr/lib64/python2.7/site-packages/twisted/internet/task.py", line 638, in _tick
	    taskObj._oneWorkUnit()
	  File "/usr/lib64/python2.7/site-packages/twisted/internet/task.py", line 484, in _oneWorkUnit
	    result = next(self._iterator)
	  File "/usr/lib64/python2.7/site-packages/scrapy/utils/defer.py", line 57, in <genexpr>
	    work = (callable(elem, *args, **named) for elem in iterable)
	--- <exception caught here> ---
	  File "/usr/lib64/python2.7/site-packages/scrapy/utils/defer.py", line 96, in iter_errback
	    yield next(it)
	  File "/usr/lib64/python2.7/site-packages/scrapy/contrib/spidermiddleware/offsite.py", line 26, in process_spider_output
	    for x in result:
	  File "/usr/lib64/python2.7/site-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
	    return (_set_referer(r) for r in result or ())
	  File "/usr/lib64/python2.7/site-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
	    return (r for r in result or () if _filter(r))
	  File "/usr/lib64/python2.7/site-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
	    return (r for r in result or () if _filter(r))
	  File "/home/sureshp/Downloads/NorthshoreorgDemo/NorthshoreorgDemo/spiders/northshore.py", line 23, in parse
	    for link in selector.css('div.divgbrHRS_CE_JO_EXT_I$0 tr.trHRS_CE_JO_EXT_I$0_row1 a.title.heading.win0divPOSTINGTITLE-title::attr(href)').extract():
	  File "/usr/lib64/python2.7/site-packages/scrapy/selector/unified.py", line 110, in css
	    return self.xpath(self._css2xpath(query))
	  File "/usr/lib64/python2.7/site-packages/scrapy/selector/unified.py", line 113, in _css2xpath
	    return self._csstranslator.css_to_xpath(query)
	  File "/usr/lib/python2.7/site-packages/cssselect/xpath.py", line 192, in css_to_xpath
	    for selector in parse(css))
	  File "/usr/lib/python2.7/site-packages/cssselect/parser.py", line 355, in parse
	    return list(parse_selector_group(stream))
	  File "/usr/lib/python2.7/site-packages/cssselect/parser.py", line 370, in parse_selector_group
	    yield Selector(*parse_selector(stream))
	  File "/usr/lib/python2.7/site-packages/cssselect/parser.py", line 378, in parse_selector
	    result, pseudo_element = parse_simple_selector(stream)
	  File "/usr/lib/python2.7/site-packages/cssselect/parser.py", line 477, in parse_simple_selector
	    "Expected selector, got %s" % (peek,))
	cssselect.parser.SelectorSyntaxError: Expected selector, got <DELIM '$' at 25>

2014-11-06 15:38:41+0530 [northshore] INFO: Closing spider (finished)
2014-11-06 15:38:41+0530 [northshore] INFO: Dumping Scrapy stats:
	{'downloader/request_bytes': 629,
	 'downloader/request_count': 2,
	 'downloader/request_method_count/GET': 2,
	 'downloader/response_bytes': 4585,
	 'downloader/response_count': 2,
	 'downloader/response_status_count/200': 1,
	 'downloader/response_status_count/302': 1,
	 'finish_reason': 'finished',
	 'finish_time': datetime.datetime(2014, 11, 6, 10, 8, 41, 327224),
	 'log_count/DEBUG': 4,
	 'log_count/ERROR': 1,
	 'log_count/INFO': 7,
	 'response_received_count': 1,
	 'scheduler/dequeued': 2,
	 'scheduler/dequeued/memory': 2,
	 'scheduler/enqueued': 2,
	 'scheduler/enqueued/memory': 2,
	 'spider_exceptions/SelectorSyntaxError': 1,
	 'start_time': datetime.datetime(2014, 11, 6, 10, 8, 39, 399328)}
2014-11-06 15:38:41+0530 [northshore] INFO: Spider closed (finished)
