http://stackoverflow.com/questions/26709432/my-script-is-pulling-low-jobs-count-please-let-me-know-how-to-crawl-all-jobs-us

I am very new to Scrapy. I am scraping a website that has anchor tags whose href attribute is href="javascript:void(0)". When such a link is clicked, a javascript function opens a page from which I need to fetch data. I used XPath and found the href for those particular anchor tags, but I am unable to execute an href that contains a javascript function. Can anyone tell me how to follow href="javascript:void(0)" anchor tags with Scrapy in Python? My HTML code is:
Note: I currently have CentOS + Python + Scrapy + Splash installed, and I am trying to get output from this link: http://www.northshore.org/careers/ (click the "Search & Apply" link there and the job list page opens). I need to get all jobs from that page, but currently I am getting a count of only 4 jobs; it is not scraping all of them. Please let me know how to crawl the full job count, and also please check whether there is any mistake in my script. Here is my spider script:

import re
import urlparse

from scrapy.http.request import Request
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.item import Item, Field


class googleSpiderItem(Item):
    Title = Field()
    JobDetailUrl = Field()
    Description = Field()
    CompanyName = Field()


class googleSpider(Spider):
    name = 'northshore'
    allowed_domains = ['northshore.org']
    start_urls = ['https://eapplicant.northshore.org/psc/psapp/EMPLOYEE/HRMS/c/HRS_HRAM.HRS_CE.GBL?&']

    def parse(self, response):
        selector = Selector(response)
        links = []
        # follow each job listing link on the search results page
        for link in selector.css('div.srs div.sr-content > a.title.heading.sr-title::attr(href)').extract():
            yield Request(urlparse.urljoin(response.url, link),
                          callback=self.parse_listing_page,
                          #meta={"use_splash": False}
                          )

        # paginate by bumping the st= offset in the URL by 10
        next_page_link = selector.css('div.pages > a:last-child:not(.disabled)')
        if next_page_link:
            def increment10(matchobj):
                return "st=" + str(int(matchobj.group("pagenum")) + 10)

            next_page_url = re.sub('st=(?P<pagenum>\d+)', increment10, response.url)
            print "next page:", next_page_url
            yield Request(next_page_url, self.parse,
                          #meta={"use_splash": True},
                          dont_filter=True)

    def parse_listing_page(self, response):
        selector = Selector(response)
        item = googleSpiderItem()
        item['CompanyName'] = "Google"
        item['JobDetailUrl'] = response.url
        item['Title'] = selector.xpath("//a[@class='heading detail-title']/span[@itemprop='name title']/text()").extract()
        item['Description'] = selector.xpath("string(//div[@itemprop='description'])").extract()
        yield item
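For reference, the commented-out use_splash flag above is the switch that the Splash middleware below checks in _needs_wrapping(). A minimal sketch of forcing the listing page through Splash that way (start_requests() is the standard Scrapy hook; whether this is the right fix is part of what I am asking):

    # Sketch only: this would go inside googleSpider. It routes the start URL
    # through Splash via the "use_splash" meta flag read by _needs_wrapping().
    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self.parse, meta={"use_splash": True})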
Here is my Splash middleware code:

import re

import w3lib.url
from scrapy import log, signals
from scrapy.http import Request
from scrapy.exceptions import NotConfigured

_matches = lambda url, regexs: any((r.search(url) for r in regexs))


class SplashMiddleware(object):
    url = 'http://localhost:8050/render.html'
    wait = 0.5
    url_pass = ()
    url_block = ()
    _settings = [
        'endpoint',
        'wait',
        'images',
        'js',
        'filters',
        'viewport',
    ]

    def __init__(self, crawler):
        self.crwlr_settings = crawler.settings
        self.crwlr_stats = crawler.stats

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('SPLASH_ENABLED'):
            raise NotConfigured
        o = cls(crawler)
        crawler.signals.connect(o.open_spider, signal=signals.spider_opened)
        return o

    def open_spider(self, spider):
        for k in self._settings:
            setattr(self, k, self._get_setting_value(spider, self.crwlr_settings, k))

        # check URL filters
        url_pass = self._get_setting_value(spider, self.crwlr_settings, 'url_pass')
        if url_pass:
            self.url_pass = [re.compile(x) for x in url_pass]
        url_block = self._get_setting_value(spider, self.crwlr_settings, 'url_block')
        if url_block:
            self.url_block = [re.compile(x) for x in url_block]

    def _get_setting_value(self, spider, settings, k):
        o = getattr(self, k, None)
        s = settings.get('SPLASH_' + k.upper(), o)
        return getattr(spider, 'splash_' + k, s)

    def _needs_wrapping(self, request):
        # already wrapped
        if request.meta.get("splashed_url", False):
            return False
        # force wrap or not
        use_splash = request.meta.get("use_splash", None)
        if use_splash is not None:
            return use_splash == True
        # check URL regexes
        if not self.url_pass and not self.url_block:
            return False
        if self.url_pass and not _matches(request.url, self.url_pass):
            return False
        if self.url_block and _matches(request.url, self.url_block):
            return False
        return True

    def process_request(self, request, spider):
        if self._needs_wrapping(request):
            self.crwlr_stats.inc_value('splash/wrapped', spider=spider)
            return self._wrap_url(request)

    def process_response(self, request, response, spider):
        if request.meta.get('splashed_url', False):
            self.crwlr_stats.inc_value('splash/unwrapped', spider=spider)
            return self._unwrap_url(request, response)
        else:
            return response

    def _wrap_url(self, request):
        print request.url
        wrapped = w3lib.url.add_or_replace_parameter(self.endpoint, 'url', request.url)
        print wrapped
        print
        # pass options
        wrapped = w3lib.url.add_or_replace_parameter(wrapped, 'wait', self.wait)
        if self.viewport:
            wrapped = w3lib.url.add_or_replace_parameter(wrapped, 'viewport', self.viewport)
        wrapped = w3lib.url.add_or_replace_parameter(wrapped, 'images', 1 if self.images else 0)
        if self.js:
            wrapped = w3lib.url.add_or_replace_parameter(wrapped, 'js', self.js)
        if self.filters:
            wrapped = w3lib.url.add_or_replace_parameter(wrapped, 'filters', self.filters)
        return request.replace(url=wrapped, meta={"splashed_url": request.url})

    def _unwrap_url(self, request, response):
        unwrapped = w3lib.url.url_query_parameter(request.url, 'url')
        response = response.replace(url=unwrapped)
        return response

Here is my output:

2014-11-03 12:46:55+0530 [scrapy] DEBUG: Web service listening on 127.0.0.1:6080
2014-11-03 12:46:56+0530 [northshore] DEBUG: Redirecting (302) to <GET https://eapplicant.northshore.org/psc/psapp/?cmd=login&errorPg=ckreq&languageCd=ENG> from <GET https://eapplicant.northshore.org/psc/psapp/EMPLOYEE/HRMS/c/HRS_HRAM.HRS_CE.GBL?&>
2014-11-03 12:46:57+0530 [northshore] DEBUG: Crawled (200) <GET https://eapplicant.northshore.org/psc/psapp/?cmd=login&errorPg=ckreq&languageCd=ENG> (referer: None)
2014-11-03 12:46:57+0530 [northshore] INFO: Closing spider (finished)
2014-11-03 12:46:57+0530 [northshore] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 629,
 'downloader/request_count': 2,
 'downloader/request_method_count/GET': 2,
 'downloader/response_bytes': 4585,
 'downloader/response_count': 2,
 'downloader/response_status_count/200': 1,
 'downloader/response_status_count/302': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2014, 11, 3, 7, 16, 57, 272612),
 'log_count/DEBUG': 4,
 'log_count/INFO': 7,
 'response_received_count': 1,
 'scheduler/dequeued': 2,
 'scheduler/dequeued/memory': 2,
 'scheduler/enqueued': 2,
 'scheduler/enqueued/memory': 2,
 'start_time': datetime.datetime(2014, 11, 3, 7, 16, 55, 871319)}
2014-11-03 12:46:57+0530 [northshore] INFO: Spider closed (finished)
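For completeness, the middleware reads its options as SPLASH_<NAME> settings through _get_setting_value(), so my settings.py has roughly the following (the "myproject.middlewares.SplashMiddleware" path and the 725 priority are just placeholders for wherever the middleware actually lives):

    # settings.py -- sketch of the settings SplashMiddleware expects
    SPLASH_ENABLED = True                                   # checked in from_crawler()
    SPLASH_ENDPOINT = 'http://localhost:8050/render.html'   # used as self.endpoint in _wrap_url()
    SPLASH_WAIT = 0.5
    SPLASH_IMAGES = 0                                       # 1 to let Splash load images

    DOWNLOADER_MIDDLEWARES = {
        'myproject.middlewares.SplashMiddleware': 725,      # placeholder module path
    }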

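P.S. To show what the pagination rewrite in parse() is doing, here it is run standalone on a made-up URL (the query string below is just an example, not the real listing URL):

    import re

    def increment10(matchobj):
        # same helper as in parse(): bump the st= offset by 10 per results page
        return "st=" + str(int(matchobj.group("pagenum")) + 10)

    url = "https://example.org/joblist?st=0&lang=ENG"   # made-up URL for illustration
    print re.sub(r'st=(?P<pagenum>\d+)', increment10, url)
    # prints: https://example.org/joblist?st=10&lang=ENG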