OK, I'm at a stopping point; I've been at this for going on 24 hours and I'm cross-eyed and need rest. I think I'm close, but no cookie yet. It's probably a stupid error on my part, but I'm going to post what I've got so far. Since I'm making use of scrapyjs, here is my setup:
bash-3.2$ *docker run -p 8050:8050 scrapinghub/splash*

*settings.py*

# -*- coding: utf-8 -*-

# Scrapy settings for tm project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'tm'

SPIDER_MODULES = ['tm.spiders']
NEWSPIDER_MODULE = 'tm.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tm (+http://www.yourdomain.com)'

SPLASH_URL = 'http://192.168.59.103:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapyjs.SplashMiddleware': 725,
}

DUPEFILTER_CLASS = 'scrapyjs.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapyjs.SplashAwareFSCacheStorage'

*tm_spider.py*

import json

import scrapy
from scrapy.http.headers import Headers

RENDER_HTML_URL = "http://127.0.0.1:8050/render.html"


class TmSpider(scrapy.Spider):
    name = "tm"
    allowed_domains = ["tick***aster.com"]
    start_urls = ["http://www.tick***aster.com"]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse, meta={
                'splash': {
                    'endpoint': 'render.html',
                    'args': {'wait': 0.5}
                }
            })

    def parse(self, response):
        yield Request(url, self.parse_result, meta={
            'splash': {
                'args': {
                    # set rendering arguments here
                    'html': 1,
                    'png': 1,
                    # 'url' is prefilled from request url
                },
                # optional parameters
                'endpoint': 'render.json',  # optional; default is render.json
                'splash_url': '<url>',      # overrides SPLASH_URL
                'slot_policy': scrapyjs.SlotPolicy.PER_DOMAIN,
            }
        })
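(A note on the above: as far as I can tell RENDER_HTML_URL isn't actually used anywhere, and the NameError in the output below is probably just my parse method. I pasted the render.json example from the scrapyjs README into it, but Request is never imported and url isn't defined there either. My untested guess at a fix is to let start_requests keep routing the request through Splash and have parse simply work on the rendered HTML, something like the sketch below; tm_rendered.html is just a throwaway filename I picked so I can eyeball the result.)

    def parse(self, response):
        # Untested sketch: start_requests already sends this request through
        # Splash's render.html endpoint, so response.body here should be the
        # javascript-rendered page rather than the raw one.
        self.log("rendered page is %d bytes" % len(response.body))
        with open('tm_rendered.html', 'wb') as f:
            f.write(response.body)

Here's the run that shows the error: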
*scrapy crawl tm*

pawnbahnimac:spiders pawnbahn$ scrapy crawl tm
:0: UserWarning: You do not have a working installation of the service_identity module: 'No module named service_identity'. Please install it from <https://pypi.python.org/pypi/service_identity> and make sure all of its dependencies are satisfied. Without the service_identity module and a recent enough pyOpenSSL to support it, Twisted can perform only rudimentary TLS client hostname verification. Many valid certificate/hostname mappings may be rejected.
2015-04-02 17:16:21-0500 [scrapy] INFO: Scrapy 0.24.5 started (bot: tm)
2015-04-02 17:16:21-0500 [scrapy] INFO: Optional features available: ssl, http11
2015-04-02 17:16:21-0500 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'tm.spiders', 'SPIDER_MODULES': ['tm.spiders'], 'DUPEFILTER_CLASS': 'scrapyjs.SplashAwareDupeFilter', 'HTTPCACHE_STORAGE': 'scrapyjs.SplashAwareFSCacheStorage', 'BOT_NAME': 'tm'}
2015-04-02 17:16:21-0500 [scrapy] INFO: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2015-04-02 17:16:21-0500 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, SplashMiddleware, ChunkedTransferMiddleware, DownloaderStats
2015-04-02 17:16:21-0500 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2015-04-02 17:16:21-0500 [scrapy] INFO: Enabled item pipelines:
2015-04-02 17:16:21-0500 [tm] INFO: Spider opened
2015-04-02 17:16:21-0500 [tm] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-04-02 17:16:21-0500 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2015-04-02 17:16:21-0500 [scrapy] DEBUG: Web service listening on 127.0.0.1:6080
2015-04-02 17:16:23-0500 [tm] DEBUG: Crawled (200) <POST http://192.168.59.103:8050/render.html> (referer: None)
2015-04-02 17:16:23-0500 [tm] ERROR: Spider error processing <POST http://192.168.59.103:8050/render.html>
    Traceback (most recent call last):
      File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 824, in runUntilCurrent
        call.func(*call.args, **call.kw)
      File "/usr/local/lib/python2.7/site-packages/twisted/internet/task.py", line 638, in _tick
        taskObj._oneWorkUnit()
      File "/usr/local/lib/python2.7/site-packages/twisted/internet/task.py", line 484, in _oneWorkUnit
        result = next(self._iterator)
      File "/usr/local/lib/python2.7/site-packages/scrapy/utils/defer.py", line 57, in <genexpr>
        work = (callable(elem, *args, **named) for elem in iterable)
    --- <exception caught here> ---
      File "/usr/local/lib/python2.7/site-packages/scrapy/utils/defer.py", line 96, in iter_errback
        yield next(it)
      File "/usr/local/lib/python2.7/site-packages/scrapy/contrib/spidermiddleware/offsite.py", line 26, in process_spider_output
        for x in result:
      File "/usr/local/lib/python2.7/site-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
        return (_set_referer(r) for r in result or ())
      File "/usr/local/lib/python2.7/site-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
        return (r for r in result or () if _filter(r))
      File "/usr/local/lib/python2.7/site-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
        return (r for r in result or () if _filter(r))
      File "/Users/pawnbahn/tm/tm/spiders/tm_spider.py", line 23, in parse
        yield Request(url, self.parse_result, meta={
    exceptions.NameError: global name 'Request' is not defined
2015-04-02 17:16:23-0500 [tm] INFO: Closing spider (finished)
2015-04-02 17:16:23-0500 [tm] INFO: Dumping Scrapy stats:
    {'downloader/request_bytes': 308,
     'downloader/request_count': 1,
     'downloader/request_method_count/POST': 1,
     'downloader/response_bytes': 470190,
     'downloader/response_count': 1,
     'downloader/response_status_count/200': 1,
     'finish_reason': 'finished',
     'finish_time': datetime.datetime(2015, 4, 2, 22, 16, 23, 234286),
     'log_count/DEBUG': 3,
     'log_count/ERROR': 1,
     'log_count/INFO': 7,
     'response_received_count': 1,
     'scheduler/dequeued': 2,
     'scheduler/dequeued/memory': 2,
     'scheduler/enqueued': 2,
     'scheduler/enqueued/memory': 2,
     'spider_exceptions/NameError': 1,
     'splash/render.html/request_count': 1,
     'splash/render.html/response_count/200': 1,
     'start_time': datetime.datetime(2015, 4, 2, 22, 16, 21, 216878)}
2015-04-02 17:16:23-0500 [tm] INFO: Spider closed (finished)

On Thursday, April 2, 2015 at 4:13:11 PM UTC-5, Troy Perkins wrote:
>
> This looks promising, I'll let everyone know if it works.
> https://github.com/scrapinghub/scrapyjs
>
> On Thursday, April 2, 2015 at 3:48:06 PM UTC-5, Troy Perkins wrote:
>>
>> I've been Googling around all day on how to scrape a javascript page with scrapy. I think that's the issue. From what I've found, Scrapy doesn't support parsing javascript, and making use of Selenium is the only workaround. That's too much overhead for what I'm wanting to do... oh well. Hoping to find another solution. Thanks for your help Travis, it was greatly appreciated.
>>
>> On Thursday, April 2, 2015 at 3:03:37 PM UTC-5, Travis Leleu wrote:
>>>
>>> Your recent debug output doesn't have that error, so you must have fixed it.
>>>
>>> The current error feels like it's either a javascript-loaded page, or you're getting blocked from scraping by the server.
>>>
>>> Google around for how to scrape a javascript page with scrapy, and using a proxy. Those guides will be your friend.
>>>
>>> On Thu, Apr 2, 2015 at 12:58 PM, Troy Perkins <[email protected]> wrote:
>>>
>>>> Hi Travis, thanks for the response. Not sure why it's not able to find it, it's there, see below:
>>>>
>>>> pawnbahnimac:spiders pawnbahn$ pwd
>>>> /Users/pawnbahn/tm/tm/spiders
>>>> pawnbahnimac:spiders pawnbahn$ ls
>>>> Books  Resources  __init__.py  __init__.pyc  items.json  tm_spider.py  tm_spider.pyc
>>>> pawnbahnimac:spiders pawnbahn$
>>>>
>>>> It only behaves like this on this site for some reason. Running the dmoz example works fine.
>>>>
>>>> pawnbahnimac:spiders pawnbahn$ scrapy crawl tm
>>>> :0: UserWarning: You do not have a working installation of the service_identity module: 'No module named service_identity'. Please install it from <https://pypi.python.org/pypi/service_identity> and make sure all of its dependencies are satisfied. Without the service_identity module and a recent enough pyOpenSSL to support it, Twisted can perform only rudimentary TLS client hostname verification. Many valid certificate/hostname mappings may be rejected.
>>>> 2015-04-02 14:56:01-0500 [scrapy] INFO: Scrapy 0.24.5 started (bot: tm)
>>>> 2015-04-02 14:56:01-0500 [scrapy] INFO: Optional features available: ssl, http11
>>>> 2015-04-02 14:56:01-0500 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'tm.spiders', 'SPIDER_MODULES': ['tm.spiders'], 'BOT_NAME': 'tm'}
>>>> 2015-04-02 14:56:01-0500 [scrapy] INFO: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
>>>> 2015-04-02 14:56:01-0500 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
>>>> 2015-04-02 14:56:01-0500 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
>>>> 2015-04-02 14:56:01-0500 [scrapy] INFO: Enabled item pipelines:
>>>> 2015-04-02 14:56:01-0500 [tm] INFO: Spider opened
>>>> 2015-04-02 14:56:01-0500 [tm] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
>>>> 2015-04-02 14:56:01-0500 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
>>>> 2015-04-02 14:56:01-0500 [scrapy] DEBUG: Web service listening on 127.0.0.1:6080
>>>> 2015-04-02 14:56:01-0500 [tm] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/> (referer: None)
>>>> 2015-04-02 14:56:01-0500 [tm] INFO: Closing spider (finished)
>>>> 2015-04-02 14:56:01-0500 [tm] INFO: Dumping Scrapy stats:
>>>>     {'downloader/request_bytes': 260,
>>>>      'downloader/request_count': 1,
>>>>      'downloader/request_method_count/GET': 1,
>>>>      'downloader/response_bytes': 6234,
>>>>      'downloader/response_count': 1,
>>>>      'downloader/response_status_count/200': 1,
>>>>      'finish_reason': 'finished',
>>>>      'finish_time': datetime.datetime(2015, 4, 2, 19, 56, 1, 861714),
>>>>      'log_count/DEBUG': 3,
>>>>      'log_count/INFO': 7,
>>>>      'response_received_count': 1,
>>>>      'scheduler/dequeued': 1,
>>>>      'scheduler/dequeued/memory': 1,
>>>>      'scheduler/enqueued': 1,
>>>>      'scheduler/enqueued/memory': 1,
>>>>      'start_time': datetime.datetime(2015, 4, 2, 19, 56, 1, 494696)}
>>>> 2015-04-02 14:56:01-0500 [tm] INFO: Spider closed (finished)
>>>>
>>>> On Thursday, April 2, 2015 at 11:30:41 AM UTC-5, Travis Leleu wrote:
>>>>>
>>>>> Python can't find the file whose path is stored in filename, used in line 13 of your spider. Read your scrapy debug output to find out more information.
>>>>>
>>>>>   File "/Users/pawnbahn/tm/tm/spiders/tm_spider.py", line 13, in parse
>>>>>     with open(filename, 'wb') as f:
>>>>> exceptions.IOError: [Errno 2] No such file or directory: ''
>>>>>
>>>>> On Wed, Apr 1, 2015 at 10:38 PM, Troy Perkins <[email protected]> wrote:
>>>>>
>>>>>> Greetings all:
>>>>>>
>>>>>> I'm new to scrapy and managed to get everything installed and working. However, my simple test project has proven not so simple, at least for me.
>>>>>>
>>>>>> I simply want to request the home page of t 1 c k e t m a s t e r d o t c o m, click the red Just Announced tab down the middle of the page, and output (-o) the list of results to an email address once a day via cron. I want to be able to keep up with the announcements because their mailing lists simply don't send them soon enough.
>>>>>>
>>>>>> Here is my starting spider, which I've tested with other sites and it works fine. I believe the error is due to it being a javascript-rendered site. I've used Firebug to look for clues, but I'm too new at this to understand it, as well as too new to javascript. I'm hoping someone would be willing to point this noob in a direction. I've also tried removing middleware in the settings.py file, with the same results.
>>>>>>
>>>>>> I've purposely masked out the site address; though I don't mean any harm, I'm not quite sure of their ToS as of yet. I plan to poll once a day anyway, for personal use.
>>>>>>
>>>>>> import scrapy
>>>>>>
>>>>>> from tm.items import TmItem
>>>>>>
>>>>>> class TmSpider(scrapy.Spider):
>>>>>>     name = "tm"
>>>>>>     allowed_domains = ["www.************.com"]
>>>>>>     start_urls = [
>>>>>>         "http://www.***********.com"
>>>>>>     ]
>>>>>>
>>>>>>     def parse(self, response):
>>>>>>         filename = response.url.split("/")[-2]
>>>>>>         with open(filename, 'wb') as f:
>>>>>>             f.write(response.body)
>>>>>>
>>>>>> scrapy crawl tm results in the following:
>>>>>>
>>>>>> :0: UserWarning: You do not have a working installation of the service_identity module: 'No module named service_identity'. Please install it from <https://pypi.python.org/pypi/service_identity> and make sure all of its dependencies are satisfied. Without the service_identity module and a recent enough pyOpenSSL to support it, Twisted can perform only rudimentary TLS client hostname verification. Many valid certificate/hostname mappings may be rejected.
>>>>>> 2015-04-02 00:30:12-0500 [scrapy] INFO: Scrapy 0.24.5 started (bot: tm)
>>>>>> 2015-04-02 00:30:12-0500 [scrapy] INFO: Optional features available: ssl, http11
>>>>>> 2015-04-02 00:30:12-0500 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'tm.spiders', 'SPIDER_MODULES': ['tm.spiders'], 'BOT_NAME': 'tm'}
>>>>>> 2015-04-02 00:30:12-0500 [scrapy] INFO: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
>>>>>> 2015-04-02 00:30:12-0500 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
>>>>>> 2015-04-02 00:30:12-0500 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
>>>>>> 2015-04-02 00:30:12-0500 [scrapy] INFO: Enabled item pipelines:
>>>>>> 2015-04-02 00:30:12-0500 [tm] INFO: Spider opened
>>>>>> 2015-04-02 00:30:12-0500 [tm] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
>>>>>> 2015-04-02 00:30:12-0500 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
>>>>>> 2015-04-02 00:30:12-0500 [scrapy] DEBUG: Web service listening on 127.0.0.1:6080
>>>>>> 2015-04-02 00:30:13-0500 [tm] DEBUG: Crawled (200) <GET http://www.****************com> (referer: None)
>>>>>> 2015-04-02 00:30:13-0500 [tm] ERROR: Spider error processing <GET http://www.****************.com>
>>>>>>     Traceback (most recent call last):
>>>>>>       File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 1201, in mainLoop
>>>>>>         self.runUntilCurrent()
>>>>>>       File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 824, in runUntilCurrent
>>>>>>         call.func(*call.args, **call.kw)
>>>>>>       File "/usr/local/lib/python2.7/site-packages/twisted/internet/defer.py", line 383, in callback
>>>>>>         self._startRunCallbacks(result)
>>>>>>       File "/usr/local/lib/python2.7/site-packages/twisted/internet/defer.py", line 491, in _startRunCallbacks
>>>>>>         self._runCallbacks()
>>>>>>     --- <exception caught here> ---
>>>>>>       File "/usr/local/lib/python2.7/site-packages/twisted/internet/defer.py", line 578, in _runCallbacks
>>>>>>         current.result = callback(current.result, *args, **kw)
>>>>>>       File "/Users/pawnbahn/tm/tm/spiders/tm_spider.py", line 13, in parse
>>>>>>         with open(filename, 'wb') as f:
>>>>>>     exceptions.IOError: [Errno 2] No such file or directory: ''
>>>>>> 2015-04-02 00:30:13-0500 [tm] INFO: Closing spider (finished)
>>>>>> 2015-04-02 00:30:13-0500 [tm] INFO: Dumping Scrapy stats:
>>>>>>     {'downloader/request_bytes': 219,
>>>>>>      'downloader/request_count': 1,
>>>>>>      'downloader/request_method_count/GET': 1,
>>>>>>      'downloader/response_bytes': 73266,
>>>>>>      'downloader/response_count': 1,
>>>>>>      'downloader/response_status_count/200': 1,
>>>>>>      'finish_reason': 'finished',
>>>>>>      'finish_time': datetime.datetime(2015, 4, 2, 5, 30, 13, 3001),
>>>>>>      'log_count/DEBUG': 3,
>>>>>>      'log_count/ERROR': 1,
>>>>>>      'log_count/INFO': 7,
>>>>>>      'response_received_count': 1,
>>>>>>      'scheduler/dequeued': 1,
>>>>>>      'scheduler/dequeued/memory': 1,
>>>>>>      'scheduler/enqueued': 1,
>>>>>>      'scheduler/enqueued/memory': 1,
>>>>>>      'spider_exceptions/IOError': 1,
>>>>>>      'start_time': datetime.datetime(2015, 4, 2, 5, 30, 12, 344868)}
>>>>>> 2015-04-02 00:30:13-0500 [tm] INFO: Spider closed (finished)
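PS: On the IOError in the oldest quoted message above ([Errno 2] No such file or directory: ''), I'm fairly sure filename came out empty because response.url.split("/")[-2] is '' for a bare homepage URL (it's the empty piece between the two slashes of "http://"). Here's a rough, untested sketch of a safer way to build the filename; the fallback name is just something I made up:

    def parse(self, response):
        # Untested sketch: derive the filename from the hostname instead of
        # splitting the URL on "/", which yields '' for "http://www.example.com".
        from urlparse import urlparse  # Python 2 stdlib
        filename = urlparse(response.url).netloc or 'homepage.html'
        with open(filename, 'wb') as f:
            f.write(response.body)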
