Hello everyone,

I am new to Python and have been facing this error for a long time. I am attaching my code below; please go through it.

The full error is:
File "/usr/bin/scrapy", line 9, in <module>
load_entry_point('Scrapy==0.24.4', 'console_scripts', 'scrapy')()
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 143, in
execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 89, in
_run_print_help
func(*a, **kw)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 150, in
_run_command
cmd.run(args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/commands/crawl.py", line 58, in
run
spider = crawler.spiders.create(spname, **opts.spargs)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 48, in
create
return spcls(**spider_kwargs)
File
"/home/ubuntu/Desktop/python/timesdirectory/timesdirectory/spiders/seleniumtdurls.py",
line 22, in __init__
self.driver = webdriver.Remote("http://127.0.0.1:4444/wd/hub",
webdriver.DesiredCapabilities.HTMLUNITWITHJS)
File
"/usr/local/lib/python2.7/dist-packages/selenium/webdriver/remote/webdriver.py",
line 73, in __init__
self.start_session(desired_capabilities, browser_profile)
File
"/usr/local/lib/python2.7/dist-packages/selenium/webdriver/remote/webdriver.py",
line 121, in start_session
'desiredCapabilities': desired_capabilities,
File
"/usr/local/lib/python2.7/dist-packages/selenium/webdriver/remote/webdriver.py",
line 171, in execute
response = self.command_executor.execute(driver_command, params)
File
"/usr/local/lib/python2.7/dist-packages/selenium/webdriver/remote/remote_connection.py",
line 349, in execute
return self._request(command_info[0], url, body=data)
File
"/usr/local/lib/python2.7/dist-packages/selenium/webdriver/remote/remote_connection.py",
line 417, in _request
resp = opener.open(request)
File "/usr/lib/python2.7/urllib2.py", line 404, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 422, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1214, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1184, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 111] Connection refused>
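The failure happens at the webdriver.Remote(...) call in my spider's __init__, before any crawling starts. A minimal standalone snippet that triggers the same connection attempt (assuming, as in my spider, that a Selenium server should be listening on 127.0.0.1:4444) is:

from selenium import webdriver

# Same call as in seleniumtdurls.py; if nothing is listening on
# 127.0.0.1:4444, this raises the same "Connection refused" URLError.
driver = webdriver.Remote("http://127.0.0.1:4444/wd/hub",
                          webdriver.DesiredCapabilities.HTMLUNITWITHJS)
driver.quit()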
Hoping for a good response.

Thanks,
Charu
Here is my spider, seleniumtdurls.py:

# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.spider import BaseSpider
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

from timesdirectory.items import TimesdirectoryItem


class SeleniumtdurlsSpider(scrapy.Spider):
    name = "tdurls"
    allowed_domains = ["timesbusinessdirectory.com"]
    start_urls = [
        'http://www.timesbusinessdirectory.com/CompanyListings_MG.aspx?DirID=187&name=Company+Listings&mid=1276',
    ]

    def __init__(self):
        super(SeleniumtdurlsSpider, self).__init__()
        # Connect to a remote Selenium server; this is the line that raises
        # the "Connection refused" error in the traceback above.
        self.driver = webdriver.Remote("http://127.0.0.1:4444/wd/hub",
                                       webdriver.DesiredCapabilities.HTMLUNITWITHJS)

    def parse(self, response):
        self.driver.get(response.url)
        self.driver.implicitly_wait(10)
        item = TimesdirectoryItem()
        item['page'] = response.url
        finalurls = []
        while True:
            try:
                # Paging link in the company listing table; when it no longer
                # exists we are on the last page and the loop ends.
                next_link = self.driver.find_element_by_xpath(
                    './/table[@id="dgrdCompany"]/tr[12]/td/a')
                next_link.click()
                urls = self.driver.find_elements_by_xpath(
                    './/table[@id="dgrdCompany"]/tr/td/table/tr/td[1]/table/tr/td/a')
                for url in urls:
                    finalurls.append(url.get_attribute("href"))
            except NoSuchElementException:
                break
        item['urls'] = finalurls
        self.driver.close()
        return item
And here is my items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.item import Item, Field


class TimesdirectoryItem(scrapy.Item):
    # Fields populated by the spider.
    page = Field()
    urls = Field()
    pageurls = Field()
    title = Field()
    basic_info = Field()
    content_info = Field()
    categories = Field()