The following code should do the job. Hope it helps.

import re
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from pagitest.items import PagitestItem


class InfojobsSpider(CrawlSpider):
    USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; rv:29.0) Gecko/20100101 "
                  "Firefox/29.0")
    name = "info"
    allowed_domains = ["infosec.co.uk"]
    start_urls = [
        "http://www.infosec.co.uk/exhibitor-directory/",
    ]
    rules = (
        # note the escaped "\?": an unescaped "?" is a regex quantifier
        # and would never match the literal "?" in the query string
        Rule(SgmlLinkExtractor(allow=(r'/en/exhibitor-directory/\?startRecord=\d+&rpp=\d+',)),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//div[@class="listItemDetail exhibitorDetail"]'
                           '/h3[@class="name"]/a')
        for lnk in links:
            # extract() returns a list, so take the first match
            link = lnk.select('@href').extract()[0]
            if not re.match('^http', link):
                link = "http://www.infosec.co.uk" + link
            yield Request(url=link,
                          meta={"sitename": lnk.select('text()').extract()},
                          callback=self.getwebsitename)

    def getwebsitename(self, response):
        websitename = response.meta['sitename']
        hxs = HtmlXPathSelector(response)
        websites = hxs.select('//li[@class="web"]/a/@href').extract()
        # it's better to return PagitestItem instances instead of a dict:
        return {"sitename": websitename, "links": websites}

On Thursday, April 17, 2014 at 07:50:58 UTC+3, masroor javed wrote:
>
> Yes, I know, but those links can be extracted with a simple XPath
> expression. I just want to hit all these links, get the website name, and
> then come back to the first page to get the link name and stand name.
> That means: the first page has 12 links, so I have to extract each link
> name and stand name, then hit the links one by one and get the website
> name.
> Three values in total: titlename, standname and websitename.
> I have attached an image in which I marked the titlename and standname.
>
>
>
> On Thu, Apr 17, 2014 at 3:04 AM, Svyatoslav Sydorenko <
> [email protected]> wrote:
>
>> Then just yield a new Request instead of returning the URL.
>>
>> BTW, you should also avoid the double loop. It's possible to extract all
>> the links with a single XPath expression:
>> //div[@class="listItemDetail exhibitorDetail"]/h3[@class="name"]/a/@href
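>>
>> For example, a rough sketch inside parse_item:
>>
>> hxs = HtmlXPathSelector(response)
>> links = hxs.select('//div[@class="listItemDetail exhibitorDetail"]'
>>                    '/h3[@class="name"]/a/@href').extract()
>> # links is now a flat list of href strings, no nested loops needed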
>>
>> P.S. If I understand you right, you may also let Scrapy crawl all the
>> links itself instead of implementing it manually.
>>
>> On Wednesday, April 16, 2014 at 12:34:19 UTC+3, masroor javed wrote:
>>>
>>> Hi Svyatoslav, I just want to return all the website names from the
>>> getwebsitename function to yield Request(url=titleurls,
>>> callback=self.getwebsitename)
>>>
>>>
>>>  On Wed, Apr 16, 2014 at 2:22 PM, Svyatoslav Sydorenko <
>>> [email protected]> wrote:
>>>
>>>>  
>>>> - yield Request(url=titleurls, callback=self.getwebsitename)
>>>> + yield Request(url=titleurls, meta={"titlename": some_titlename,
>>>> +               "standnumber": some_standnumber}, callback=self.getwebsitename)
>>>>
>>>> and in getwebsitename you may just access the response.meta dict:
>>>> http://doc.scrapy.org/en/latest/topics/request-response.html?highlight=meta#scrapy.http.Response.meta
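>>>>
>>>> A minimal sketch of the receiving callback (the key names match the
>>>> meta dict above):
>>>>
>>>> def getwebsitename(self, response):
>>>>     # values passed via meta= on the Request are available here
>>>>     titlename = response.meta['titlename']
>>>>     standnumber = response.meta['standnumber']
>>>>     hxs = HtmlXPathSelector(response)
>>>>     websites = hxs.select('//li[@class="web"]/a/@href').extract()
>>>>     return {"titlename": titlename, "standnumber": standnumber,
>>>>             "websites": websites}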
>>>>
>>>> On Tuesday, April 15, 2014 at 14:14:32 UTC+3, masroor javed wrote:
>>>>
>>>>> Hi,
>>>>>
>>>>> I am new to Scrapy.
>>>>> I just want to know how to call a function and get two or three
>>>>> values back in return.
>>>>> I have a spider; please let me know how to solve this.
>>>>>
>>>>> Steps:
>>>>> 1. I want to scrape all the page links, with pagination, and the stand number.
>>>>> 2. Hit all the links and extract the website URL.
>>>>> 3. There should be 3 values in total: titlename, standnumber and website URL.
>>>>>
>>>>> My spider code is:
>>>>>
>>>>> import re
>>>>> import sys
>>>>> import unicodedata
>>>>> from string import join
>>>>> from scrapy.contrib.spiders import CrawlSpider, Rule
>>>>> from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
>>>>> from scrapy.selector import HtmlXPathSelector
>>>>> from scrapy.http import Request
>>>>> from pagitest.items import PagitestItem
>>>>> from urlparse import urlparse
>>>>> from urlparse import urljoin
>>>>>
>>>>> class InfojobsSpider(CrawlSpider):
>>>>>     USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; rv:29.0) Gecko/20100101 "
>>>>>                   "Firefox/29.0")
>>>>>     name = "info"
>>>>>     allowed_domains = ["infosec.co.uk"]
>>>>>     start_urls = [
>>>>>         "http://www.infosec.co.uk/exhibitor-directory/",
>>>>>     ]
>>>>>     rules = (
>>>>>         Rule(SgmlLinkExtractor(allow=(r'exhibitor\W+directory'),
>>>>>                                restrict_xpaths=('//li[@class="gButton"]/a')),
>>>>>              callback='parse_item', follow=True),
>>>>>     )
>>>>>
>>>>>     def parse_item(self, response):
>>>>>         items = []
>>>>>         hxs = HtmlXPathSelector(response)
>>>>>         data = hxs.select('//div[@class="listItemDetail exhibitorDetail"]')
>>>>>         for titlename in data:
>>>>>             titleurl = titlename.select('h3[@class="name"]/a/@href').extract()
>>>>>             for titleurls in titleurl:
>>>>>                 preg = re.match('^http', titleurls)
>>>>>                 if preg:
>>>>>                     titleurls = titleurls
>>>>>                 else:
>>>>>                     titleurls = "http://www.infosec.co.uk" + titleurls
>>>>>                 yield Request(url=titleurls, callback=self.getwebsitename)
>>>>>
>>>>>     def getwebsitename(self, response):
>>>>>         hxs = HtmlXPathSelector(response)
>>>>>         websites = hxs.select('//li[@class="web"]/a/@href').extract()
>>>>>         for websitename in websites:
>>>>>             return websites
>>>>>
>>>
>
>
