Hi all,
I'm having trouble developing a crawl spider. I want the spider to
extract every "a" tag it finds, follow each link, and then write the
response to disk and populate a list of items.
I had a basic spider working that used BeautifulSoup to get the "a"
tags, build an absolute URL, and save the title of the start page along
with each link's URL and text, but stepping up to the next stage, where
the spider actually follows the links, is proving a step too far for me.
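For reference, the single-page version I had working looked roughly
like this (reconstructed from memory, so treat the spider name and the
item field names as approximate):

import scrapy
from bs4 import BeautifulSoup
from librarycrawler.items import LibrarycrawlerItem

class BasicLibrarySpider(scrapy.Spider):
    name = "librarybasic"
    allowed_domains = ["ul.ie"]
    start_urls = ["http://www2.ul.ie/web/WWW/Services/Library"]

    def parse(self, response):
        soup = BeautifulSoup(response.body, "html.parser")
        start_page_title = soup.title.get_text()
        for link in soup.find_all("a"):
            item = LibrarycrawlerItem()
            item['title'] = start_page_title
            item['text'] = link.get_text()
            item['href'] = link.get('href')
            # urljoin turns a relative href into an absolute url
            item['abslink'] = response.urljoin(link.get('href'))
            yield item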
Can anyone help me out? Would the code below do what I expect, i.e.
follow all the URLs it finds, write each response to disk, and store
the metadata details in items?
Michael
import scrapy
import os
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
from librarycrawler.items import LibrarycrawlerItem

class LibraryCrawlerSpider(scrapy.Spider):
    name = "librarycrawler"
    allowed_domains = ["ul.ie"]
    start_urls = [
        "http://www2.ul.ie/web/WWW/Services/Library"
    ]

    # rules = (
    #     # Extract links matching 'category.php' (but not 'subsection.php')
    #     # and follow links from them (no callback means follow=True by default).
    #     Rule(LinkExtractor(allow=('category\.php',), deny=('subsection\.php',))),
    #     # Extract links matching 'item.php' and parse them with parse_item.
    #     Rule(LinkExtractor(allow=('item\.php',)), callback='parse_item'),
    # )
    def parse(self, response):
        # Could probably use a SoupStrainer so only "a" tags get parsed,
        # but I'm still getting to grips with it.
        counter = 1
        soup = BeautifulSoup(response.body, "html.parser")
        for link in soup.find_all("a"):
            # Version 3: get every url and follow it.
            href = link.get('href')
            if not href:
                continue
            url = response.urljoin(href)
            # Request() treats its second positional argument as the callback,
            # so extra data for the callback travels in the meta dict instead.
            yield scrapy.Request(url, callback=self.parse_page, meta={
                'counter': counter,
                'link_text': link.get_text(),
                'link_href': href,
            })
            counter += 1
    def parse_page(self, response):
        # The callback only receives the response; the values stashed in
        # parse() come back out of response.meta.
        page_soup = BeautifulSoup(response.body, "html.parser")
        if not os.path.isdir("ExtractedText"):
            os.makedirs("ExtractedText")
        filename = os.path.join("ExtractedText", str(response.meta['counter']) + ".html")
        item = LibrarycrawlerItem()
        item['title'] = page_soup.title.get_text()
        item['text'] = response.meta['link_text']
        item['href'] = response.meta['link_href']
        item['abslink'] = response.url
        with open(filename, 'wb') as f:
            f.write(response.body)
        yield item
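
For completeness, this is what I think the rules-based version would
look like if I switched to scrapy.spiders.CrawlSpider instead of
following the links by hand (untested, so I may well be misusing
LinkExtractor):

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from librarycrawler.items import LibrarycrawlerItem

class LibraryRulesSpider(CrawlSpider):
    name = "libraryrules"
    allowed_domains = ["ul.ie"]
    start_urls = ["http://www2.ul.ie/web/WWW/Services/Library"]

    # Follow every link and call parse_item on each fetched response.
    # (With CrawlSpider you must not override parse itself.)
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Note: this way I seem to lose the anchor text and original href,
        # which is partly why I tried doing the link-following by hand.
        item = LibrarycrawlerItem()
        item['title'] = response.xpath('//title/text()').extract_first()
        item['abslink'] = response.url
        yield item

Either way I've been checking the output with
scrapy crawl librarycrawler -o items.json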