I'm trying to scrape images off a site using the default ImagePipeline 
scrapy has. However, I encounter this error.

    Traceback (most recent call last):
  File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/middleware.py",
 line 62, in _process_chain
    return process_chain(self.methods[methodname], obj, *args)
  File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/utils/defer.py",
 line 65, in process_chain
    d.callback(input)
  File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/twisted/internet/defer.py",
 line 383, in callback
    self._startRunCallbacks(result)
  File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/twisted/internet/defer.py",
 line 491, in _startRunCallbacks
    self._runCallbacks()
--- <exception caught here> ---
  File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/twisted/internet/defer.py",
 line 578, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/contrib/pipeline/media.py",
 line 40, in process_item
    requests = arg_to_iter(self.get_media_requests(item, info))
  File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/contrib/pipeline/images.py",
 line 104, in get_media_requests
    return [Request(x) for x in item.get(self.IMAGES_URLS_FIELD, [])]
  File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/http/request/__init__.py",
 line 26, in __init__
    self._set_url(url)
  File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/http/request/__init__.py",
 line 57, in _set_url
    self._set_url(url.encode(self.encoding))
  File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/http/request/__init__.py",
 line 61, in _set_url
    raise ValueError('Missing scheme in request url: %s' % self._url)
exceptions.ValueError: Missing scheme in request url: h

My code:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from extrapetite.items import ExtrapetiteItem


class ExtrapetiteCsSpider(CrawlSpider):
    name = 'extrapetite_cs'
    allowed_domains = ['www.extrapetite.com']
    start_urls = ['http://www.extrapetite.com/']

    rules = [
        Rule(
            LinkExtractor(allow=['/search/label/Lookbook']),
            callback='parse_link', 
            follow=True
            )
    ]

    def parse_link(self, response):
        for thing in 
response.xpath('//*[@id="Blog1"]/div[1]/div[position()>2]/div/div/div/div[3]/a/@href'):
            request = scrapy.Request(thing.extract(), callback=self.parse_img) 
            yield request



    def parse_img(self,response):
        for thing in 
response.xpath('//*[@id="Blog1"]/div[1]/div/div/div[1]/div[1]/div[2]/a[position()>0]/img'):
            item = ExtrapetiteItem()
            item['image_urls'] = thing.xpath('@src').extract()[0]
            item['url'] = response.url
            item['desc'] = thing.xpath('@alt').extract()[0]
            yield item

My settings:

# -*- coding: utf-8 -*-

# Scrapy settings for extrapetite project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'extrapetite'

SPIDER_MODULES = ['extrapetite.spiders']
NEWSPIDER_MODULE = 'extrapetite.spiders'

FEED_URI = 'logs/%(time)s.csv'
FEED_FORMAT = 'csv'

ITEM_PIPELINES = {
    'scrapy.contrib.pipeline.images.ImagesPipeline': 1
}

IMAGES_STORE = '/Users/crescal/compsci/ggslh/extrapetite/images/'
IMAGES_EXPIRES = 90


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'extrapetite (+http://www.yourdomain.com)'

Without the ImagePipeline enabled, I am able to crawl the site and obtain 
the image_url, url, and desc.

Also, when I try to view the link using scrapy view
https://farm9.staticflickr.com/8802/16977938339_20f05dc232_o.jpg

I get this error

    2015-05-08 13:31:37+0800 [default] DEBUG: Crawled (200) <GET 
https://farm9.staticflickr.com/8802/16977938339_20f05dc232_o.jpg> (referer: 
None)
2015-05-08 13:31:37+0800 [default] ERROR: Spider error processing <GET 
https://farm9.staticflickr.com/8802/16977938339_20f05dc232_o.jpg>
    Traceback (most recent call last):
      File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/twisted/internet/base.py",
 line 1201, in mainLoop
        self.runUntilCurrent()
      File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/twisted/internet/base.py",
 line 824, in runUntilCurrent
        call.func(*call.args, **call.kw)
      File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/twisted/internet/defer.py",
 line 383, in callback
        self._startRunCallbacks(result)
      File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/twisted/internet/defer.py",
 line 491, in _startRunCallbacks
        self._runCallbacks()
    --- <exception caught here> ---
      File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/twisted/internet/defer.py",
 line 578, in _runCallbacks
        current.result = callback(current.result, *args, **kw)
      File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/commands/fetch.py",
 line 47, in <lambda>
        cb = lambda x: self._print_response(x, opts)
      File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/commands/view.py",
 line 20, in _print_response
        open_in_browser(response)
      File 
"/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/utils/response.py",
 line 86, in open_in_browser
        response.__class__.__name__)
    exceptions.TypeError: Unsupported response type: Response   

The content type of the url is image/jpeg, why is that I am unable to view 
this?

-- 
You received this message because you are subscribed to the Google Groups 
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To post to this group, send email to [email protected].
Visit this group at http://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.

Reply via email to