I'm trying to use Scrapy to find image URLs that are used more than once across all pages of a website.

This is my spider:

# -*- coding: utf-8 -*-
from collections import defaultdict

import scrapy
from scrapy import signals
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ExampleSpider(CrawlSpider):
    """Crawl a site and report image URLs that occur more than once.

    Every page reached through ``<a href>`` links inside ``allowed_domains``
    is passed to :meth:`parse_item`, which tallies the ``<img src>`` values
    it finds. When the crawl finishes, :meth:`spider_closed` logs every
    image URL whose tally is greater than one.
    """

    # Let 403/404 responses reach the callback instead of being dropped by
    # the HTTP-error middleware.
    handle_httpstatus_list = [403, 404]
    name = 'Example'
    allowed_domains = ['url.com']
    start_urls = ['http://url.com/']
    custom_settings = {
        'LOG_LEVEL': 'INFO',
    }
    # Absolute image URL -> number of <img> occurrences seen so far.
    # NOTE: class-level, so the tally is shared by every instance created in
    # the same process.
    count_image_occurrences = defaultdict(int)

    rules = (
        Rule(LinkExtractor(tags='a', attrs='href', unique=True),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Tally every ``<img src>`` found in ``response``.

        The ``src`` values are resolved against the page URL first, so the
        same image referenced relatively on one page and absolutely on
        another is counted under a single key.
        """
        for src in response.xpath('//img/@src').extract():
            self.count_image_occurrences[response.urljoin(src)] += 1

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and subscribe it to the ``spider_closed`` signal."""
        spider = super(ExampleSpider, cls).from_crawler(crawler, *args, **kwargs)
        # Without this connection spider_closed() is never invoked.
        crawler.signals.connect(spider.spider_closed,
                                signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        """Log every image URL that was seen more than once.

        NOTE(review): the original handler had no body; reporting through the
        spider logger is an assumption based on the stated goal -- adjust the
        output format/destination as needed.
        """
        repeated = [
            (url, hits)
            for url, hits in self.count_image_occurrences.items()
            if hits > 1
        ]
        # Most frequently used first; ties broken alphabetically so the
        # report is deterministic.
        repeated.sort(key=lambda pair: (-pair[1], pair[0]))

        self.logger.info('%d image URL(s) used more than once', len(repeated))
        for url, hits in repeated:
            self.logger.info('%d x %s', hits, url)

Is there a more efficient way to do this, in terms of speed, memory, or code length?

0 Answers