How to recursively crawl whole website using scrapy

4k views Asked by At

I want to crawl complete website using scrapy but right now its only crawling single page

import scrapy
from scrapy.http import HtmlResponse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.exporter import JsonItemExporter

class IzodspiderSpider(scrapy.Spider):


name = 'izodspider'
allowed_domains = ['izod.com']
start_urls = ['http://izod.com/']
rules = [Rule(SgmlLinkExtractor(), callback='parse_item', follow=True)]
def parse(self, response):
    hxs = scrapy.Selector(response)
    meta = hxs.xpath('//meta[@name=\'description\']/@content').extract()
    name = hxs.xpath('//div[@id=\'product-details\']/h5').extract()
    desc = hxs.xpath('//div[@id=\'product-details\']/p').extract()

is there any way to extract meta tags using portia ?

1

There are 1 answers

0
aberna On

There is an error in the rule definition and inside the callback.

Since the parse function you use is parse_item you have to call it inside the callback instead of parse

You can find more information about the callback function on the documentation here http://doc.scrapy.org/en/latest/topics/request-response.html?highlight=callback#topics-request-response-ref-request-callback-arguments

class IzodspiderSpider(CrawlSpider):
name = "izod"
depth_limit= 0 
bot_name = 'izod'
allowed_domains = ['izod.com']
start_urls = ['http://www.izod.com']
rules = (
     Rule(SgmlLinkExtractor(allow=('')), callback='parse_items',follow= True),
     )

def parse_items(self, response):
    hxs = scrapy.Selector(response)
    meta = hxs.xpath('//meta[@name=\'description\']/@content').extract()
    name = hxs.xpath('//div[@id=\'product-details\']/h5').extract()
    desc = hxs.xpath('//div[@id=\'product-details\']/p').extract()