Scrapy `ReactorNotRestartable`: one class to run two (or more) spiders


I'm aggregating daily data with Scrapy using a two-stage crawl. The first stage generates a list of URLs from an index page, and the second stage writes the HTML for each URL in the list to a Kafka topic.

[Diagram: Kafka cluster for the Scrapy crawler]
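The Kafka side is out of scope here, but for context, a sketch of how the second stage could publish HTML to a topic, assuming the kafka-python package and a hypothetical `KafkaHtmlPipeline` item pipeline (the field and topic names are illustrative):

from kafka import KafkaProducer

class KafkaHtmlPipeline:
    """Hypothetical Scrapy item pipeline that publishes crawled HTML to a Kafka topic."""

    def open_spider(self, spider):
        self.producer = KafkaProducer(bootstrap_servers='localhost:9092')

    def process_item(self, item, spider):
        # 'html' field and 'somesite.html' topic are illustrative names
        self.producer.send('somesite.html', item['html'].encode('utf-8'))
        return item

    def close_spider(self, spider):
        self.producer.flush()
        self.producer.close()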

Although the two components of the crawl are related, I'd like them to be independent: the `url_generator` would run as a scheduled task once a day, and the `page_requester` would run continually, processing URLs as they become available. To stay "polite", I'll adjust the `DOWNLOAD_DELAY` so that the crawler finishes well within the 24-hour window while putting minimal load on the site.
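For reference, the politeness knobs live in the project's settings module; a sketch with illustrative values:

# scrapy_somesite/settings.py (values are illustrative)
DOWNLOAD_DELAY = 5                   # seconds to wait between requests to the same site
CONCURRENT_REQUESTS_PER_DOMAIN = 1   # never hit the site with parallel requests
RANDOMIZE_DOWNLOAD_DELAY = True      # jitter each delay between 0.5x and 1.5x DOWNLOAD_DELAY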

I created a CrawlerRunner class that has functions to generate the URLs and retrieve the HTML:

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy_somesite.spiders.create_urls_spider import CreateSomeSiteUrlList
from scrapy_somesite.spiders.crawl_urls_spider import SomeSiteRetrievePages
from scrapy.utils.project import get_project_settings
import os
import sys

class CrawlerRunner:

    def __init__(self):
        sys.path.append(os.path.join(os.path.curdir, "crawl/somesite"))
        os.environ['SCRAPY_SETTINGS_MODULE'] = 'scrapy_somesite.settings'
        self.settings = get_project_settings()
        log.start()

    def create_urls(self):
        spider = CreateSomeSiteUrlList()
        crawler_create_urls = Crawler(self.settings)
        crawler_create_urls.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler_create_urls.configure()
        crawler_create_urls.crawl(spider)
        crawler_create_urls.start()
        reactor.run()  # starts the reactor; it stops once the spider closes

    def crawl_urls(self):
        spider = SomeSiteRetrievePages()
        crawler_crawl_urls = Crawler(self.settings)
        crawler_crawl_urls.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler_crawl_urls.configure()
        crawler_crawl_urls.crawl(spider)
        crawler_crawl_urls.start()
        reactor.run()  # raises ReactorNotRestartable when called after create_urls()

When I instantiate the class, I'm able to execute either function successfully on its own, but unfortunately I'm unable to execute them together:

from crawl.somesite import crawler_runner

cr = crawler_runner.CrawlerRunner()

cr.create_urls()
cr.crawl_urls()

The second function call raises `twisted.internet.error.ReactorNotRestartable` when it reaches `reactor.run()` inside `crawl_urls`.
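As far as I can tell, the underlying issue is that a Twisted reactor can only be started once per process; a minimal, Scrapy-free sketch reproduces the same error:

from twisted.internet import reactor
from twisted.internet.error import ReactorNotRestartable

reactor.callWhenRunning(reactor.stop)
reactor.run()  # first run: starts, then stops immediately

try:
    reactor.run()  # second run: the reactor refuses to restart
except ReactorNotRestartable:
    print("a Twisted reactor cannot be restarted once stopped")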

I'm wondering if there's an easy fix for this code (e.g. some way to run two separate Twisted reactors), or if there's a better way to structure this project.

1 Answer

Best answer, by Alex Woolford:

It was possible to run multiple spiders within one reactor by keeping the reactor open until all the spiders have stopped running. This was achieved by keeping a list of all the running spiders and not calling `reactor.stop()` until that list is empty:

import sys
import os
from scrapy.utils.project import get_project_settings
from scrapy_somesite.spiders.create_urls_spider import Spider1
from scrapy_somesite.spiders.crawl_urls_spider import Spider2

from scrapy import signals, log
from twisted.internet import reactor
from scrapy.crawler import Crawler

class CrawlRunner:

    def __init__(self):
        self.running_crawlers = []

    def spider_closing(self, spider):
        """Remove the closed spider from the running list; stop the reactor once the list is empty."""
        log.msg("Spider closed: %s" % spider, level=log.INFO)
        self.running_crawlers.remove(spider)
        if not self.running_crawlers:
            reactor.stop()

    def run(self):

        sys.path.append(os.path.join(os.path.curdir, "crawl/somesite"))
        os.environ['SCRAPY_SETTINGS_MODULE'] = 'scrapy_somesite.settings'
        settings = get_project_settings()
        log.start(loglevel=log.DEBUG)

        to_crawl = [Spider1, Spider2]

        for spider in to_crawl:

            crawler = Crawler(settings)
            crawler_obj = spider()
            self.running_crawlers.append(crawler_obj)

            # call spider_closing() whenever a spider finishes
            crawler.signals.connect(self.spider_closing, signal=signals.spider_closed)
            crawler.configure()
            crawler.crawl(crawler_obj)
            crawler.start()

        # one reactor serves all the crawlers; spider_closing() stops it
        # after the last spider closes
        reactor.run()

The class is executed as follows:

from crawl.somesite.crawl import CrawlRunner

cr = CrawlRunner()
cr.run()
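As an aside, the code above uses the pre-1.0 `Crawler` API; newer Scrapy releases (1.0 and later) ship a `scrapy.crawler.CrawlerProcess` helper that manages the reactor internally. A sketch of the same two-spider run with it, assuming the spiders import as above:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(Spider1)  # schedule both spiders on the same internal reactor
process.crawl(Spider2)
process.start()  # blocks until both crawls finish, then stops the reactor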

This solution was based on a blog post by Kiran Koduru.