I'm very new to Scrapy in particular and somewhat new to coding in general.
I'm trying to parse some data for my school project from this website: https://www.brickeconomy.com/sets/theme/ninjago
I want to parse data from a page, then move on to the next one and parse similar data from there. However, the "Next" page button is not a simple link but a JavaScript command, so I set up the code to use a Lua script that simulates clicking the button, moves to the next page, and scrapes data from it. It looked something like this:
import scrapy
from scrapy_splash import SplashRequest

script = """
function main(splash, args)
    assert(splash:go(args.url))
    local c = args.counter
    -- click "Next" c times to reach page c+1; the 12th 'a.page-link' is the Next button
    for i = 1, c do
        local button = splash:select_all('a.page-link')[12]
        button:click()
        assert(splash:wait(5))
    end
    return splash:html()
end
"""

class LegoTestSpider(scrapy.Spider):
    name = 'legotest'

    def start_requests(self):
        url = 'https://www.brickeconomy.com/sets/theme/ninjago'
        yield SplashRequest(
            url=url,
            callback=self.parse,
            endpoint='execute',
            # 'counter' tells the Lua script how many times to click "Next"
            args={'wait': 1, 'lua_source': script, 'url': url, 'counter': 1},
        )

    def parse(self, response):
        products = response.css('div.mb-5')
        for product in products:
            yield {
                'name': product.css('h4 a::text').get(),
                'link': product.css('h4 a').attrib['href'],
            }
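For reference, this assumes the standard scrapy-splash wiring from its README in settings.py, with Splash running locally on port 8050:

SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'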
Although this worked, I wanted to create a loop that went through all the pages and returned data parsed from every single one. I attempted something like this:
import scrapy
from scrapy_splash import SplashRequest

lua_script = """
function main(splash, args)
    assert(splash:go(args.url))
    -- wait until the product list has rendered
    while not splash:select('div.mb-5') do
        splash:wait(0.1)
        print('waiting...')
    end
    return {html=splash:html()}
end
"""

script = """
function main(splash, args)
    assert(splash:go(args.url))
    local c = args.counter
    -- click "Next" c times to reach page c+1
    for i = 1, c do
        local button = splash:select_all('a.page-link')[12]
        button:click()
        assert(splash:wait(5))
    end
    return splash:html()
end
"""

class LegoTestSpider(scrapy.Spider):
    name = 'legotest'

    def start_requests(self):
        url = 'https://www.brickeconomy.com/sets/theme/ninjago'
        yield SplashRequest(
            url=url,
            callback=self.parse,
            endpoint='execute',
            args={'wait': 1, 'lua_source': lua_script, 'url': url},
        )

    def parse(self, response):
        # Checks if it's the last page
        page_numbers = response.css('table.setstable td::text').getall()
        counter = -1
        while page_numbers[1] != page_numbers[2]:
            counter += 1
            yield SplashRequest(
                url='https://www.brickeconomy.com/sets/theme/ninjago',
                callback=self.parse_nextpage,
                endpoint='execute',
                args={
                    'wait': 1,
                    'lua_source': script,
                    'url': 'https://www.brickeconomy.com/sets/theme/ninjago',
                    'counter': counter,
                },
            )

    def parse_nextpage(self, response):
        products = response.css('div.mb-5')
        for product in products:
            yield {
                'name': product.css('h4 a::text').get(),
                'link': product.css('h4 a').attrib['href'],
            }
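To explain the check at the top of parse: the page has a small summary table (table.setstable) under the pagination controls, and, if I'm reading the markup right, its cell texts include the index of the last set shown on the current page and the total number of sets, so the loop is meant to keep yielding requests until those two numbers match. A hypothetical illustration of what I expect the selector to return:

# Hypothetical shape of the output, e.g. for a footer like "Showing 26 to 50 of 713"
page_numbers = response.css('table.setstable td::text').getall()
# -> ['26', '50', '713']; page_numbers[1] is the last set shown on this page,
#    page_numbers[2] the total, so they should only be equal on the final page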
However, when I run this code, it returns the first page of data, then gives a timeout error:
2024-02-18 17:26:18 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://www.brickeconomy.com/sets/theme/ninjago via http://localhost:8050/execute> (failed 1 times): 504 Gateway Time-out
I'm not sure why this happens and would like to know how to fix it.
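My only guess is the timing: every simulated click adds a five-second splash:wait, so requests with a larger counter run longer and longer inside Splash, and at some point the script would exceed Splash's default 30-second render timeout. A minimal sketch of the workaround I considered, assuming the Splash server's --max-timeout allows it, is passing a larger timeout with each request:

yield SplashRequest(
    url=url,
    callback=self.parse_nextpage,
    endpoint='execute',
    # 'timeout' is the Splash render timeout (default 30 s, capped by --max-timeout)
    args={'wait': 1, 'lua_source': script, 'url': url,
          'counter': counter, 'timeout': 90},
)

But I'm not sure that addresses the real problem, since page_numbers never changes inside the while loop in parse, so it looks like it would keep yielding requests with ever-larger counter values regardless.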