A program I wrote using Scrapy and scrapy-playwright only seems to load the peripheral elements of the page. The "meat of the page" stays blank, but unfortunately that is exactly the content I'm trying to scrape. The target URL is:
https://chrome.google.com/webstore/category/ext/22-accessibility
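For context, settings.py just has the standard scrapy-playwright configuration from the library's README, nothing else (Playwright is clearly being used, since the screenshot below does get taken):

# settings.py -- standard scrapy-playwright setup (per the library's README)
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

The spider: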
import scrapy
from scrapy_playwright.page import PageMethod


class ExtensionSpider(scrapy.Spider):
    name = "extension"
    allowed_domains = ["chrome.google.com"]

    def start_requests(self):
        yield scrapy.Request(
            url='https://chrome.google.com/webstore/category/ext/22-accessibility',
            meta={
                'playwright': True,
                'playwright_include_page': True,
                # Wait for an <h1>, scroll to the bottom to trigger lazy loading,
                # then give the page plenty of time to render.
                'playwright_page_methods': [
                    PageMethod('wait_for_selector', '//h1'),
                    PageMethod('evaluate', 'window.scrollBy(0, document.body.scrollHeight)'),
                    PageMethod('wait_for_timeout', 30000),
                ],
            },
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
            },
            callback=self.parse,
            errback=self.errback,
        )

    async def parse(self, response):
        page = response.meta["playwright_page"]
        h1_element = response.xpath('//h1/text()').get()
        grids = response.xpath('//div[@role="grid"]').getall()
        # Full-page screenshot so I can see exactly what Playwright rendered.
        screenshot = await page.screenshot(path="example.png", full_page=True)
        await page.close()
        yield {
            'H1 Loaded': h1_element,
            'Number of grids': len(grids),
        }

    async def errback(self, failure):
        page = failure.request.meta["playwright_page"]
        await page.close()
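I start the spider the usual way (the feed file name here is arbitrary):

scrapy crawl extension -O extensions.json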
The strange part is that no matter whether I use wait_for_selector or wait_for_timeout, only the peripheral elements load: the full-page screenshot (attached) shows just the header and the sidebar, while the main content area with the extension listings stays blank. How can I get the "meat of the page" to render so I can scrape it?
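To rule out something Scrapy-specific, my next step is to try the same page with plain Playwright outside Scrapy, roughly like this (just a sketch; the networkidle wait and the 5-second pause are arbitrary choices, and the div[role="grid"] selector is the same one the spider checks):

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto('https://chrome.google.com/webstore/category/ext/22-accessibility')
    # Same idea as in the spider: wait, scroll to the bottom, wait again, then inspect.
    page.wait_for_load_state('networkidle')
    page.evaluate('window.scrollBy(0, document.body.scrollHeight)')
    page.wait_for_timeout(5000)
    print('grids found:', page.locator('div[role="grid"]').count())
    page.screenshot(path='standalone.png', full_page=True)
    browser.close()

If that also reports zero grids, the problem is presumably with how the Web Store serves the page to a headless browser rather than with the Scrapy integration itself.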
