I am currently working with scrapy-playwright and trying to scrape the following URL, https://www.paniniamerica.net/checklist, but I get the following error:
web_1 | Traceback (most recent call last):
web_1 | File "/.venv/lib/python3.10/site-packages/playwright/_impl/_connection.py", line 39, in send
web_1 | return await self.inner_send(method, params, False)
web_1 | File "/.venv/lib/python3.10/site-packages/playwright/_impl/_connection.py", line 63, in inner_send
web_1 | result = next(iter(done)).result()
web_1 | playwright._impl._api_types.Error: headers[6].value: expected string, got object
System info
Playwright Version: 1.19
Operating System: Ubuntu 18
Browser: Chromium
Other info:
Scrapy Version: ^2.7.1
Scrapy-Playwright: ^0.0.26
I couldn't get past this error. Please see the spider code below:
class ExSpider(scrapy.Spider):
    """Spider that loads the Panini checklist page through scrapy-playwright.

    The start request is rendered in a Playwright page; ``parse_type`` then
    reads the ``select#damage_type`` element from the rendered response.
    """

    name = "ex_spider"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DUPEFILTER_CLASS": "scrapy.dupefilters.BaseDupeFilter",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        # scrapy-playwright's documented setting name is PLAYWRIGHT_BROWSER_TYPE;
        # the previous "SCRAPY_PLAYWRIGHT_BROWSER_TYPE" key is not one it reads.
        "PLAYWRIGHT_BROWSER_TYPE": "chromium",
        # Milliseconds. Same value as the previous ``0 * 1000``.
        "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0,
        "PLAYWRIGHT_CONTEXTS": {
            "default": {
                "viewport": {
                    "width": 1920,
                    "height": 980,
                },
            },
        },
        "CONCURRENT_REQUESTS": 20,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 20,
        "CONCURRENT_ITEMS": 20,
        "REACTOR_THREADPOOL_MAXSIZE": 20,
        "RETRY_TIMES": 3,
        "PLAYWRIGHT_ABORT_REQUEST": should_abort_request,
    }

    def start_requests(self):
        """Yield the single Playwright-rendered request for the checklist page."""
        url = "https://www.paniniamerica.net/checklist"
        logger.info("Start the scraper")
        req = scrapy.Request(
            url,
            callback=self.parse_type,
            # ``errback`` is a Request argument; placed inside ``meta`` it is
            # only carried along as data and never invoked on failure.
            errback=self.errback,
            meta=dict(
                playwright=True,
                playwright_context="default",
                playwright_include_page=True,
                playwright_page_methods=[
                    # timeout=0 disables the wait_for_selector timeout.
                    PageMethod(
                        "wait_for_selector",
                        "select#damage_type",
                        timeout=0,
                        state="visible",
                    ),
                    PageMethod("wait_for_load_state", "load"),
                ],
            ),
        )
        print(req.headers)
        yield req

    async def parse_type(self, response):
        """Close the Playwright page and log the text found in the type select."""
        print("Here")
        page = response.meta["playwright_page"]
        # The page was requested via playwright_include_page, so it is this
        # callback's job to close it.
        await page.close()
        logger.info("Check point")
        # NOTE(review): "::text" on the <select> yields only its own direct text
        # nodes; if the option labels are wanted, "select#damage_type option::text"
        # is probably the intended selector -- confirm.
        select_types = response.css("select#damage_type::text").extract()
        logger.info(f"select type are {select_types}")

    async def errback(self, failure):
        """Log a failed request and close its Playwright page if one was attached."""
        logger.error("Request failed: %r", failure)
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()
Thanks in advance