I am trying to scrape a JavaScript-heavy website such as the following: https://schifferstadt.more-rubin1.de/meeting.php?id=ni_2022-01-147 Therefore I am using scrapy-playwright to obtain the text within the documents. When clicking on a document, a PDF viewer shows up, and the content of the PDF can actually be obtained from the HTML code. I managed to extract the text of the PDF with plain Playwright, but unfortunately not with scrapy-playwright. When I request the HTML code of the current page, scrapy-playwright responds with the HTML code of the starting website instead.
import scrapy
from scrapy_playwright.page import PageMethod
from scrapy.selector import Selector
class SchifferstadtSpider(scrapy.Spider):
    """Scrape announcement ("Bekanntmachung") text from the Schifferstadt council portal.

    The site renders documents inside an in-browser PDF viewer, so the spider
    drives a real browser via scrapy-playwright: it clicks the document button
    and then reads the dynamically generated text layer of the viewer.
    """

    name = 'schifferstadt'

    def start_requests(self):
        """Yield the initial meeting-page request, routed through Playwright."""
        url = 'https://schifferstadt.more-rubin1.de/meeting.php?id=ni_2022-01-147'
        yield scrapy.http.Request(
            url,
            callback=self.parse_sitzungen,
            meta=dict(
                playwright=True,
                # Keep the live Playwright page alive so the callback can
                # interact with it (click buttons, re-read its content).
                playwright_include_page=True,
                playwright_page_methods=[
                    PageMethod('wait_for_selector', 'div.tabs-details'),
                ],
            ),
            errback=self.errback_close_page,
        )

    async def parse_sitzungen(self, response):
        """Open the 'Bekanntmachung' document and extract the PDF text layer.

        ``response.text`` is the HTML snapshot taken when the request finished;
        the PDF-viewer content only exists on the live page afterwards, so the
        page content must be re-read via ``page.content()`` after the click.
        """
        page = response.meta["playwright_page"]
        try:
            await page.get_by_role("button", name="Bekanntmachung").click()
            # BUG FIX: wait_for_load_state("networkidle") can resolve before
            # the PDF viewer has rendered its text layer, so page.content()
            # still shows the pre-click page. Wait for the text layer itself.
            await page.wait_for_selector("div.page-text-layer")
            html = Selector(text=await page.content())
            content = ' '.join(
                html.xpath("//div[@class='page-text-layer']/span/text()").getall()
            )
            await page.locator(".file-viewer-modal-document-close-button").click()
        finally:
            # Always close the page: exceptions raised inside this callback do
            # NOT trigger the request errback, so without the finally-clause
            # the browser page would leak on any failure above.
            await page.close()
        yield {'content': content}

    async def errback_close_page(self, failure):
        """Close the Playwright page when the request itself fails."""
        page = failure.request.meta["playwright_page"]
        await page.close()
I am expecting the dynamically loaded HTML code containing the text of the PDF after opening the document in the website's built-in PDF viewer.