How to obtain new html content after clicking button?

296 views Asked by At

I am trying to scrape a JavaScript-heavy website such as the following: https://schifferstadt.more-rubin1.de/meeting.php?id=ni_2022-01-147 To obtain the text within the documents, I am using scrapy-playwright. When a document is clicked, a PDF viewer opens and the content of the PDF can be obtained from the HTML code. I managed to extract the text of the PDF with plain Playwright, but unfortunately not with scrapy-playwright: when I request the HTML code of the current page, scrapy-playwright returns the HTML code of the starting page instead.

import scrapy
from scrapy_playwright.page import PageMethod
from scrapy.selector import Selector

class SchifferstadtSpider(scrapy.Spider):
    """Scrape the text of the "Bekanntmachung" document of one meeting page.

    The page is rendered through scrapy-playwright. The callback receives the
    live Playwright page (``playwright_include_page``), clicks the document
    button so the site's viewer opens, and reads the text spans out of the
    rendered HTML.
    """

    name = 'schifferstadt'

    def start_requests(self):
        """Yield the single Playwright-rendered request for the meeting page."""
        url = 'https://schifferstadt.more-rubin1.de/meeting.php?id=ni_2022-01-147'
        yield scrapy.http.Request(
            url,
            callback=self.parse_sitzungen,
            errback=self.errback_close_page,
            meta={
                "playwright": True,
                # Hand the Playwright page to the callback, which then
                # becomes responsible for closing it.
                "playwright_include_page": True,
                "playwright_page_methods": [
                    PageMethod('wait_for_selector', 'div.tabs-details'),
                ],
            },
        )

    async def parse_sitzungen(self, response):
        """Open the document viewer and yield the text found in its text layer.

        Yields:
            A dict ``{'content': <str>}`` holding the space-joined text of all
            ``span`` children of ``div.page-text-layer`` elements.
        """
        page = response.meta["playwright_page"]

        # Close the page even when a click or wait raises; otherwise the
        # page stays open for the rest of the crawl.
        try:
            await page.get_by_role("button", name="Bekanntmachung").click()
            await page.wait_for_load_state("networkidle")
            # NOTE(review): if the viewer fills its text layer only after the
            # network goes idle, the snapshot below may be taken too early;
            # waiting for "div.page-text-layer span" may be needed — confirm.

            # Snapshot the DOM as it is *after* the click, not response.text.
            rendered_html = await page.content()
            selector = Selector(text=rendered_html)
            content = ' '.join(
                selector.xpath("//div[@class='page-text-layer']/span/text()").getall()
            )

            await page.locator(".file-viewer-modal-document-close-button").click()
        finally:
            await page.close()

        yield {'content': content}

    async def errback_close_page(self, failure):
        """Close the Playwright page of a failed request, if one was attached."""
        # The request may have failed before a page was stored in meta; a
        # KeyError here would mask the original failure.
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()

I am expecting the dynamically loaded HTML code, including the text of the PDF, after opening the document in the website's built-in PDF viewer.

0

There are 0 answers