Selenium image download when browser is minimized, headless mode

70 views Asked by At

This is an ongoing problem, following previous work on retrieving images from web pages while avoiding interference such as Cloudflare.

The method for retrieving images is to get, from Chrome DevTools, the response data of each image loaded on screen after navigating to the URL.

If I keep the Chrome browser in the foreground and let the script scroll down automatically, all images download correctly.

However, if the browser is hidden — running headless, minimized, or covered by other windows — the images are never loaded, so they cannot be downloaded.

Is there any way to solve this?

from selenium import webdriver
from selenium.webdriver.chrome.options import Options 
import time
import subprocess
import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import datetime,time
import base64
import json

# Resolve the download directory relative to this script's own location.
current_dir = os.path.dirname(os.path.realpath(__file__))
down_dir = os.path.join(current_dir, 'download')

# NOTE(review): unused here -- Selenium attaches to the already-running Chrome
# via debuggerAddress below instead of spawning its own chromedriver session.
path = "chromedriver.exe"

# Launch Chrome manually with remote debugging enabled so Selenium can attach
# over the DevTools protocol. Passing the arguments as a list (shell=False)
# avoids quoting problems with the space in "Program Files".
sp = subprocess.Popen([
    r"C:\Program Files\Google\Chrome\Application\chrome.exe",
    "--remote-debugging-port=9222",
    "--auto-open-devtools-for-tabs",
    "--ignore-certificate-errors",
    # "--headless",  # headless keeps images from being fetched (the problem at hand)
])

option = Options()
# Attach to the Chrome instance started above rather than launching a new one.
option.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
# Enable performance logging so Network.* DevTools events appear in get_log().
option.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

browser = webdriver.Chrome(options=option)
# browser.minimize_window() # it is not working for image download, headless too

# start crawling
# NOTE(review): `url` is not defined anywhere in this snippet -- it must be
# assigned before this line runs, otherwise this raises NameError.
browser.get(url)

html_source = browser.page_source
bs = BeautifulSoup(html_source, features="lxml")
title = bs.find("title").text

# The listing <div> holds the links to every sub page crawled below.
page_contents = bs.find("div", class_="listing")

def move_end(browser, pause=0.4):
    """Scroll the browser window to the bottom of the page, one viewport at a time.

    Scrolling slowly gives lazily-loaded images time to be requested, which is
    what populates the Network log read by the caller.

    Args:
        browser: a Selenium WebDriver whose current page should be scrolled.
        pause: seconds to wait after each scroll step (default 0.4, matching
            the original hard-coded delay; backward-compatible).
    """
    total_page_height = browser.execute_script("return document.body.scrollHeight")
    viewport_height = browser.get_window_size(windowHandle='current')['height']
    current_position = browser.execute_script('return window.pageYOffset')
    # Step down one viewport at a time until the bottom is within one screen.
    while total_page_height - current_position > viewport_height:
        # Fix: window.scrollTo(x, y) -- the first argument is the horizontal
        # offset and must stay 0; the original passed the vertical position.
        browser.execute_script(f"window.scrollTo(0, {current_position + viewport_height});")
        current_position = browser.execute_script('return window.pageYOffset')
        # Re-read the height: infinite-scroll pages grow as content loads.
        total_page_height = browser.execute_script("return document.body.scrollHeight")
        time.sleep(pause)  # give lazy-loading some time to fire requests
              
# Collect every sub page linked from the listing and download its images.
for anchor in page_contents.findAll('a'):
    page_title = anchor.text.strip()
    if not page_title:
        continue

    href = urljoin(url, anchor.attrs['href'])
    browser.get(href)
    time.sleep(2)  # let the sub page start loading before parsing it

    # Extract image URLs from the sub page. Resolve them against the page URL
    # so relative data-src values match the absolute URLs the Network log
    # reports (urljoin is a no-op for already-absolute URLs). Skip <img> tags
    # that have no data-src attribute instead of raising KeyError.
    bs = BeautifulSoup(browser.page_source, features="lxml")
    image_contents = bs.find("div", class_="content")
    img_urls = [urljoin(href, img['data-src'])
                for img in image_contents.findAll('img')
                if img.has_attr('data-src')]

    # Scroll to the end of the page so lazy-loaded images actually get fetched.
    move_end(browser)
    time.sleep(1)
    os.makedirs(os.path.join(down_dir, title, page_title), exist_ok=True)

    # Walk the DevTools performance log and save each matching image response.
    for log in browser.get_log("performance"):
        message = log["message"]
        if "Network.responseReceived" not in message:
            continue
        params = json.loads(message)["message"].get("params")
        response = params.get("response") if params else None
        if not response or response["url"] not in img_urls:
            continue
        try:
            body = browser.execute_cdp_cmd(
                'Network.getResponseBody', {'requestId': params["requestId"]})
            # DevTools flags whether the body is base64-encoded; blindly
            # b64-decoding a plain-text body would corrupt the file.
            if body.get('base64Encoded'):
                data = base64.b64decode(body['body'])
            else:
                data = body['body'].encode()
            # NOTE(review): the last URL path segment may contain query
            # strings or characters invalid in filenames -- verify.
            file_name = response["url"].split('/')[-1]
            with open(os.path.join(down_dir, title, page_title, file_name), 'wb') as img_file:
                img_file.write(data)
            img_urls.remove(response["url"])
        except Exception:
            # Best-effort: the response body may already have been evicted
            # from Chrome's network cache by the time we request it.
            pass
sp.terminate()

0

There are 0 answers