How can I load all the elements of a webpage with Selenium?

29 views

The goal is to capture all the outerHTML within class="Item--content--12o-RdR"

Here is the URL I am trying to scrape from https://item.taobao.com/item.htm?id=767876514653

However some elements won’t load even when I try to target them. Example:

 颜色分类:

These elements are only visible when I inspect the page in the browser's developer tools; they do not appear in the HTML my script retrieves.

Help is appreciated — I am still relatively new to scraping.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException

def load_and_capture_content(url):
    """Load *url* in headless Chrome and capture a target element's outerHTML.

    Parameters
    ----------
    url : str
        The page to load.

    Returns
    -------
    tuple
        ``(element_content, full_page_html)`` where ``element_content`` is the
        outerHTML of the target element (or ``None`` if it never appeared) and
        ``full_page_html`` is the full rendered page source.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")

    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()), options=options
    )

    element_content = None
    try:
        driver.get(url)

        # NOTE(review): an absolute XPath like this is brittle — any layout
        # change on the page breaks it. Prefer a By.CSS_SELECTOR on a stable
        # class name if one exists.
        xpath = '//*[@id="root"]/div/div[2]/div[2]/div[2]'
        try:
            # presence_of_element_located returns the element itself, so the
            # previous separate find_element() call was redundant.
            target_element = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, xpath))
            )
            # Fetch outerHTML via JS so dynamically-set attributes are included.
            element_content = driver.execute_script(
                "return arguments[0].outerHTML;", target_element
            )
        except TimeoutException:
            print("Timed out waiting for the element to load. Capturing only the full page content.")

        full_page_html = driver.page_source
    finally:
        # Always shut the browser down, even if driver.get() or the JS call
        # raises — the original version leaked a Chrome process on any
        # exception other than TimeoutException.
        driver.quit()

    return element_content, full_page_html

url = "https://item.taobao.com/item.htm?id=767876514653"

if __name__ == "__main__":
    # Run the scrape once and persist whatever was captured.
    snippet, page_html = load_and_capture_content(url)

    # The element snippet is optional — it is None when the wait timed out.
    if snippet:
        print("Element content captured.")
        with open('zxpath.html', 'w', encoding='utf-8') as fh:
            fh.write(snippet)

    # The full page source is always available.
    print("Full page content captured.")
    with open('xfull_page_content.html', 'w', encoding='utf-8') as fh:
        fh.write(page_html)

I've tried:

  1. implementing scroll
  2. waiting longer
  3. targeting the specific element (I've used both XPath and CSS selectors)

but none seemed to work.

0

There are 0 answers