The goal is to capture all the outerHTML within class="Item--content--12o-RdR"
Here is the URL I am trying to scrape: https://item.taobao.com/item.htm?id=767876514653
However some elements won’t load even when I try to target them. Example: 颜色分类:
These elements only show up when I use "Inspect Element" in the browser.
Help is appreciated; I am still relatively new to scraping.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
def load_and_capture_content(url, timeout=5):
    """Load *url* in headless Chrome and capture the target element's HTML.

    Args:
        url: Address of the page to open.
        timeout: Maximum number of seconds to wait for the target element
            to be present in the DOM. Defaults to 5.

    Returns:
        A ``(element_content, full_page_html)`` tuple. ``element_content``
        is the ``outerHTML`` of the element matched by the hard-coded XPath,
        or ``None`` when the element did not appear within ``timeout``
        seconds. ``full_page_html`` is ``driver.page_source`` taken after
        the wait finished (successfully or not).
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()),
        options=options,
    )

    element_content = None
    # try/finally guarantees the browser is shut down even when navigation,
    # script execution or page_source raises something other than a timeout.
    try:
        driver.get(url)
        xpath = '//*[@id="root"]/div/div[2]/div[2]/div[2]'
        try:
            # until() returns the located element, so no second lookup
            # (which could race with a re-render) is needed.
            target_element = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.XPATH, xpath))
            )
        except TimeoutException:
            print("Timed out waiting for the element to load. Capturing only the full page content.")
        else:
            # Use JavaScript to read outerHTML so dynamically set attributes
            # are included.
            element_content = driver.execute_script(
                "return arguments[0].outerHTML;", target_element
            )
        full_page_html = driver.page_source
    finally:
        driver.quit()
    return element_content, full_page_html
# Product page whose content is captured when this file is run as a script.
url = "https://item.taobao.com/item.htm?id=767876514653"

if __name__ == "__main__":
    element_content, full_page_html = load_and_capture_content(url)

    # The element HTML is only saved when the target element was found.
    if element_content:
        print("Element content captured.")
        with open("zxpath.html", "w", encoding="utf-8") as element_out:
            element_out.write(element_content)

    # The full page source is always saved.
    print("Full page content captured.")
    with open("xfull_page_content.html", "w", encoding="utf-8") as page_out:
        page_out.write(full_page_html)
I've tried:
- implementing scroll
- waiting longer
- targeting the specific element (I've used both XPath and CSS selectors)
but none seemed to work.