I'm scraping review data from Glassdoor. The code passed my tests when I first finished writing it, but when I run it at larger scale I always hit a Cloudflare verification page. How can I get past it? Here is my code:
import csv
import time
import random
import logging
from selenium import webdriver as wd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
DEFAULT_URL = 'https://www.glassdoor.com/Reviews/index.htm?overall_rating_low=3.5&page=1&locId=1140588&locType=C'
COMPANY_NAMES = ['Super Microsoft Technology', 'NICE Holdings']
USERNAME = 'username'
PASSWORD = 'password'
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
logger.addHandler(ch)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(lineno)d:%(filename)s(%(process)d) - %(message)s')
ch.setFormatter(formatter)
def sign_in():
    # Log in so Glassdoor does not truncate review pages for anonymous visitors.
    url = 'https://www.glassdoor.com/profile/login_input.htm'
    browser.get(url)
    email_field = browser.find_element(By.CSS_SELECTOR, 'input[autocomplete="username"]')
    email_field.send_keys(USERNAME)
    time.sleep(3)
    next_button = browser.find_element(By.XPATH, '//*[@id="InlineLoginModule"]/div/div[1]/div/div/div/div/form/div[2]/button')
    next_button.click()
    time.sleep(3)
    # The same button position is reused for the password submit step.
    password_field = browser.find_element(By.CSS_SELECTOR, 'input[autocomplete="current-password"]')
    submit_btn = browser.find_element(By.XPATH, '//*[@id="InlineLoginModule"]/div/div[1]/div/div/div/div/form/div[2]/button')
    password_field.send_keys(PASSWORD)
    submit_btn.click()
    time.sleep(3)
    browser.get(DEFAULT_URL)
def navigate_to_reviews(company_name):
    # Search for the company from the reviews landing page and open the first hit.
    browser.get(DEFAULT_URL)
    time.sleep(random.uniform(1, 3))
    search_field = WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#companyAutocomplete-companyDiscover-employerSearch')))
    search_field.send_keys(company_name)
    search_button = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.gd-ui-button[data-test="company-search-button"]')))
    search_button.click()
    time.sleep(random.uniform(1, 3))
    search_results = WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.d-flex.flex-column.my-std.mb-sm-0.css-1b46kjl a')))
    if not search_results:
        print("No search results found.")
        return []
    search_results[0].click()
    time.sleep(random.uniform(1, 3))
    # Follow the "Reviews" tab link directly via its href.
    WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//a[@data-test="ei-nav-reviews-link"]')))
    reviews_cell = browser.find_element(By.XPATH, '//a[@data-test="ei-nav-reviews-link"]')
    reviews_path = reviews_cell.get_attribute('href')
    browser.get(reviews_path)
    time.sleep(random.uniform(1, 3))
    WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '//*[starts-with(@id, "empReview")]')))
    reviews = []
    while True:
        # Collect every review card on the current page, then paginate until
        # the "next page" button disappears or is disabled.
        review_elements = browser.find_elements(By.XPATH, '//*[starts-with(@id, "empReview")]')
        for review_element in review_elements:
            reviews.append(extract_pros_cons(review_element))
        next_button = browser.find_elements(By.XPATH, '//button[@data-test="next-page" and not(@disabled)]')
        if not next_button:
            break
        next_button[0].click()
        time.sleep(random.uniform(1, 3))
        WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, '//*[starts-with(@id, "empReview")]')))
    return reviews
def get_browser():
    logger.info('Configuring browser')
    options = wd.EdgeOptions()
    options.add_argument('log-level=3')
    options.add_argument('--incognito')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-setuid-sandbox')
    # Note: this hard-codes a Chrome user agent on an Edge browser; the
    # mismatch is itself something a bot detector can flag.
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36')
    return wd.Edge(options=options)
def extract_pros_cons(review_element):
    # Pull the fields of interest out of a single review card.
    pros_element = review_element.find_element(By.CSS_SELECTOR, '.review-details_pro__rMvtX span[data-test="review-text-pros"]')
    cons_element = review_element.find_element(By.CSS_SELECTOR, '.review-details_con__9IvnD span[data-test="review-text-cons"]')
    date_element = review_element.find_element(By.CSS_SELECTOR, '.timestamp_reviewDate__fBGY6')
    employee_status_element = review_element.find_element(By.CSS_SELECTOR, '.review-details_employeeDetails__LuKJ7')
    overall_rating_element = review_element.find_element(By.CSS_SELECTOR, '.review-details_overallRating__Rxhdr')
    # Expand truncated pros/cons text before reading it.
    show_more_button = review_element.find_elements(By.CSS_SELECTOR, '.review-details_showMoreButton__x_JZx button')
    if show_more_button:
        show_more_button[0].click()
        time.sleep(random.uniform(1, 3))
    return {
        "pros": pros_element.text.strip(),
        "cons": cons_element.text.strip(),
        "date": date_element.text.strip(),
        "employee_status": employee_status_element.text.strip(),
        "overall_rating": float(overall_rating_element.text.strip()),
    }
def write_to_csv(company_name, reviews):
    # One CSV per company, named after the company.
    fieldnames = ['pros', 'cons', 'date', 'employee_status', 'overall_rating']
    filename = f"{company_name}.csv"
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for review in reviews:
            writer.writerow(review)
if __name__ == "__main__":
    browser = get_browser()
    try:
        sign_in()
        for company_name in COMPANY_NAMES:
            reviews = navigate_to_reviews(company_name)
            write_to_csv(company_name, reviews)
    finally:
        # Always release the browser, even if a scrape step throws.
        browser.quit()
Some of the selectors may look weird, but they do pass my tests. How can I get past the verification so I can scrape reviews for about 1,000 companies? The two half-formed ideas I've had so far are sketched below.
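The only concrete idea I've found is swapping plain Selenium for undetected-chromedriver. This is a minimal sketch of how I think get_browser could change, assuming switching from Edge to Chrome is acceptable; I have not verified that it actually gets past Glassdoor's Cloudflare check:

# Sketch only: replaces wd.Edge with undetected-chromedriver
# (pip install undetected-chromedriver). Assumes Chrome instead of Edge.
import undetected_chromedriver as uc

def get_browser():
    logger.info('Configuring browser')
    options = uc.ChromeOptions()
    options.add_argument('--disable-gpu')
    # Deliberately no --incognito and no spoofed user agent here: a UA
    # string that does not match the real browser is itself a bot signal.
    return uc.Chrome(options=options)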
Thanks a lot for any help!!!
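EDIT: the other idea is pacing the main loop harder and recycling the browser session periodically, roughly like the sketch below (it reuses the functions from my script above). The numbers (batch size, sleep range) are guesses, not tuned values, and I haven't confirmed this avoids the challenge:

# Sketch: jittered pauses between companies plus a fresh browser every
# 25 companies, so the traffic looks less like a constant-rate bot.
if __name__ == "__main__":
    browser = get_browser()
    sign_in()
    for i, company_name in enumerate(COMPANY_NAMES):
        if i and i % 25 == 0:
            # Recycle the session: quit, relaunch, and log in again.
            browser.quit()
            browser = get_browser()
            sign_in()
        reviews = navigate_to_reviews(company_name)
        write_to_csv(company_name, reviews)
        time.sleep(random.uniform(20, 60))  # long, jittered pause per company
    browser.quit()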