I'm building a web scraper with Flask, Celery, and Selenium. The scraper runs without any issues on my local machine, but when I try to run it on the production server behind NGINX, I get the following error:
selenium.common.exceptions.WebDriverException: Message: Process unexpectedly closed with status 1
Scraper:
from selenium.webdriver.firefox.service import Service
import sys, os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
from dotenv import load_dotenv
from check_email import EmailedReferrals, Referrals
from sqlalchemy import create_engine, Column, Integer, String, Date, Text
from sqlalchemy.orm import declarative_base
from sqlalchemy.orm import sessionmaker
from datetime import datetime
from check_email import EmailedReferrals, Referrals
import logging
logger = logging.getLogger('celery')
logger.setLevel(logging.DEBUG)
load_dotenv()
env_conf = os.getenv("CONFIG")
Base = declarative_base()
if env_conf == "dev":
driver = webdriver.Firefox()
DATABASE_URL = os.getenv('DEV_DATABASE_URL')
else:
# Set up Firefox options
options = Options()
# Set window size
options.add_argument("--headless=new")
options.add_argument("--start-maximized")
options.add_argument("--window-size=1920, 1080")
#options.add_argument("user-agent= Chrome/111.0.5563.64") # need to update firefox here
options.add_argument("--no-sandbox")
options.add_argument("--disable-extensions")
options.add_argument("--disable-gpu")
options.add_argument('--disable-dev-shm-usage')
# Include the path to the Firefox binary
firefox_binary_path = '/usr/bin/firefox'
geckodriver_path = '/home/fusion/bin/geckodriver'
options.binary_location = firefox_binary_path
service = Service(executable_path=geckodriver_path)
driver = webdriver.Firefox(options=options)
DATABASE_URL = os.getenv('PROD_DATABASE_URL')
engine = create_engine(DATABASE_URL)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
def get_row_xpath(row, cell): #pulls xpath of rows and cells from Incoming Referrals View
ctl = str(row).zfill(2)
action = row - 3
id_cell = f"""/html/body/form/table/tbody/tr[4]/td[2]/table/tbody/tr/td/table/tbody/tr[2]/td/div/table/tbody\
/tr/td[3]/div/table/tbody/tr[2]/td/div/table/tbody/tr[{row}]/td[{cell}]"""
select_element = f"""//*[@id="ucViewGrid_dgView_ctl{ctl}_Actions_{action}_0_0_ActionItems"]"""
go_button = f"""#ucViewGrid_dgView_ctl{ctl}_Actions_{action}_0_0_ActionButton > input:nth-child(1)"""
return id_cell, select_element, go_button
def main(referral_id_from_flask, comment):
print('hello from main')
logger.info("hello from main")
driver.get("https://example.com")
#Click Account and sign in
driver.find_element(By.XPATH, '//*[@id="UserNameTextBox"]').send_keys(os.getenv('CAREPORT_USERNAME'))
driver.find_element(By.XPATH, '//*[@id="PasswordTextBox"]').send_keys(os.getenv('CAREPORT_PASSWORD'))
driver.find_element(By.XPATH, '//*[@id="btnLoginHack"]').click()
sleep(2)
driver.find_element(By.XPATH, '//*[@id="ctl13"]').click()
#loop through table
cell = 3
for row in range(3, 52):
logger.info(f"Looping: {row}")
print(f"Loop #: {row}")
id_cell = driver.find_element(By.XPATH, get_row_xpath(row, cell)[0]).text
referral_id = id_cell.split('\n')[0].strip()
#find row with the correct
if referral_id == referral_id_from_flask:
#Select View Online Referral
select_element = driver.find_element(By.XPATH, get_row_xpath(row, cell)[1])
select = Select(select_element)
select.select_by_visible_text("View Online Referral")
#click go
driver.find_element(By.CSS_SELECTOR, get_row_xpath(row, cell)[2]).click()
#Find select box
select_element = driver.find_element(By.XPATH, """//*[@id="dgProviders_ctl02_ddResponse"]""" )
select = Select(select_element)
select.select_by_value("1")
sleep(0.5)
#comment box
# wait up to 10 seconds for the element to be present and clickable
comment_box = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.XPATH, '//*[@id="dgProviders_ctl02_txtProviderComments"]'))
)
comment_box.clear()
sleep(0.5) # Add a small delay here
comment_box.send_keys(comment)
sleep(0.5) # Add a small delay here
#click send response
#driver.find_element(By.XPATH, """//*[@id="ButtonBarSendResponse"]""").click()
#add to query
r = session.query(Referrals).filter(Referrals.referral_id == int(referral_id)).first()
r.referral_acceptence_status = "accepted"
session.commit()
session.close()
print('done')
break
if __name__ == "__main__":
referral_id_from_flask = sys.argv[1]
comment = sys.argv[2]
print('hello from program')
main(referral_id_from_flask, comment)
I've already checked the paths to Firefox and GeckoDriver, and I'm sure that they're correct. Additionally, I've tried isolating the scraper script and running it separately from Flask and Celery, and it works without any issues.
However, when I run the scraper with Celery on the production server, I always get the above error. I've tried increasing the log level to 'debug', but the logs don't provide any additional information.
Here's a summary of the steps I've tried so far:
Checked the paths to Firefox and GeckoDriver
Isolated the scraper script and ran it separately from Flask and Celery
Increased the Celery worker log level to 'debug'
I suspect that the issue might be related to the environment and how Flask, Celery, and Selenium interact on the production server. Does anyone have any ideas on how to resolve this issue?
Any help or suggestions would be greatly appreciated. Thanks!