I'm building a web scraper with Flask, Celery, and Selenium. The scraper runs without any issues on my local machine, but when I try to run it on the production server behind NGINX, I get the following error:

selenium.common.exceptions.WebDriverException: Message: Process unexpectedly closed with status 1

Scraper:

from selenium.webdriver.firefox.service import Service
import sys, os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
from dotenv import load_dotenv
from check_email import EmailedReferrals, Referrals
from sqlalchemy import create_engine, Column, Integer, String, Date, Text
from sqlalchemy.orm import declarative_base 
from sqlalchemy.orm import sessionmaker
from datetime import datetime
from check_email import EmailedReferrals, Referrals
import logging

logger = logging.getLogger('celery')
logger.setLevel(logging.DEBUG)


load_dotenv()

env_conf = os.getenv("CONFIG")
Base = declarative_base()

if env_conf == "dev":    
    driver = webdriver.Firefox()
    DATABASE_URL = os.getenv('DEV_DATABASE_URL')    
else: 
    # Set up Firefox options
    options = Options()
    # Set window size
    options.add_argument("--headless=new") 
    options.add_argument("--start-maximized")
    options.add_argument("--window-size=1920, 1080")
    #options.add_argument("user-agent= Chrome/111.0.5563.64") # need to update firefox here
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-gpu")
    options.add_argument('--disable-dev-shm-usage')    
    
    
    # Include the path to the Firefox binary
    firefox_binary_path = '/usr/bin/firefox'
    geckodriver_path = '/home/fusion/bin/geckodriver'
    options.binary_location = firefox_binary_path
    service = Service(executable_path=geckodriver_path)
    driver = webdriver.Firefox(options=options)
    DATABASE_URL = os.getenv('PROD_DATABASE_URL')
engine = create_engine(DATABASE_URL)

Base.metadata.create_all(engine)

Session = sessionmaker(bind=engine)
session = Session()

def get_row_xpath(row, cell): #pulls xpath of rows and cells from Incoming Referrals View
    ctl = str(row).zfill(2)
    action = row - 3
    id_cell = f"""/html/body/form/table/tbody/tr[4]/td[2]/table/tbody/tr/td/table/tbody/tr[2]/td/div/table/tbody\
        /tr/td[3]/div/table/tbody/tr[2]/td/div/table/tbody/tr[{row}]/td[{cell}]"""
    select_element = f"""//*[@id="ucViewGrid_dgView_ctl{ctl}_Actions_{action}_0_0_ActionItems"]"""
    go_button = f"""#ucViewGrid_dgView_ctl{ctl}_Actions_{action}_0_0_ActionButton > input:nth-child(1)"""
    return id_cell, select_element, go_button


def main(referral_id_from_flask, comment):
    print('hello from main')
    logger.info("hello from main")
    driver.get("https://example.com")
    #Click Account and sign in
    driver.find_element(By.XPATH, '//*[@id="UserNameTextBox"]').send_keys(os.getenv('CAREPORT_USERNAME'))
    driver.find_element(By.XPATH, '//*[@id="PasswordTextBox"]').send_keys(os.getenv('CAREPORT_PASSWORD'))
    driver.find_element(By.XPATH, '//*[@id="btnLoginHack"]').click()
    sleep(2)
    driver.find_element(By.XPATH, '//*[@id="ctl13"]').click()
    #loop through table
    cell = 3
    for row in range(3, 52):
        logger.info(f"Looping: {row}")
        print(f"Loop #: {row}")
        id_cell = driver.find_element(By.XPATH, get_row_xpath(row, cell)[0]).text
        referral_id = id_cell.split('\n')[0].strip()
        #find row with the correct
        if referral_id == referral_id_from_flask:
            #Select View Online Referral
            select_element = driver.find_element(By.XPATH, get_row_xpath(row, cell)[1])
            select = Select(select_element)
            select.select_by_visible_text("View Online Referral")
            #click go
            
            driver.find_element(By.CSS_SELECTOR, get_row_xpath(row, cell)[2]).click()
            
            #Find select box            
            select_element = driver.find_element(By.XPATH, """//*[@id="dgProviders_ctl02_ddResponse"]""" )
            select = Select(select_element)
            select.select_by_value("1")
            sleep(0.5)
            #comment box
            # wait up to 10 seconds for the element to be present and clickable
            comment_box = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="dgProviders_ctl02_txtProviderComments"]'))
            )
            comment_box.clear()
            sleep(0.5) # Add a small delay here
            comment_box.send_keys(comment)
            sleep(0.5) # Add a small delay here
            #click send response
            #driver.find_element(By.XPATH, """//*[@id="ButtonBarSendResponse"]""").click()
            #add to query
            r = session.query(Referrals).filter(Referrals.referral_id == int(referral_id)).first()
            r.referral_acceptence_status = "accepted"
            session.commit()
            session.close()
            print('done')
            break

if __name__ == "__main__":
    
    referral_id_from_flask = sys.argv[1]
    comment = sys.argv[2]
    print('hello from program')
    main(referral_id_from_flask, comment)

I've already checked the paths to Firefox and GeckoDriver, and I'm sure that they're correct. Additionally, I've tried isolating the scraper script and running it separately from Flask and Celery, and it works without any issues.

However, when I run the scraper with Celery on the production server, I always get the above error. I've tried increasing the log level to 'debug', but the logs don't provide any additional information.

Here's a summary of the steps I've tried so far:

Checked the paths to Firefox and GeckoDriver
Isolated the scraper script and ran it separately from Flask and Celery
Increased the Celery worker log level to 'debug'

I suspect that the issue might be related to the environment and how Flask, Celery, and Selenium interact on the production server. Does anyone have any ideas on how to resolve this issue?

Any help or suggestions would be greatly appreciated. Thanks!

0

There are 0 answers