Looping in Python Selenium for scraping content on multiple pages

30 views Asked by At

I am scraping for all of the App.No on this site: WIPO. I am scraping content through this website and I can't seem to loop through the final page I want(100), even though my code does click through to the next page. Also, the content I pull off is just repeated content of the first page. The furthest loop I've gone through is 12 pages before the error message pops up. And for some reason, even with the same code, each time it stops at different pages?

StaleElementReferenceException: Message: stale element reference: element is not attached to the page document

I understand this means that my path can no longer be found, but the website cannot be accessed through changing the url, but I looked through the paths at different pages, and they haven't changed, so I don't know how to move forward. My code looks like this. If someone can help?

class autoScraper():
def __init__(self,ep="./chromedriver",headless=False):
    options = webdriver.ChromeOptions()
    if headless: options.add_argument("--headless");
    options.add_argument("--start-maximized")
    self.driver= webdriver.Chrome(executable_path=ep,options=options);


def closeDriver(self):
    self.driver.close()
    
def next_page(self):
    # btn=self.driver.find_elements_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[2]/div/div[2]/div/a/span')
    btn=self.driver.find_elements_by_css_selector('a[title="Next Page"]')

    if len(btn)>0:
        btn[0].click()
        
def connector(self,a="https://patentscope.wipo.int/search/en/search.jsf"):
    success = False;
    try:
        self.driver.get(a)
        self.driver.find_element_by_xpath('/html/body/div[2]/div[5]/div/div[2]/form/div/div[1]/div[2]/div/div/div[1]/div[2]/button').click()
        self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[1]/div/div[1]/div[2]/div/select[1]').click() # cilck to select the num of showing IPs on a page.
        self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[1]/div/div[1]/div[2]/div/select[1]/option[4]').click()
        self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[1]/div/div[1]/div[1]/div/select[1]/option[2]').click()
        self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[1]/div/div[1]/div[2]/div/select[1]/option[4]').click()
        self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[1]/div/div[1]/div[3]/div/select[1]/option[2]').click()
        success = True     
    except Exception as e:
        print(e)
    if success:
        return success
    
def getPCT(self):
    PCT = []
    for i in range(1,201):
        no = self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[2]/div/div[1]/div/div/table/tbody/tr[%d]/td/div/div[2]/div/div[1]/span[2]/span[2]' %(i)).text
        PCT.append(no)
    
    return PCT   

def clickNextPage(self):
    self.driver.find_element_by_css_selector('a[title="Next Page"]').click()
    
if __name__ == '__main__':
   PCT=[]
   driver = autoScraper()
   if driver.connector():
       sleep(10)
       while i<100:
           i=i+1
           PCT=driver.getPCT()
           driver.clickNextPage()
       driver.next_page()
   print('The num of scraped PCTs:',len(PCT))
   try:
       os.system('mkdir ./download/')
   except:
       print('The directory is already existed.')
   finally:
       with open('./download/pct.txt','a') as f:
           for line in PCT:
               f.write(line+'\n')
       print('urls writen to ./download/pct.txt')
        

   driver.closeDriver()
0

There are 0 answers