I am trying to scrape data from Yellow Pages. I used this scraper many times, but it has recently stopped.
Got this error
'NoneType' object has no attribute 'group' 0 results found
can anyone please help me to fix this problem
Where am I going wrong on this?
import requests
import requests_random_user_agent
import urllib.parse
from bs4 import BeautifulSoup
import re
from math import ceil
import csv
import os
import sys
import subprocess
from os import system, name
import time
from tqdm import tqdm
class Scraper:
def __init__(self,keyword,location):
self.keyword=keyword
self.location=location
self.params = urllib.parse.urlencode({"search_terms": self.keyword, "geo_location_terms": self.location})
def get_info(self, link):
try:
r = requests.get(link)
html = BeautifulSoup(r.content, "html.parser")
except:
return False
try:
name = html.find('h1').text
except:
name = None
try:
phone = html.find(class_='phone').text
except:
phone = None
try:
website = html.find('a',class_='primary-btn website-link')["href"]
if len(website.split("?")) > 1:
website = website.split("?")[0]
except:
website = None
try:
email = html.find('a', class_='email-business')["href"].split(":")[1]
except:
email=None
try:
address = html.find('h2',class_='address').text
except:
address=None
return {"name": name, "email": email, "phone": phone, "address": address, "website":website}
def get_num_pages(self):
try:
url = f"https://www.yellowpages.com/search?{self.params}"
response = requests.get(url)
html = BeautifulSoup(response.content, "html.parser")
pagination = html.find(class_="pagination")
if not pagination:
pagination = 1
links = html.select("a[class='business-name']")
num_results = 0
for l in links:
try:
l["data-analytics"]
num_results += 1
except:
continue
return num_results, pagination
num_results = int(re.search('We found(.*?)results',pagination.text).group(1))
return num_results, int(ceil(int(num_results) / 30))
except Exception as e:
print(e)
return False, False
def get_links(self, page):
try:
url = f"https://www.yellowpages.com/search?{self.params}&page={page}"
response = requests.request("GET", url, timeout=10)
html = BeautifulSoup(response.content, "html.parser")
links = html.select("a[class='business-name']")
links_filtered=[]
for l in links:
try:
l["data-analytics"]
links_filtered.append(l)
except:
continue
links_list = []
for link in links_filtered:
links_list.append(f"https://www.yellowpages.com{link['href']}")
return links_list
except Exception as e:
print(e)
return []
def open_file(filename):
try:
if sys.platform == "win32":
os.startfile(filename)
else:
opener = "open" if sys.platform == "darwin" else "xdg-open"
subprocess.call([opener, filename])
except:
return False
def create_csv(elements):
row_list = [["Name", "Address", "Phone", "Email", "Website"]]
for e in elements:
name = e["name"]
address = e["address"]
phone = e["phone"]
email = e["email"]
website = e["website"]
row_list.append([name, address, phone, email, website])
with open('output.csv', 'w', newline='', encoding='utf8') as file:
writer = csv.writer(file)
writer.writerows(row_list)
def clear():
# for windows
if name == 'nt':
_ = system('cls')
# for mac and linux(here, os.name is 'posix')
else:
_ = system('clear')
def main():
clear()
try:
while True:
keyword = input("Keyword: ")
if keyword != "":
break
while True:
city = input("City: ")
if city != "":
break
clear()
scraper = Scraper(keyword, city)
results, num_pages = scraper.get_num_pages()
if not results:
print("0 results found")
return False
print(f"{results} results found {keyword} - {city}")
data = []
pages = tqdm(range(1, num_pages + 1))
for page in pages:
clear()
try:
pages.set_description(f"Scraping page {page}/{num_pages}...")
links = scraper.get_links(page)
if not (len(links) > 0):
continue
links = tqdm(links)
for link in links:
try:
links.set_description(f"Scraping {link}")
info = scraper.get_info(link)
# print(info)
data.append(info)
create_csv(data)
except:
continue
except:
continue
print("Opening file...")
open_file("output.csv")
print("Task finished")
except:
return False
if __name__ == "__main__":
main()
It fails on the line
A very simple check of the search results page, by opening the browser, would have shown you that the text "We found x results" is not present on the page. So
re.search
returnsNone
, even if there are many results.Adjust your script to work without
num_pages
and only paginate via the page links at the bottom or by incrementing thepage=
parameter in the URL until no more results/pages are listed.FYI, next time, put in some minimal debugging effort and not post your entire script.