How do I make it so all arrays are the same length?


I am trying to build a Python web scraper to look for contact info on websites and then save that info to Excel. It won't really work, though, and I have tried multiple options. Here is the code:

import requests
from bs4 import BeautifulSoup
import pandas as pd
from openpyxl import load_workbook
import re

def scrape_contact_data(url):
    # Function definition for scraping contact data
    print("Scraping contact data from:", url)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all text content within the webpage
    all_text = soup.get_text(separator="\n", strip=True)
    
    # Initialize lists to store data
    company_names = []
    street_numbers = []
    postal_codes = []
    countries = []
    email_addresses = []
    phone_numbers = []
    fax_numbers = []
    websites = []
    
    # Search for patterns indicative of contact information
    # Email addresses pattern
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    email_addresses.extend(re.findall(email_pattern, all_text))
    print("Number of email addresses found:", len(email_addresses))
    
    # Phone numbers pattern
    phone_pattern = r'(?:(?:\+|0{0,2})\d{1,4}[\s.-]?)?(?:\(\d{1,5}\)[\s.-]?)?\d{3,5}[\s.-]?\d{3,4}[\s.-]?\d{3,4}'
    phone_numbers.extend(re.findall(phone_pattern, all_text))
    print("Number of phone numbers found:", len(phone_numbers))
    
    # Fax numbers pattern
    fax_pattern = r'Fax:\s*([\d\s/-]+)'
    fax_numbers.extend(re.findall(fax_pattern, all_text))
    print("Number of fax numbers found:", len(fax_numbers))
    
    # Address pattern (assuming typical address format)
    address_pattern = r'\b\d{1,5}\s+[\w\s]+\b'
    addresses = re.findall(address_pattern, all_text)
    for address in addresses:
        # Split address into street number and name
        street_number, street_name = address.split(maxsplit=1)
        street_numbers.append(street_number)
        # Extract postal code and country from the last part of the address
        postal_code_country_pattern = r'(\d{4,})\s+([\w\s]+)$'
        match = re.search(postal_code_country_pattern, street_name)
        if match:
            postal_codes.append(match.group(1))
            countries.append(match.group(2))
        else:
            postal_codes.append('-')
            countries.append('-')
    print("Number of addresses found:", len(street_numbers))
    print("Number of postal codes found:", len(postal_codes))
    print("Number of countries found:", len(countries))
    
    # Assuming company name is not readily available in the webpage content
    company_names.extend(['-'] * len(email_addresses))
    
    # Assuming website URLs are not readily available in the webpage content
    websites.extend(['-'] * len(email_addresses))
    
    # Create a DataFrame to store the data
    data = {
        'Company Name': company_names,
        'Street & House number': street_numbers,
        'Postal Code': postal_codes,
        'Country': countries,
        'Email Address': email_addresses,
        'Phone Number': phone_numbers,
        'Fax Number': fax_numbers,
        'Website': websites
    }
    df = pd.DataFrame(data)
    
    return df

def extract_email_from_subpage(subpage_url):
    try:
        response = requests.get(subpage_url)
        response.raise_for_status()  
        soup = BeautifulSoup(response.text, 'html.parser')

        contact_block = soup.find('div', class_='cb-orte-item-adresse')

        if contact_block:
            email_element = contact_block.find('span', class_='data-emailencrypted').find_next('a', class_='value')
            email = email_element.get('href').split(':')[-1] if email_element else "Not found"
            return email
        else:
            print("Contact block not found on page:", subpage_url)
            return "-"
    except Exception as e:
        print("Error fetching page:", e)
        return "-"

def save_to_excel(df, filename='contact_data.xlsx'):
    # Function definition for saving DataFrame to Excel
    
    try:
        # Load existing Excel file
        wb = load_workbook(filename)
        # Select the active worksheet
        ws = wb.active
        # Find the next empty row in the worksheet
        next_row = ws.max_row + 1
        
        # Write the DataFrame to the Excel file starting from the next empty row
        for index, row in df.iterrows():
            for col_idx, value in enumerate(row, start=1):
                ws.cell(row=next_row+index, column=col_idx, value=value)
        
        # Save the updated workbook
        wb.save(filename)
        print("Data appended to", filename)
    
    except FileNotFoundError:
        # If the file doesn't exist, create a new Excel file
        df.to_excel(filename, index=False)
        print("New file created:", filename)

if __name__ == "__main__":
    urls = []
    while True:
        url = input("Enter a URL (or type 'done' to finish): ")
        if url.lower() == 'done':
            break
        urls.append(url)
    
    if not urls:
        print("No URLs provided. Exiting.")
    else:
        # Initialize an empty DataFrame to store all contact data
        all_contact_data = pd.DataFrame()
        
        # Iterate through each URL, scrape contact data, and concatenate to the DataFrame
        for url in urls:
            print("Scraping data from:", url)
            try:
                contact_df = scrape_contact_data(url)
                all_contact_data = pd.concat([all_contact_data, contact_df], ignore_index=True)
            except Exception as e:
                print(f"Error occurred while scraping data from {url}: {e}")
        
        # Specify the location where you want to save the Excel file
        save_location = "C:\\Users\\-----\\Desktop\\contact_data.xlsx"  # Replace with your desired location
        
        # Save all contact data to the specified location
        save_to_excel(all_contact_data, filename=save_location)
       

This is the output:

Number of email addresses found: 0
Number of phone numbers found: 0
Number of fax numbers found: 0
Number of addresses found: 5
Number of postal codes found: 5
Number of countries found: 5
Error occurred while scraping data from "censored for obv reasons": All arrays must be of the same length
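That last error comes straight from pandas: a DataFrame built from a dict requires every column list to be the same length. A minimal standalone repro (the values are dummies, not from the scraper itself):

import pandas as pd

# Lists of different lengths, like the 0 emails vs. 5 postal codes above
pd.DataFrame({
    'Email Address': [],
    'Postal Code': ['10115', '80331', '50667', '20095', '60311'],
})
# Raises: ValueError: All arrays must be of the same length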

I wanted it to spit out the info from a contact block like:

Name, Address, Zip Code and Town, Phone, Fax, Email

1 Answer

Answered by Ammar:

Get each element containing a person's contact details into a list.

    # driver_table contains details about all the drivers
    driver_table = soup.find('table', attrs={'class': 'standing-table__table'})
    table_body = driver_table.find('tbody')
    # Get all the rows of driver details. Each row contains one driver's details
    rows = table_body.find_all("tr", attrs={'class': 'standing-table__row'})

Iterate over the list and extract all the contact details of a person (name, email, etc.) one at a time. Use a try/except statement to extract each detail; if any detail is missing, append "" (an empty string) to its list.

    try:
        name.append(cells[1].text.strip())  # cells[1] holds the name (cells[0] is the position)
    except IndexError:
        # Append "" so the list lengths stay the same in case of missing data
        name.append("")

With this, if any information is missing, an empty string is appended, so all the contact-detail lists stay the same length. Example code:

from bs4 import BeautifulSoup
import requests
import pandas as pd

r = requests.get("https://www.skysports.com/f1/standings")
soup = BeautifulSoup(r.content, 'html.parser')
name = []
team = []
country = []
driver_table = soup.find('table', attrs={'class': 'standing-table__table'})
table_body = driver_table.find('tbody')
# Get all the rows of driver details
rows = table_body.find_all("tr", attrs={'class': 'standing-table__row'})

for row in rows:
    # Get all the cells with driver details (they share the same class)
    cells = row.find_all("td", attrs={'class': 'standing-table__cell'})
    try:
        name.append(cells[1].text.strip())  # cells[1] holds the name
    except IndexError:
        # Append "" so the list lengths stay equal in case of missing data
        name.append("")
    try:
        country.append(cells[2].text.strip())  # cells[2] holds the country
    except IndexError:
        country.append("")
    try:
        team.append(cells[3].text.strip())  # cells[3] holds the team
    except IndexError:
        team.append("")

data = {
    'Name': name, 'Country': country, 'Team': team
}
df = pd.DataFrame(data)
print(df)
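An alternative that stays closer to the question's structure: keep the independent re.findall lists and pad them all to one length before building the DataFrame. A minimal sketch (the pad_lists helper and the '-' filler are illustrative assumptions, not part of the answer above):

import pandas as pd

def pad_lists(columns, fill='-'):
    # Illustrative helper: pad each list to the length of the longest one
    # so pd.DataFrame accepts the dict without raising ValueError.
    longest = max((len(v) for v in columns.values()), default=0)
    return {k: v + [fill] * (longest - len(v)) for k, v in columns.items()}

# Unequal lists, like the scraper's findall results (dummy values)
data = pad_lists({
    'Email Address': [],
    'Phone Number': ['030 1234567'],
    'Postal Code': ['10115', '80331', '50667'],
})
df = pd.DataFrame(data)
print(df)  # every column now has 3 rows; missing values show as '-'

Padding makes the DataFrame constructible, but unlike the row-wise try/except approach above, it does not guarantee that the values in one row actually belong to the same contact.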