Web scraping doesn't work, even though no error is raised

36 views Asked by At

I want to write Python web-scraping code to download this data (https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page). Here is the code:

import os
import random
import time
from urllib.parse import urljoin

import pyarrow.csv as pacsv
import pyarrow.parquet as pq
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# URL of the page that contains the links to the datasets.  The Parquet files
# are hosted under https://d37ci6vzurychx.cloudfront.net/trip-data/, but the
# year sections and their links live on the TLC page, so that is the page
# that has to be fetched and parsed.
base_url = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
response = requests.get(base_url, timeout=30)
# Fail loudly instead of silently parsing an error page that has no links.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Directory where the downloaded files are saved.
download_directory = "C:/Users/flosr/Engineering/Blent.ai Project/datas"

# Download a file using a random User-Agent header and a random pause.
def download_file(url, file_path):
    """Download ``url`` and save the response body to ``file_path``.

    A random pause of 1 to 3 seconds precedes the request, and a randomly
    chosen User-Agent header is sent with it.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    user_agent = UserAgent().random
    headers = {"User-Agent": user_agent}
    time.sleep(random.uniform(1, 3))  # random pause between 1 and 3 seconds
    # Stream the body so a large file is never held in memory all at once.
    with requests.get(url, headers=headers, stream=True, timeout=60) as response:
        # Do not save an HTTP error page under the dataset's file name.
        response.raise_for_status()
        # Binary mode is required: the payload is bytes, and writing bytes to
        # a file opened in text mode ("w") raises TypeError.
        with open(file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1 << 16):
                f.write(chunk)

# Walk through every section that holds the links for one year.
for section in soup.find_all("div", class_="faq-answers"):
    heading = section.find_previous_sibling("div", class_="faq-questions")
    if heading is None:
        # Without a heading there is no year to name the sub-directory after.
        continue
    year = heading.text.strip()
    print(f"Downloading datasets for year {year}...")

    # Create one sub-directory per year.
    year_directory = os.path.join(download_directory, year)
    os.makedirs(year_directory, exist_ok=True)

    # Download the files for every month of the year.
    for link in section.find_all("a"):
        # An href may be missing or padded with stray whitespace; normalise it
        # and ignore anything that is not a Parquet file.
        href = (link.get("href") or "").strip()
        if not href.endswith(".parquet"):
            continue

        file_url = urljoin(base_url, href)
        filename = os.path.basename(file_url)
        file_path = os.path.join(year_directory, filename)

        # Download the file.
        print(f"Downloading {filename}...")
        download_file(file_url, file_path)

        # Convert the Parquet file to CSV.  pq.write_table() would only write
        # Parquet data into a file named *.csv, so use the CSV writer instead.
        csv_path = os.path.splitext(file_path)[0] + ".csv"
        pacsv.write_csv(pq.read_table(file_path), csv_path)

print("Download and conversion complete.")

Here is the output:

PS C:\Users\flosr\Engineering\Blent.ai Project\datas\WebScraping Code>  & 'c:\Users\flosr\Engineering\Blent.ai Project\datas\WebScraping Code\env\Scripts\python.exe' 'c:\Users\flosr\.vscode\extensions\ms-python.debugpy-2024.2.0-win32-x64\bundled\libs\debugpy\adapter/../..\debugpy\launcher' '63645' '--' 'C:\Users\flosr\Engineering\Blent.ai Project\datas\WebScraping Code\env\main.py' 
Download and conversion complete.

However, nothing appears in that directory. No error is shown, but it still doesn't work, and for some reason it never finishes installing the dependencies below.

I can't try anything if no error appears to tell me what the problem is.

1

There is 1 answer

0
Andrej Kesely On

It seems that some URLs have a trailing whitespace character that needs to be stripped:

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Fixed browser-like User-Agent string; save_url() sends it with each request.
_USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0"
headers = {"User-Agent": _USER_AGENT}


def save_url(url, path):
    """Download ``url`` to ``path`` in binary mode with a tqdm progress bar.

    The progress-bar total is taken from the Content-Length response header
    and falls back to 0 when the header is absent.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # The context manager closes the streamed response even when the
    # download fails part-way through.
    with requests.get(url, headers=headers, stream=True, timeout=60) as response:
        # Do not save an HTTP error page under the dataset's file name.
        response.raise_for_status()
        total_size = int(response.headers.get("content-length", 0))

        with tqdm(total=total_size, unit="B", unit_scale=True) as progress_bar, open(
            path, "wb"
        ) as file:
            for data in response.iter_content(1024):
                progress_bar.update(len(data))
                file.write(data)


# Page whose table links point at the files to download.
url = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")

for anchor in soup.select("table a"):
    # Labels come from the nearest preceding <strong> tag and the nearest
    # preceding element that carries a data-answer attribute.
    month = anchor.find_previous("strong").get_text(strip=True)
    year = anchor.find_previous(attrs={"data-answer": True}).get_text(strip=True)

    # Important part: strip the href, since some links have trailing whitespace.
    file_url = anchor["href"].strip()

    # The local file name is "<year>_<month>_<last URL path segment>".
    remote_name = file_url.rsplit("/", 1)[-1]
    path = f"{year}_{month}_{remote_name}"
    print(year, month, file_url, f"Saving to {path}...")
    save_url(file_url, path)
    print("\n", "-" * 80)

Prints:

...

100%|█████████████| 50.0M/50.0M [00:01<00:00, 37.1MB/s]
                                                                                                                                                                                                                   
 --------------------------------------------------------------------------------                                                                                                                                  
2024 January https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet Saving to 2024_January_green_tripdata_2024-01.parquet...
100%|█████████████| 1.36M/1.36M [00:00<00:00, 5.32MB/s]
                                                                                                                                                                                                                   
 --------------------------------------------------------------------------------                                                                                                                                  
2024 January https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2024-01.parquet Saving to 2024_January_fhv_tripdata_2024-01.parquet...
100%|█████████████| 15.0M/15.0M [00:00<00:00, 42.1MB/s]
                                                                                                                                                                                                                   
 --------------------------------------------------------------------------------                                                                                                                                  
2024 January https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-01.parquet Saving to 2024_January_fhvhv_tripdata_2024-01.parquet...
100%|█████████████| 473M/473M [00:15<00:00, 31.0MB/s]
                                                                                                                                                                                                                   
 --------------------------------------------------------------------------------                                                                                                                                  
2023 January https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet Saving to 2023_January_yellow_tripdata_2023-01.parquet...
100%|█████████████| 47.7M/47.7M [00:01<00:00, 42.4MB/s]
                                                                                                                                                                                                                   
 --------------------------------------------------------------------------------                                                                                                                                  
2023 January https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet Saving to 2023_January_green_tripdata_2023-01.parquet...
100%|█████████████| 1.43M/1.43M [00:00<00:00, 46.7MB/s]
                                                                                                                                                                                                                   
 --------------------------------------------------------------------------------                                                                                                                                  

...