How can I scrape data from any website like an AI using Python?


I want to scrape the text from a website and bucket it according to my needs. I want to do this with Python using Google AI/ML services.

I have tried this from scratch:

import requests
from bs4 import BeautifulSoup

def scrape_website(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    else:
        print(f"Error: Unable to fetch the URL. Status code: {response.status_code}")
        return None

def extract_information(soup, query):
    # Your HTML parsing logic here to extract information based on the query.
    # For demonstration, extract the title of the page.
    if query.lower() == "project name":
        project_name = soup.title.text.strip()
        return f"Project Name: {project_name}"
    else:
        return "Query not supported."

if __name__ == "__main__":
    url = input("Enter the URL: ")

    # Scrape website content
    webpage_content = scrape_website(url)

    if webpage_content:
        while True:
            query = input("Enter your question (e.g., 'Project Name', 'Status'): ")
            if query.lower() == "exit":
                break
            result = extract_information(webpage_content, query)
            print(result)

The code above gives me the output below, but it is not up to my expectations:

Enter the URL: https://h2v.eu/hydrogen-valleys/crystal-brook-hydrogen-superhub

Enter your question (e.g., 'Project Name', 'Status'): project name

Project Name: Hydrogen valleys | Crystal Brook Hydrogen Superhub

Enter your question (e.g., 'Project Name', 'Status'): status

Query not supported.
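For fields beyond the title, the parsing logic has to know where each value sits in the HTML. Here is a minimal sketch of one way to generalize extract_information; it assumes, hypothetically, that the page renders its fields as label/value pairs in <dt>/<dd> definition lists or <th>/<td> table rows, so the selectors would need adjusting to the real markup:

def extract_field(soup, label):
    # Try definition lists: <dt>Status</dt><dd>In progress</dd>
    for dt in soup.find_all('dt'):
        if dt.get_text(strip=True).lower() == label.lower():
            dd = dt.find_next_sibling('dd')
            if dd:
                return dd.get_text(strip=True)
    # Try table rows: <tr><th>Status</th><td>In progress</td></tr>
    for th in soup.find_all('th'):
        if th.get_text(strip=True).lower() == label.lower():
            td = th.find_next_sibling('td')
            if td:
                return td.get_text(strip=True)
    return None

With something like this, extract_information could fall back to extract_field(soup, query) for any label instead of hard-coding each supported query.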

I have also tried:

import tkinter as tk
from tkinter import ttk
import requests
from bs4 import BeautifulSoup
from google.cloud import language_v1

def scrape_and_analyze(url):
    # Web scraping
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        text_content = soup.get_text()
    except Exception as e:
        return f"Error in web scraping: {str(e)}"

    # Google Cloud Natural Language API
    try:
        client = language_v1.LanguageServiceClient()
        document = language_v1.Document(content=text_content, type_=language_v1.Document.Type.PLAIN_TEXT)
        annotations = client.analyze_entities(document=document)
        entities = annotations.entities
    except Exception as e:
        return f"Error in text analysis: {str(e)}"

    # Filter entities of interest (customize this based on your needs)
    filtered_entities = [entity.name for entity in entities if entity.type_ == language_v1.Entity.Type.PERSON]

    return filtered_entities

def on_submit():
    url = url_entry.get()
    result = scrape_and_analyze(url)
    result_text.delete("1.0", tk.END)
    # scrape_and_analyze returns an error string on failure and a list on
    # success; joining a string would print it one character per line.
    if isinstance(result, str):
        result_text.insert(tk.END, result)
    else:
        result_text.insert(tk.END, "\n".join(result))

# UI Setup
root = tk.Tk()
root.title("Web Scraping and Text Analysis")

# URL Entry
url_label = ttk.Label(root, text="Enter URL:")
url_label.pack(pady=10)
url_entry = ttk.Entry(root, width=50)
url_entry.pack(pady=10)

# Submit Button
submit_button = ttk.Button(root, text="Submit", command=on_submit)
submit_button.pack(pady=10)

# Result Text
result_text = tk.Text(root, height=10, width=50, wrap="word")
result_text.pack(pady=10)

root.mainloop()

This is also giving an error.
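One thing worth checking, assuming the error comes from the Google Cloud client rather than the scraping: language_v1.LanguageServiceClient() needs Application Default Credentials, and typically raises DefaultCredentialsError when none are found. A minimal sketch (the key path is a placeholder):

import os
from google.cloud import language_v1

# Placeholder path to a service-account key file; point this at your own key.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service-account.json"

client = language_v1.LanguageServiceClient()  # fails without valid credentials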

1 Answer

Ritesh answered:

Web scraping involves extracting data from websites, and it can be a useful tool. Always check a website's robots.txt file and terms of service before scraping.
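One lightweight way to do that robots.txt check is with the standard library's urllib.robotparser; a minimal sketch (the user agent string is a placeholder):

from urllib import robotparser
from urllib.parse import urlparse

def allowed_to_fetch(url, user_agent="my-scraper"):
    # Build the site's robots.txt URL and ask whether this URL may be fetched.
    parts = urlparse(url)
    rp = robotparser.RobotFileParser()
    rp.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
    rp.read()  # fetches and parses robots.txt
    return rp.can_fetch(user_agent, url)

print(allowed_to_fetch("https://example.com/some/page"))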

Here's a simple example using Python with the BeautifulSoup library for HTML parsing and requests for making HTTP requests. Make sure to install these libraries first:

pip install beautifulsoup4
pip install requests

Now, you can use the following example as a starting point for web scraping:

import requests
from bs4 import BeautifulSoup

def scrape_website(url):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Example: Extract all the links on the page
        links = soup.find_all('a')
        for link in links:
            print(link.get('href'))

    else:
        print(f"Error: Unable to retrieve the page. Status code: {response.status_code}")

# Example usage
url_to_scrape = 'https://example.com'
scrape_website(url_to_scrape)

Remember:

  • Respect terms of service and robots.txt.
  • Web scraping should be done responsibly and ethically. Avoid putting too much load on a server, and consider incorporating delays in your code (see the sketch after this list).
  • Some websites may have measures in place to prevent or limit scraping. Respect these measures.
  • Websites can change their structure, so your scraping code might need updates if the website's layout changes.
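To make the delay advice concrete, here is a minimal sketch of polite scraping across several URLs; the user agent string, the one-second pause, and the URLs are placeholders rather than prescribed values:

import time
import requests

# Placeholder identification; many sites appreciate a contact address.
HEADERS = {"User-Agent": "my-scraper/0.1 (contact: you@example.com)"}

urls = [
    "https://example.com/page1",
    "https://example.com/page2",
]

for url in urls:
    response = requests.get(url, headers=HEADERS, timeout=10)
    print(url, response.status_code)
    time.sleep(1)  # pause between requests to avoid hammering the server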