I want to scrape the text from a website and bucket it according to my needs. I want to do this in Python using Google AI/ML services.
I have tried from scratch:
import requests
from bs4 import BeautifulSoup
def scrape_website(url, timeout=10):
    """Fetch *url* and return its parsed HTML, or None on failure.

    Args:
        url: The page to download.
        timeout: Seconds to wait for the server before giving up
            (new, backward-compatible parameter).

    Returns:
        A ``BeautifulSoup`` tree on HTTP 200, otherwise ``None``.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures (bad DNS, refused connection, timeout)
        # previously escaped as uncaught exceptions and crashed the script.
        print(f"Error: Unable to fetch the URL. {exc}")
        return None
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    # Non-200 response: keep the original diagnostic message.
    print(f"Error: Unable to fetch the URL. Status code: {response.status_code}")
    return None
def extract_information(soup, query):
    """Answer a simple text *query* against a parsed page.

    Args:
        soup: A parsed tree as returned by ``scrape_website`` (anything
            exposing a ``.title`` attribute with ``.text``).
        query: The user's question, e.g. "Project Name".

    Returns:
        A human-readable answer string, or "Query not supported." for
        queries this function does not understand.
    """
    # Strip surrounding whitespace so " project name " still matches.
    if query.strip().lower() == "project name":
        # Guard against pages with no <title>: soup.title is None there,
        # and the original code raised AttributeError on .text.
        if soup.title is None:
            return "Project Name: (no title found)"
        project_name = soup.title.text.strip()
        return f"Project Name: {project_name}"
    return "Query not supported."
if __name__ == "__main__":
url = input("Enter the URL: ")
# Scrape website content
webpage_content = scrape_website(url)
if webpage_content:
while True:
query = input("Enter your question (e.g., 'Project Name', 'Status'): ")
if query.lower() == "exit":
break
result = extract_information(webpage_content, query)
print(result)
The code above gives me the output shown below, but it's not up to my expectations:
Enter the URL: https://h2v.eu/hydrogen-valleys/crystal-brook-hydrogen-superhub
Enter your question (e.g., 'Project Name', 'Status'): project name
Project Name: Hydrogen valleys | Crystal Brook Hydrogen Superhub
Enter your question (e.g., 'Project Name', 'Status'): status
Query not supported.
I have also tried:
import tkinter as tk
from tkinter import ttk
from bs4 import BeautifulSoup
from google.cloud import language_v1
import requests
def scrape_and_analyze(url):
    """Scrape *url*, run Google Cloud entity analysis, and return entity names.

    Args:
        url: The page to scrape and analyze.

    Returns:
        A list of entity-name strings (PERSON entities only).  On failure
        the list holds a single error message, so callers can always
        safely ``"\\n".join(...)`` the result.  (The original returned a
        bare string on error, which ``on_submit`` then joined
        character-by-character.)
    """
    # Web scraping
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # surface HTTP 4xx/5xx as errors too
        soup = BeautifulSoup(response.text, 'html.parser')
        text_content = soup.get_text()
    except Exception as e:
        return [f"Error in web scraping: {str(e)}"]

    # Google Cloud Natural Language API
    try:
        client = language_v1.LanguageServiceClient()
        document = language_v1.Document(
            content=text_content,
            type_=language_v1.Document.Type.PLAIN_TEXT,
        )
        annotations = client.analyze_entities(document=document)
        entities = annotations.entities
    except Exception as e:
        return [f"Error in text analysis: {str(e)}"]

    # Filter entities of interest (customize this based on your needs)
    return [
        entity.name
        for entity in entities
        if entity.type_ == language_v1.Entity.Type.PERSON
    ]
def on_submit():
    """Handle the Submit button: analyze the entered URL and display the result."""
    target = url_entry.get()
    analysis = scrape_and_analyze(target)
    # Clear any previous output, then show one result item per line.
    result_text.delete(1.0, tk.END)
    result_text.insert(tk.END, "\n".join(analysis))
# ---- UI construction ----
root = tk.Tk()
root.title("Web Scraping and Text Analysis")

# Prompt label and the URL input field (url_entry is read by on_submit).
ttk.Label(root, text="Enter URL:").pack(pady=10)
url_entry = ttk.Entry(root, width=50)
url_entry.pack(pady=10)

# Button that kicks off scraping + analysis.
submit_button = ttk.Button(root, text="Submit", command=on_submit)
submit_button.pack(pady=10)

# Word-wrapped text area where on_submit writes the analysis output.
result_text = tk.Text(root, height=10, width=50, wrap="word")
result_text.pack(pady=10)

# Hand control to Tk's event loop.
root.mainloop()
This is also giving an error.
Web scraping involves extracting data from websites, and it can be a useful tool. Always check a website's robots.txt file and terms of service before scraping. Here's a simple example using Python with the BeautifulSoup library for HTML parsing and requests for making HTTP requests. Make sure to install these libraries first. Then you can use the example above as a starting point for web scraping. Remember: always respect robots.txt.