import os
import google.generativeai as genai
import requests
import streamlit as st
from bs4 import BeautifulSoup
import pandas as pd
import asyncio
import aiohttp
from fpdf import FPDF
# Set the page configuration as the first command in the app
st.set_page_config(page_title="AI Web Scraper", page_icon=":robot_face:", layout="wide")
# Configure the Gemini client (google.generativeai) using the AI_API_KEY environment variable
genai.configure(api_key=os.environ["AI_API_KEY"])
BASE_URL = "https://transcripts.sl.nsw.gov.au"
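# All scraped links are relative paths on the State Library of NSW transcripts site,
# so they are joined onto BASE_URL before being fetched.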
async def fetch_html(session, url):
"""Fetch and parse the HTML content from a given URL asynchronously."""
try:
headers = {'User-Agent': 'Mozilla/5.0'}
async with session.get(url, headers=headers) as response:
response.raise_for_status()
return await response.text()
except Exception as e:
print(f"Error fetching {url}: {e}")
return None
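# Note: the <link rel="shortlink"> tag is how Drupal-based sites typically expose the
# canonical /node/<id> path; that href is used as the node identifier throughout.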
async def parse_shortlink(soup):
"""Extract the shortlink (node ID) from the parsed HTML."""
shortlink_tag = soup.find("link", rel="shortlink")
shortlink = shortlink_tag["href"].strip() if shortlink_tag and "href" in shortlink_tag.attrs else None
print(f"Extracted Shortlink: {shortlink}")
return shortlink
async def extract_page_data(session, soup, limit=10):
"""Extract title, URL, and node ID for the given page content."""
data = []
# Extract page title
title_meta = soup.find("meta", attrs={"name": "dcterms.title"})
page_title = title_meta["content"].strip() if title_meta else "Unknown Title"
# Extract links and node IDs
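    # A limit of None slices as [:None], i.e. every views-row is processed; an integer caps the count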
for row in soup.find_all("div", class_="views-row")[:limit]:
a_tag = row.find("a")
if a_tag:
url = BASE_URL + a_tag.get("href", "").strip()
            node_html = await fetch_html(session, url)
            if node_html:
                node_soup = BeautifulSoup(node_html, "html.parser")
                node_id = await parse_shortlink(node_soup)
data.append([page_title, url, node_id])
return pd.DataFrame(data, columns=["Page Title", "URL", "NodeId"])
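# A shared `visited` set guards the recursion below so each URL is fetched at most once,
# even when multiple pages link to it.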
async def traverse_links(session, url, visited=None, limit=None):
    """Recursively traverse series, document, and pages URLs, extracting shortlinks asynchronously."""
    if visited is None:
        visited = set()
if url in visited:
print(f"Skipping already visited URL: {url}")
return pd.DataFrame()
visited.add(url)
print(f"Traversing URL: {url}")
    html = await fetch_html(session, url)
    if not html:
        return pd.DataFrame()
    soup = BeautifulSoup(html, "html.parser")
page_data = await extract_page_data(session, soup, limit)
recursive_data = pd.DataFrame(columns=["Page Title", "URL", "NodeId"])
# Recursively traverse deeper levels
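    # Only in-site links under /series/, /document/ or /pages/ are followed; every branch
    # is scheduled as a task and awaited concurrently with asyncio.gather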
tasks = []
for link in soup.find_all("a", href=True):
href = link["href"].strip()
if href.startswith(("/series/", "/document/", "/pages/")):
full_url = BASE_URL + href
tasks.append(traverse_links(session, full_url, visited, limit))
results = await asyncio.gather(*tasks)
for result in results:
recursive_data = pd.concat([recursive_data, result], ignore_index=True)
return pd.concat([page_data, recursive_data], ignore_index=True)
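# Every URL in the table is fetched concurrently; the results are de-duplicated by URL afterwards.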
async def process_url_field(session, table_df, field="document"):
    """Process the 'document' or 'page' URLs from the provided table and fetch their node IDs asynchronously."""
data = []
tasks = []
for url in table_df["URL"]:
tasks.append(process_url_field_helper(session, url, field))
results = await asyncio.gather(*tasks)
for result in results:
data.extend(result)
return pd.DataFrame(data, columns=["URL", "NodeId"]).drop_duplicates(subset=["URL"])
async def process_url_field_helper(session, url, field):
"""Helper to process each URL asynchronously."""
data = []
    node_html = await fetch_html(session, url)
    if node_html:
        node_soup = BeautifulSoup(node_html, "html.parser")
        node_id = await parse_shortlink(node_soup)
# Find all hrefs that contain the given field (document or page) and process each
for link in node_soup.find_all("a", href=True):
href = link["href"].strip()
if field in href:
new_url = BASE_URL + href # Prepend the base URL to create a full URL
print(f"Found '{field}' link: {new_url}")
                new_node_html = await fetch_html(session, new_url)
                if new_node_html:
                    new_node_soup = BeautifulSoup(new_node_html, "html.parser")
                    new_node_id = await parse_shortlink(new_node_soup)
data.append([new_url, new_node_id])
data.append([url, node_id]) # Add the original URL and its node ID
return data
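# The document/page tables carry only URL and NodeId columns, so concatenating them with the
# series table leaves NaN in 'Page Title'; those gaps are filled with placeholder values below.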
async def combine_data_frames(df1, df2, df3):
"""Combine multiple dataframes and maintain the series -> document -> page hierarchy."""
combined_data = pd.concat([df1, df2, df3], ignore_index=True)
combined_data = combined_data.drop_duplicates(subset=["URL"])
    # Fill in missing values so every row carries a title and a node ID
    combined_data['Page Title'] = combined_data['Page Title'].fillna("Unknown Title")
    combined_data['NodeId'] = combined_data['NodeId'].fillna("Unknown Node ID")
return combined_data[['Page Title', 'URL', 'NodeId']]
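# Pipeline: Table 1 (series links) -> Table 2 (document links) -> Table 3 (page links),
# then all three tables are merged into a single series -> document -> page view.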
async def main():
st.title("Custom Scrape")
url = st.text_input("Enter the URL to scrape:")
if st.button("Scrape"):
print("Starting the scraping process...")
async with aiohttp.ClientSession() as session:
# Step 1: Scrape initial data and create Table 1 (series links)
result_df = await traverse_links(session, url)
if not result_df.empty:
# Step 2: Fetch and display additional data using the 'URL' field from result_df (Table 2 - document links)
additional_data_df = await process_url_field(session, result_df, field="document")
if not additional_data_df.empty:
# Step 3: Now fetch data from 'pages' field in Table 2 and create Table 3
table_3_df = await process_url_field(session, additional_data_df, field="page")
if not table_3_df.empty:
# Combine all the tables (result_df, additional_data_df, and table_3_df)
final_combined_df = await combine_data_frames(result_df, additional_data_df, table_3_df)
# Only show the final combined dataframe
st.subheader("Final Combined Data (Series -> Document -> Page Hierarchy)")
st.dataframe(final_combined_df) # Display the combined data
else:
st.error("No data found for the 'pages' URLs.")
else:
st.error("No data found for the 'document' URLs.")
else:
st.error("Failed to fetch data or no shortlinks found.")
print("Scraping process completed.")
if __name__ == "__main__":
asyncio.run(main())