import asyncio
import os

import aiohttp
import google.generativeai as genai
import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from fpdf import FPDF

# Streamlit requires set_page_config to be the first Streamlit call in the app
st.set_page_config(page_title="AI Web Scraper", page_icon=":robot_face:", layout="wide")

# Configure generative AI
genai.configure(api_key=os.environ["AI_API_KEY"])

BASE_URL = "https://transcripts.sl.nsw.gov.au"


async def fetch_html(session, url):
    """Fetch the raw HTML text from a given URL asynchronously (returns None on failure)."""
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        async with session.get(url, headers=headers) as response:
            response.raise_for_status()
            return await response.text()
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None


async def parse_shortlink(soup):
    """Extract the shortlink (node ID) from the parsed HTML."""
    shortlink_tag = soup.find("link", rel="shortlink")
    shortlink = shortlink_tag["href"].strip() if shortlink_tag and "href" in shortlink_tag.attrs else None
    print(f"Extracted Shortlink: {shortlink}")
    return shortlink


async def extract_page_data(session, soup, limit=10):
    """Extract the title, URL, and node ID for each result row on the given page."""
    data = []

    # Extract the page title
    title_meta = soup.find("meta", attrs={"name": "dcterms.title"})
    page_title = title_meta["content"].strip() if title_meta else "Unknown Title"

    # Extract links and node IDs
    for row in soup.find_all("div", class_="views-row")[:limit]:
        a_tag = row.find("a")
        if a_tag:
            url = BASE_URL + a_tag.get("href", "").strip()
            node_html = await fetch_html(session, url)
            if node_html:
                node_soup = BeautifulSoup(node_html, "html.parser")
                node_id = await parse_shortlink(node_soup)
                data.append([page_title, url, node_id])

    return pd.DataFrame(data, columns=["Page Title", "URL", "NodeId"])


async def traverse_links(session, url, visited=None, limit=None):
    """Recursively traverse series, document, and pages URLs, extracting shortlinks asynchronously."""
    if visited is None:  # avoid a shared mutable default that would persist across calls
        visited = set()

    if url in visited:
        print(f"Skipping already visited URL: {url}")
        return pd.DataFrame()
    visited.add(url)

    print(f"Traversing URL: {url}")
    html = await fetch_html(session, url)
    if not html:
        return pd.DataFrame()

    soup = BeautifulSoup(html, "html.parser")
    page_data = await extract_page_data(session, soup, limit)
    recursive_data = pd.DataFrame(columns=["Page Title", "URL", "NodeId"])

    # Recursively traverse deeper levels
    tasks = []
    for link in soup.find_all("a", href=True):
        href = link["href"].strip()
        if href.startswith(("/series/", "/document/", "/pages/")):
            full_url = BASE_URL + href
            tasks.append(traverse_links(session, full_url, visited, limit))

    results = await asyncio.gather(*tasks)
    for result in results:
        recursive_data = pd.concat([recursive_data, result], ignore_index=True)

    return pd.concat([page_data, recursive_data], ignore_index=True)


async def process_url_field(session, table_df, limit=None, field="document"):
    """Process the 'document' or 'page' URLs from the provided table and fetch their node IDs asynchronously."""
    data = []
    tasks = []
    for url in table_df["URL"]:
        tasks.append(process_url_field_helper(session, url, field))

    results = await asyncio.gather(*tasks)
    for result in results:
        data.extend(result)

    return pd.DataFrame(data, columns=["URL", "NodeId"]).drop_duplicates(subset=["URL"])


async def process_url_field_helper(session, url, field):
    """Helper to process a single URL asynchronously."""
    data = []
    node_html = await fetch_html(session, url)
    if node_html:
        node_soup = BeautifulSoup(node_html, "html.parser")
        node_id = await parse_shortlink(node_soup)

        # Find all hrefs that contain the given field (document or page) and process each
        for link in node_soup.find_all("a", href=True):
            href = link["href"].strip()
            if field in href:
                new_url = BASE_URL + href  # Prepend the base URL to create a full URL
                print(f"Found '{field}' link: {new_url}")
                new_node_html = await fetch_html(session, new_url)
                if new_node_html:
                    new_node_soup = BeautifulSoup(new_node_html, "html.parser")
                    new_node_id = await parse_shortlink(new_node_soup)
                    data.append([new_url, new_node_id])

        data.append([url, node_id])  # Add the original URL and its node ID

    return data


async def combine_data_frames(df1, df2, df3):
    """Combine multiple dataframes and maintain the series -> document -> page hierarchy."""
    combined_data = pd.concat([df1, df2, df3], ignore_index=True)
    combined_data = combined_data.drop_duplicates(subset=["URL"])

    # Fill in missing values so every row carries the full hierarchy columns
    combined_data["Page Title"] = combined_data["Page Title"].fillna("Unknown Title")
    combined_data["NodeId"] = combined_data["NodeId"].fillna("Unknown Node ID")

    return combined_data[["Page Title", "URL", "NodeId"]]


async def main():
    st.title("Custom Scrape")
    url = st.text_input("Enter the URL to scrape:")

    if st.button("Scrape"):
        print("Starting the scraping process...")
        async with aiohttp.ClientSession() as session:
            # Step 1: Scrape the initial data and create Table 1 (series links)
            result_df = await traverse_links(session, url)
            if not result_df.empty:
                # Step 2: Fetch additional data using the 'URL' field from result_df (Table 2 - document links)
                additional_data_df = await process_url_field(session, result_df, field="document")
                if not additional_data_df.empty:
                    # Step 3: Fetch data from the 'page' URLs found in Table 2 and create Table 3
                    table_3_df = await process_url_field(session, additional_data_df, field="page")
                    if not table_3_df.empty:
                        # Combine all the tables (result_df, additional_data_df, and table_3_df)
                        final_combined_df = await combine_data_frames(result_df, additional_data_df, table_3_df)

                        # Only show the final combined dataframe
                        st.subheader("Final Combined Data (Series -> Document -> Page Hierarchy)")
                        st.dataframe(final_combined_df)  # Display the combined data
                    else:
                        st.error("No data found for the 'pages' URLs.")
                else:
                    st.error("No data found for the 'document' URLs.")
            else:
                st.error("Failed to fetch data or no shortlinks found.")
        print("Scraping process completed.")


if __name__ == "__main__":
    asyncio.run(main())
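
# --- Optional sketch: using the configured Gemini client ----------------------
# google.generativeai is imported and configured above but never called. The
# commented-out helper below is a minimal sketch (not part of the original
# scraper) of one way the configured client could be used, e.g. to summarise
# the combined table before it is displayed. The model name "gemini-1.5-flash"
# and the prompt wording are assumptions, not taken from the source.
#
# def summarise_results(df: pd.DataFrame) -> str:
#     """Ask Gemini for a short summary of the scraped hierarchy table."""
#     model = genai.GenerativeModel("gemini-1.5-flash")
#     prompt = "Summarise this table of scraped transcript pages:\n" + df.to_csv(index=False)
#     return model.generate_content(prompt).text
#
# Inside main(), after final_combined_df is built, one could then call:
#     st.write(summarise_results(final_combined_df))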