import os
import google.generativeai as genai
import requests
import streamlit as st
from bs4 import BeautifulSoup
import pandas as pd
import asyncio
import aiohttp
from fpdf import FPDF
# Set the page configuration as the first command in the app
st.set_page_config(page_title="AI Web Scraper", page_icon=":robot_face:", layout="wide")
# Configure generative AI
genai.configure(api_key=os.environ["AI_API_KEY"])
BASE_URL = "https://transcripts.sl.nsw.gov.au"
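# Assumed site structure (inferred from the link prefixes handled below): the transcripts
# site exposes a series -> document -> page hierarchy, and each page's node ID is read
# from its <link rel="shortlink"> tag.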
async def fetch_html(session, url):
    """Fetch the raw HTML content from a given URL asynchronously."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        async with session.get(url, headers=headers) as response:
            response.raise_for_status()
            return await response.text()
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

async def parse_shortlink(soup):
    """Extract the shortlink (node ID) from the parsed HTML."""
    shortlink_tag = soup.find("link", rel="shortlink")
    shortlink = shortlink_tag["href"].strip() if shortlink_tag and "href" in shortlink_tag.attrs else None
    print(f"Extracted Shortlink: {shortlink}")
    return shortlink

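# Illustrative example of the tag parse_shortlink() looks for (the exact href format is
# an assumption and depends on the site's CMS configuration):
#   <link rel="shortlink" href="/node/12345" />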
async def extract_page_data(session, soup, limit=10):
    """Extract the title, URL, and node ID for the given page content."""
    data = []
    # Extract the page title from the dcterms.title metadata
    title_meta = soup.find("meta", attrs={"name": "dcterms.title"})
    page_title = title_meta.get("content", "Unknown Title").strip() if title_meta else "Unknown Title"
    # Extract links and node IDs from the first `limit` result rows
    for row in soup.find_all("div", class_="views-row")[:limit]:
        a_tag = row.find("a")
        if a_tag:
            url = BASE_URL + a_tag.get("href", "").strip()
            node_html = await fetch_html(session, url)
            if node_html:
                node_soup = BeautifulSoup(node_html, "html.parser")
                node_id = await parse_shortlink(node_soup)
                data.append([page_title, url, node_id])
    return pd.DataFrame(data, columns=["Page Title", "URL", "NodeId"])

async def traverse_links(session, url, visited=None, limit=None):
    """Recursively traverse series, document, and pages URLs, extracting shortlinks asynchronously."""
    if visited is None:
        visited = set()  # avoid sharing a mutable default set across separate scrape runs
    if url in visited:
        print(f"Skipping already visited URL: {url}")
        return pd.DataFrame()
    visited.add(url)
    print(f"Traversing URL: {url}")
    html = await fetch_html(session, url)
    if not html:
        return pd.DataFrame()
    soup = BeautifulSoup(html, "html.parser")
    page_data = await extract_page_data(session, soup, limit)
    recursive_data = pd.DataFrame(columns=["Page Title", "URL", "NodeId"])
    # Recursively traverse deeper levels concurrently
    tasks = []
    for link in soup.find_all("a", href=True):
        href = link["href"].strip()
        if href.startswith(("/series/", "/document/", "/pages/")):
            full_url = BASE_URL + href
            tasks.append(traverse_links(session, full_url, visited, limit))
    results = await asyncio.gather(*tasks)
    for result in results:
        recursive_data = pd.concat([recursive_data, result], ignore_index=True)
    return pd.concat([page_data, recursive_data], ignore_index=True)

async def process_url_field(session, table_df, limit=None, field="document"):
    """Process the 'document' or 'page' URLs from the provided table and fetch their node IDs asynchronously."""
    data = []
    tasks = []
    for url in table_df["URL"]:
        tasks.append(process_url_field_helper(session, url, field))
    results = await asyncio.gather(*tasks)
    for result in results:
        data.extend(result)
    return pd.DataFrame(data, columns=["URL", "NodeId"]).drop_duplicates(subset=["URL"])

async def process_url_field_helper(session, url, field):
    """Helper to process each URL asynchronously."""
    data = []
    html = await fetch_html(session, url)
    if html:
        node_soup = BeautifulSoup(html, "html.parser")
        node_id = await parse_shortlink(node_soup)
        # Find all hrefs that contain the given field (document or page) and process each
        for link in node_soup.find_all("a", href=True):
            href = link["href"].strip()
            if field in href:
                new_url = BASE_URL + href  # Prepend the base URL to create a full URL
                print(f"Found '{field}' link: {new_url}")
                new_html = await fetch_html(session, new_url)
                if new_html:
                    new_node_soup = BeautifulSoup(new_html, "html.parser")
                    new_node_id = await parse_shortlink(new_node_soup)
                    data.append([new_url, new_node_id])
        data.append([url, node_id])  # Add the original URL and its node ID
    return data

async def combine_data_frames(df1, df2, df3):
    """Combine multiple dataframes and maintain the series -> document -> page hierarchy."""
    combined_data = pd.concat([df1, df2, df3], ignore_index=True)
    combined_data = combined_data.drop_duplicates(subset=["URL"])
    # Fill in missing values so every row in the hierarchy has a title and node ID
    combined_data['Page Title'] = combined_data['Page Title'].fillna("Unknown Title")
    combined_data['NodeId'] = combined_data['NodeId'].fillna("Unknown Node ID")
    return combined_data[['Page Title', 'URL', 'NodeId']]

async def main():
    st.title("Custom Scrape")
    url = st.text_input("Enter the URL to scrape:")
    if st.button("Scrape"):
        print("Starting the scraping process...")
        async with aiohttp.ClientSession() as session:
            # Step 1: Scrape the initial data and create Table 1 (series links)
            result_df = await traverse_links(session, url)
            if not result_df.empty:
                # Step 2: Fetch additional data using the 'URL' field from result_df (Table 2 - document links)
                additional_data_df = await process_url_field(session, result_df, field="document")
                if not additional_data_df.empty:
                    # Step 3: Fetch data from the 'page' URLs in Table 2 and create Table 3
                    table_3_df = await process_url_field(session, additional_data_df, field="page")
                    if not table_3_df.empty:
                        # Combine all the tables (result_df, additional_data_df, and table_3_df)
                        final_combined_df = await combine_data_frames(result_df, additional_data_df, table_3_df)
                        # Only show the final combined dataframe
                        st.subheader("Final Combined Data (Series -> Document -> Page Hierarchy)")
                        st.dataframe(final_combined_df)  # Display the combined data
                    else:
                        st.error("No data found for the 'page' URLs.")
                else:
                    st.error("No data found for the 'document' URLs.")
            else:
                st.error("Failed to fetch data or no shortlinks found.")
        print("Scraping process completed.")


if __name__ == "__main__":
    asyncio.run(main())
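# To run the app locally (assuming this file is saved as app.py and Streamlit is
# installed): streamlit run app.py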