import os
import google.generativeai as genai
import streamlit as st
from bs4 import BeautifulSoup
import pandas as pd
import asyncio
import aiohttp

# Set the page configuration as the first command in the app
st.set_page_config(page_title="AI Web Scraper", page_icon=":robot_face:", layout="wide")

# Configure generative AI
genai.configure(api_key=os.environ["AI_API_KEY"])
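# NOTE: google-generativeai is configured above but never called elsewhere in this
# script; the AI_API_KEY environment variable must still be set before launching
# (e.g. `export AI_API_KEY=...`), otherwise the os.environ lookup raises a KeyError at startup.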

BASE_URL = "https://transcripts.sl.nsw.gov.au"

async def fetch_html(session, url):
    """Fetch and parse the HTML content from a given URL asynchronously."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        async with session.get(url, headers=headers) as response:
            response.raise_for_status()
            return await response.text()
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

async def parse_shortlink(soup):
    """Extract the shortlink (node ID) from the parsed HTML."""
    shortlink_tag = soup.find("link", rel="shortlink")
    shortlink = shortlink_tag["href"].strip() if shortlink_tag and "href" in shortlink_tag.attrs else None
    print(f"Extracted Shortlink: {shortlink}")
    return shortlink

async def extract_page_data(session, soup, limit=10):
    """Extract title, URL, and node ID for the given page content."""
    data = []
    
    # Extract page title
    title_meta = soup.find("meta", attrs={"name": "dcterms.title"})
    page_title = title_meta["content"].strip() if title_meta else "Unknown Title"
    
    # Extract links and node IDs
    for row in soup.find_all("div", class_="views-row")[:limit]:
        a_tag = row.find("a")
        if a_tag:
            url = BASE_URL + a_tag.get("href", "").strip()
            node_soup = await fetch_html(session, url)
            if node_soup:
                node_soup = BeautifulSoup(node_soup, "html.parser")
                node_id = await parse_shortlink(node_soup)
                data.append([page_title, url, node_id])
    
    return pd.DataFrame(data, columns=["Page Title", "URL", "NodeId"])

async def traverse_links(session, url, visited=None, limit=None):
    """Recursively traverse series, document, and pages URLs, extracting shortlinks asynchronously."""
    # Avoid a mutable default argument: a module-level set would be shared across
    # every top-level call, so create a fresh set per traversal instead.
    if visited is None:
        visited = set()
    if url in visited:
        print(f"Skipping already visited URL: {url}")
        return pd.DataFrame()
    
    visited.add(url)
    print(f"Traversing URL: {url}")
    soup = await fetch_html(session, url)
    if not soup:
        return pd.DataFrame()
    
    soup = BeautifulSoup(soup, "html.parser")
    page_data = await extract_page_data(session, soup, limit)
    recursive_data = pd.DataFrame(columns=["Page Title", "URL", "NodeId"])
    
    # Recursively traverse deeper levels
    tasks = []
    for link in soup.find_all("a", href=True):
        href = link["href"].strip()
        if href.startswith(("/series/", "/document/", "/pages/")):
            full_url = BASE_URL + href
            tasks.append(traverse_links(session, full_url, visited, limit))
    
    results = await asyncio.gather(*tasks)
    
    for result in results:
        recursive_data = pd.concat([recursive_data, result], ignore_index=True)
    
    return pd.concat([page_data, recursive_data], ignore_index=True)

async def process_url_field(session, table_df, limit=None, field="document"):
    """Process the 'document' or 'page' URLs from the provided table and fetch their node IDs asynchronously."""
    data = []
    
    tasks = []
    for url in table_df["URL"]:
        tasks.append(process_url_field_helper(session, url, field))
    
    results = await asyncio.gather(*tasks)
    
    for result in results:
        data.extend(result)
    
    return pd.DataFrame(data, columns=["URL", "NodeId"]).drop_duplicates(subset=["URL"])

async def process_url_field_helper(session, url, field):
    """Helper to process each URL asynchronously."""
    data = []
    node_soup = await fetch_html(session, url)
    if node_soup:
        node_soup = BeautifulSoup(node_soup, "html.parser")
        node_id = await parse_shortlink(node_soup)
        
        # Find all hrefs that contain the given field (document or page) and process each
        for link in node_soup.find_all("a", href=True):
            href = link["href"].strip()
            if field in href:
                new_url = BASE_URL + href  # Prepend the base URL to create a full URL
                print(f"Found '{field}' link: {new_url}")
                new_node_soup = await fetch_html(session, new_url)
                if new_node_soup:
                    new_node_soup = BeautifulSoup(new_node_soup, "html.parser")
                    new_node_id = await parse_shortlink(new_node_soup)
                    data.append([new_url, new_node_id])
        
        data.append([url, node_id])  # Add the original URL and its node ID
    return data

async def combine_data_frames(df1, df2, df3):
    """Combine multiple dataframes and maintain the series -> document -> page hierarchy."""
    combined_data = pd.concat([df1, df2, df3], ignore_index=True)
    combined_data = combined_data.drop_duplicates(subset=["URL"])
    
    # Add columns to maintain the hierarchy
    combined_data['Page Title'] = combined_data['Page Title'].fillna("Unknown Title")
    combined_data['NodeId'] = combined_data['NodeId'].fillna("Unknown Node ID")
    
    return combined_data[['Page Title', 'URL', 'NodeId']]

async def main():
    st.title("Custom Scrape")
    
    url = st.text_input("Enter the URL to scrape:")
    if st.button("Scrape"):
        print("Starting the scraping process...") 
        
        async with aiohttp.ClientSession() as session:
            # Step 1: Scrape initial data and create Table 1 (series links)
            result_df = await traverse_links(session, url)
            
            if not result_df.empty:
                # Step 2: Fetch and display additional data using the 'URL' field from result_df (Table 2 - document links)
                additional_data_df = await process_url_field(session, result_df, field="document")
                if not additional_data_df.empty:
                    
                    # Step 3: Now fetch data from 'pages' field in Table 2 and create Table 3
                    table_3_df = await process_url_field(session, additional_data_df, field="page")
                    if not table_3_df.empty:
                        
                        # Combine all the tables (result_df, additional_data_df, and table_3_df)
                        final_combined_df = await combine_data_frames(result_df, additional_data_df, table_3_df)
                        
                        # Only show the final combined dataframe
                        st.subheader("Final Combined Data (Series -> Document -> Page Hierarchy)")
                        st.dataframe(final_combined_df)  # Display the combined data
                    else:
                        st.error("No data found for the 'pages' URLs.")
                else:
                    st.error("No data found for the 'document' URLs.")
            else:
                st.error("Failed to fetch data or no shortlinks found.")
        
        print("Scraping process completed.")

if __name__ == "__main__":
    asyncio.run(main())
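
# The Streamlit UI only renders when the script is launched through Streamlit,
# e.g. `streamlit run scraper.py` (filename assumed; use whatever this file is saved as).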