# Hugging Face Spaces - status: Running
import gradio as gr | |
import requests | |
from bs4 import BeautifulSoup | |
from transformers import pipeline | |
# Shared summarization pipeline, built once at import time.
# distilbart-cnn-12-6 is chosen as a more lightweight model for Hugging Face Spaces.
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)
def scrape_website(url):
    """Fetch a web page and return the text of its content elements.

    Sends a GET request with a browser-like User-Agent and a 10 second
    timeout, parses the body as HTML and joins the text of the <p>,
    <article>, <main> and <section> elements.

    Only the outermost matching elements are used. A <p> nested inside an
    <article> is already part of the <article>'s text, so collecting both
    would repeat the same passage in the output.

    Args:
        url: Address of the page to fetch, including the scheme.

    Returns:
        The extracted text, "No content found" when nothing was extracted,
        or a message starting with "Scraping Error: " when fetching or
        parsing raised an exception.
    """
    content_tags = ['p', 'article', 'main', 'section']
    try:
        # Browser-like User-Agent: added to avoid 403 responses.
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # turn HTTP error statuses into exceptions

        soup = BeautifulSoup(response.text, "html.parser")

        # Keep a match only if none of its ancestors is itself a match;
        # the ancestor's get_text() already covers the nested element.
        text_elements = [
            element
            for element in soup.find_all(content_tags)
            if element.find_parent(content_tags) is None
        ]
        text = " ".join(
            element.get_text(strip=True, separator=' ')
            for element in text_elements
        )
        return text if text.strip() else "No content found"
    except Exception as e:
        # Errors are reported as a string so the caller can display them.
        return f"Scraping Error: {str(e)}"
def summarize_website(url):
    """Scrape a web page and return a Markdown-formatted summary of its text.

    Args:
        url: Address of the page to summarize.

    Returns:
        A string that is one of:
        - the summary, preceded by a bold "Summary:" heading;
        - the scraper's "Scraping Error: ..." message, passed through;
        - an "Error: Insufficient content ..." message when fewer than
          50 words were extracted;
        - a "Summarization Error: ..." message when any step raised.
    """
    try:
        extracted_text = scrape_website(url)

        # scrape_website reports failures as a message with this prefix.
        # Match the prefix rather than the word "Error" anywhere in the
        # text: a successfully scraped page may itself mention "Error",
        # and must not be returned raw as if scraping had failed.
        if extracted_text.startswith("Scraping Error:"):
            return extracted_text

        # Check minimum text length
        words = extracted_text.split()
        if len(words) < 50:
            return "Error: Insufficient content for summarization (minimum 50 words required)"

        # Pre-trim very long pages on word boundaries so the tokenizer is
        # not handed an entire document. This is only a rough cap: the
        # model's real input limit (1024 tokens for DistilBART, per the
        # original note) is enforced by truncation=True below.
        max_input_words = 1000
        truncated_text = " ".join(words[:max_input_words])

        # Generate summary
        summary = summarizer(
            truncated_text,
            max_length=200,
            min_length=50,
            do_sample=False,
            truncation=True,  # let the tokenizer cut to the model's limit
        )
        return f"**Summary:**\n\n{summary[0]['summary_text']}"
    except Exception as e:
        # Surface any failure as text so the UI shows a message.
        return f"Summarization Error: {str(e)}"
# Gradio UI: one URL textbox in, rendered Markdown out.
url_input = gr.Textbox(
    label="Website URL",
    placeholder="Enter full URL (including https://)...",
)
summary_output = gr.Markdown()

# Sample URLs passed to the interface as examples (one input value per row).
example_urls = [
    ["https://en.wikipedia.org/wiki/Large_language_model"],
    ["https://www.bbc.com/news/technology-66510295"],
]

iface = gr.Interface(
    fn=summarize_website,
    inputs=url_input,
    outputs=summary_output,
    title="AI-Powered Website Summarizer",
    description="Enter a website URL to get an AI-generated summary of its content",
    examples=example_urls,
)

# Start the app as soon as the module is executed.
iface.launch()