# app.py -- last change by adithya747: "Update app.py" (commit 4a1e457, verified), 2.52 kB
import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
# Build the summarization pipeline once, at import time, so every request
# reuses it. The distilled BART checkpoint is chosen because it is a more
# lightweight model for Hugging Face Spaces.
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)
def scrape_website(url):
    """Download a web page and return its readable text.

    Args:
        url: Address of the page to fetch; it should include the scheme
            (e.g. ``https://``). Surrounding whitespace is ignored.

    Returns:
        The text of the page's outermost ``<p>``, ``<article>``, ``<main>``
        and ``<section>`` elements joined by single spaces,
        ``"No content found"`` when those elements hold no text, or a string
        starting with ``"Scraping Error:"`` when the request or parsing fails.
        The function never raises; failures are always reported in-band.
    """
    content_tags = ['p', 'article', 'main', 'section']
    try:
        # A browser-like User-Agent avoids 403 responses from some sites.
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get((url or "").strip(), headers=headers, timeout=10)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions

        # Parse the raw bytes rather than response.text: when the server sends
        # no charset, requests assumes ISO-8859-1 for text/* and garbles UTF-8
        # pages. Only trust the header encoding if a charset was declared;
        # otherwise let BeautifulSoup detect it from the document itself.
        content_type = response.headers.get("content-type", "").lower()
        declared_encoding = response.encoding if "charset" in content_type else None
        soup = BeautifulSoup(
            response.content, "html.parser", from_encoding=declared_encoding
        )

        # Keep only the outermost matches. get_text() on a container already
        # includes the text of every nested <p>/<section>/..., so collecting
        # the nested elements as well would repeat the same text several times.
        outermost = [
            element
            for element in soup.find_all(content_tags)
            if element.find_parent(content_tags) is None
        ]
        text = " ".join(
            element.get_text(strip=True, separator=' ') for element in outermost
        )
        return text if text.strip() else "No content found"
    except Exception as e:
        # UI boundary: report any failure as text instead of crashing the app.
        return f"Scraping Error: {str(e)}"
def summarize_website(url):
    """Scrape a web page and return a Markdown-formatted summary of it.

    Args:
        url: Address of the page to summarize.

    Returns:
        A Markdown string beginning with ``**Summary:**`` on success.
        Otherwise an explanatory message: the scraper's own
        ``"Scraping Error: ..."`` text, an ``"Error: Insufficient content"``
        notice when fewer than 50 words were extracted, or
        ``"Summarization Error: ..."`` if the model call fails.
        The function never raises.
    """
    min_words = 50
    # Cheap pre-truncation by whole words. The model accepts 1024 tokens, so
    # this is a rough word budget that stays near that limit; the exact token
    # limit is enforced by truncation=True in the pipeline call below.
    max_input_words = 700
    try:
        extracted_text = scrape_website(url)

        # scrape_website reports failures with this exact prefix. Matching the
        # prefix (rather than the bare word "Error" anywhere in the text)
        # keeps pages that merely mention "Error" from being rejected.
        if extracted_text.startswith("Scraping Error:"):
            return extracted_text

        words = extracted_text.split()
        if len(words) < min_words:
            return "Error: Insufficient content for summarization (minimum 50 words required)"

        # Truncate on word boundaries so the model never sees a half-cut word.
        truncated_text = " ".join(words[:max_input_words])

        summary = summarizer(
            truncated_text,
            max_length=200,
            min_length=50,
            do_sample=False,
            truncation=True,  # Clip to the model's real token limit
        )
        return f"**Summary:**\n\n{summary[0]['summary_text']}"
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"Summarization Error: {str(e)}"
# Assemble the Gradio UI: one URL text box in, rendered Markdown out, wired
# to summarize_website, with two clickable example URLs.
url_input = gr.Textbox(
    label="Website URL",
    placeholder="Enter full URL (including https://)...",
)
example_urls = [
    ["https://en.wikipedia.org/wiki/Large_language_model"],
    ["https://www.bbc.com/news/technology-66510295"],
]
iface = gr.Interface(
    fn=summarize_website,
    inputs=url_input,
    outputs=gr.Markdown(),
    title="AI-Powered Website Summarizer",
    description="Enter a website URL to get an AI-generated summary of its content",
    examples=example_urls,
)

# Start the web server for the interface.
iface.launch()