# NOTE(review): the lines below appear to be residue from a web file viewer
# (status banner, file size, per-line commit hashes, line-number gutter).
# They are not part of the program and are kept only as comments.
# Spaces:
# Running
# Running
# File size: 2,524 Bytes
# 66a4d52 1936100 66a4d52 4a1e457 501ba1b 1936100 501ba1b 66a4d52 4a1e457 501ba1b 4a1e457 501ba1b 66a4d52 4a1e457 501ba1b 4a1e457 501ba1b 66a4d52 501ba1b 1936100 501ba1b 4a1e457 f175faf 4a1e457 501ba1b 4a1e457 501ba1b 4a1e457 1936100 4a1e457 66a4d52 4a1e457 |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
# Lightweight summarization model, chosen for running on Hugging Face Spaces.
SUMMARIZATION_MODEL = "sshleifer/distilbart-cnn-12-6"

# Shared pipeline instance, created once at import time and reused per request.
summarizer = pipeline(task="summarization", model=SUMMARIZATION_MODEL)
def scrape_website(url):
    """Download a web page and return the text of its main content tags.

    Text is collected from ``<p>``, ``<article>``, ``<main>`` and
    ``<section>`` elements. Only the outermost matching elements are read,
    so a paragraph nested inside an article is not emitted more than once.

    Args:
        url: Address of the page to download.

    Returns:
        The extracted text, ``"No content found"`` when nothing was
        extracted, or a message starting with ``"Scraping Error: "`` when
        the download or parsing raised an exception (exceptions are caught
        and reported as text rather than propagated).
    """
    content_tags = ['p', 'article', 'main', 'section']
    try:
        # Browser-like User-Agent: sent to avoid 403 responses.
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        soup = BeautifulSoup(response.text, "html.parser")

        # get_text() already includes the text of descendants, so a match
        # that sits inside another match would duplicate its content.
        outermost = [
            element
            for element in soup.find_all(content_tags)
            if element.find_parent(content_tags) is None
        ]
        text = " ".join(
            element.get_text(strip=True, separator=' ') for element in outermost
        )
        return text if text.strip() else "No content found"
    except Exception as e:
        return f"Scraping Error: {e}"
def summarize_website(url):
    """Scrape *url* and return a Markdown-formatted summary of its text.

    Args:
        url: Address of the page to summarize.

    Returns:
        A Markdown string starting with ``**Summary:**`` on success.
        Otherwise a plain message: the scraper's ``"Scraping Error: ..."``
        text, an insufficient-content notice, or
        ``"Summarization Error: ..."`` if anything else raised.
    """
    min_words = 50
    # Rough word budget for the model's 1024-token input limit; this is a
    # heuristic only, the tokenizer's truncation=True enforces the real cap.
    max_input_words = 700
    try:
        extracted_text = scrape_website(url)
        # Match only the scraper's own failure prefix: page text that merely
        # contains the word "Error" is valid content, not a failure.
        if extracted_text.startswith("Scraping Error:"):
            return extracted_text

        words = extracted_text.split()
        if len(words) < min_words:
            return f"Error: Insufficient content for summarization (minimum {min_words} words required)"

        # Trim on word boundaries (not characters) so no word is cut in half
        # and the model receives a meaningful amount of text.
        truncated_text = " ".join(words[:max_input_words])

        summary = summarizer(
            truncated_text,
            max_length=200,
            min_length=50,
            do_sample=False,
            truncation=True,  # hard guarantee that the input fits the model
        )
        return f"**Summary:**\n\n{summary[0]['summary_text']}"
    except Exception as e:
        return f"Summarization Error: {e}"
# Gradio UI: a single URL textbox as input, the summary rendered as Markdown.
iface = gr.Interface(
    fn=summarize_website,
    inputs=gr.Textbox(label="Website URL", placeholder="Enter full URL (including https://)..."),
    outputs=gr.Markdown(),
    title="AI-Powered Website Summarizer",
    description="Enter a website URL to get an AI-generated summary of its content",
    examples=[
        ["https://en.wikipedia.org/wiki/Large_language_model"],
        ["https://www.bbc.com/news/technology-66510295"],
    ],
)

# Launched at module level, so the app starts as soon as this file is run.
iface.launch()