Spaces:

adithya747
/

website-summarizer

Running

App Files Files Community

website-summarizer / app.py

adithya747

Update app.py

4a1e457 verified 3 months ago

raw

history blame

2.52 kB

	import gradio as gr
	import requests
	from bs4 import BeautifulSoup
	from transformers import pipeline

	# Use a more lightweight model for Hugging Face Spaces.
	# The pipeline is built once, at import time, and this module-level object
	# is what summarize_website() calls for every request.
	summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

	def scrape_website(url):
	    """Download a web page and return the text of its main content.

	    Text is collected from ``<p>``, ``<article>``, ``<main>`` and
	    ``<section>`` elements. Only the outermost matching elements are used:
	    ``get_text()`` on a container already includes the text of every
	    element nested inside it, so also visiting the nested matches would
	    repeat the same sentences several times.

	    Args:
	        url: Address of the page to fetch, including the scheme.

	    Returns:
	        The extracted text joined with single spaces, the string
	        ``"No content found"`` when the content tags hold no text, or a
	        string starting with ``"Scraping Error: "`` when the download or
	        parsing fails. This function never raises.
	    """
	    content_tags = ['p', 'article', 'main', 'section']
	    try:
	        headers = {'User-Agent': 'Mozilla/5.0'}  # Add headers to prevent 403 errors
	        response = requests.get(url, headers=headers, timeout=10)
	        response.raise_for_status()  # Raise HTTP errors

	        # When the server declares no charset, requests decodes text/* bodies
	        # as ISO-8859-1, which garbles UTF-8 pages. In that case hand the raw
	        # bytes to BeautifulSoup so it can detect the encoding itself.
	        content_type = response.headers.get("Content-Type", "").lower()
	        markup = response.text if "charset" in content_type else response.content
	        soup = BeautifulSoup(markup, "html.parser")

	        # Keep only matches that are not nested inside another match.
	        candidates = soup.find_all(content_tags)
	        outermost = [el for el in candidates if el.find_parent(content_tags) is None]
	        text = " ".join(el.get_text(strip=True, separator=' ') for el in outermost)

	        return text if text.strip() else "No content found"

	    except Exception as e:
	        # Deliberate catch-all: the caller shows this string in the UI and
	        # recognises failures by the "Scraping Error:" prefix.
	        return f"Scraping Error: {e}"

	def summarize_website(url):
	    """Scrape a web page and return a model-generated summary of its text.

	    Args:
	        url: Address entered by the user. Surrounding whitespace is ignored.

	    Returns:
	        A string starting with ``"Summary:"`` followed by the generated
	        summary on success. Otherwise a human-readable message: an
	        ``"Error: ..."`` string for an empty URL or a page with fewer than
	        50 words, the ``"Scraping Error: ..."`` string produced by
	        ``scrape_website``, or ``"Summarization Error: ..."`` for any other
	        failure. This function never raises.
	    """
	    min_words = 50
	    # Rough pre-trim, counted in whole words, so a huge page is not tokenized
	    # in full. The exact model limit is enforced by truncation=True below.
	    max_input_length = 1000

	    try:
	        url = (url or "").strip()
	        if not url:
	            return "Error: Please enter a website URL (including https://)"

	        extracted_text = scrape_website(url)

	        # Match the failure prefix exactly. A plain substring test for
	        # "Error" would also fire on any page whose content mentions it.
	        if extracted_text.startswith("Scraping Error:"):
	            return extracted_text

	        # Check minimum text length
	        words = extracted_text.split()
	        if len(words) < min_words:
	            return f"Error: Insufficient content for summarization (minimum {min_words} words required)"

	        # Trim on word boundaries rather than slicing characters mid-word.
	        truncated_text = " ".join(words[:max_input_length])

	        # Generate summary
	        summary = summarizer(
	            truncated_text,
	            max_length=200,
	            min_length=50,
	            do_sample=False,
	            truncation=True,  # Let the tokenizer cut to the model's input limit
	        )

	        return f"Summary:\n\n{summary[0]['summary_text']}"

	    except Exception as e:
	        # Deliberate catch-all: the result is displayed directly in the UI.
	        return f"Summarization Error: {e}"

	# Gradio UI: a single URL textbox as input, the result shown in a Markdown
	# component, plus two clickable example URLs.
	url_input = gr.Textbox(
	    label="Website URL",
	    placeholder="Enter full URL (including https://)...",
	)
	summary_output = gr.Markdown()
	example_urls = [
	    ["https://en.wikipedia.org/wiki/Large_language_model"],
	    ["https://www.bbc.com/news/technology-66510295"],
	]

	iface = gr.Interface(
	    fn=summarize_website,
	    inputs=url_input,
	    outputs=summary_output,
	    title="AI-Powered Website Summarizer",
	    description="Enter a website URL to get an AI-generated summary of its content",
	    examples=example_urls,
	)

	# Launch the app when this module is executed.
	iface.launch()