# Hugging Face Space: AI Website Summarizer
# (The "Spaces: Running" lines here were status chrome captured from the
# Space's web page, not part of the program — preserved as this comment.)
import gradio as gr | |
import requests | |
from bs4 import BeautifulSoup | |
from transformers import pipeline, AutoTokenizer | |
# Load summarization pipeline.
# Model: DistilBART fine-tuned on CNN/DailyMail summarization.
# The tokenizer is loaded separately so truncate_text() below can trim
# scraped text to the model's input window before summarization.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
def scrape_website(url):
    """Fetch a web page and extract its title, meta description, and body text.

    Args:
        url: Fully-qualified URL (http/https) of the page to scrape.

    Returns:
        A string of the form "title\nmeta description\nbody text" (any part
        may be empty), "No meaningful content found." when nothing could be
        extracted, or a message prefixed with "Scraping Error:" on failure.
        (summarize_website keys off that exact prefix.)
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # <title> can exist with a None .string (e.g. nested markup), so
        # guard both the tag and its string before stripping.
        title = soup.title.string.strip() if soup.title and soup.title.string else ""
        # The meta tag may lack a "content" attribute; .get avoids KeyError.
        meta_tag = soup.find("meta", attrs={"name": "description"})
        meta_desc = meta_tag.get("content", "").strip() if meta_tag else ""
        # Extract <p> tags only: also collecting <article>/<main>/<section>
        # would duplicate text, since those containers nest the same <p> tags.
        # Fall back to the containers when the page has no <p> tags at all.
        elements = soup.find_all('p')
        if not elements:
            elements = soup.find_all(['article', 'main', 'section'])
        text = " ".join(e.get_text(strip=True, separator=' ') for e in elements)
        full_content = f"{title}\n{meta_desc}\n{text}".strip()
        return full_content if full_content else "No meaningful content found."
    except Exception as e:
        # Keep the "Scraping Error:" prefix stable — callers match on it.
        return f"Scraping Error: {str(e)}"
def truncate_text(text, max_tokens=1024):
    """Trim *text* so it spans at most *max_tokens* model tokens.

    Tokenizes with the module-level ``tokenizer``, keeps only the first
    *max_tokens* tokens, and reassembles them into a plain string so the
    summarizer never receives an over-length input.
    """
    token_list = tokenizer.tokenize(text)
    kept = token_list[:max_tokens]
    return tokenizer.convert_tokens_to_string(kept)
def summarize_website(url):
    """Scrape *url* and return a Markdown-formatted summary of its content.

    Args:
        url: Website URL to scrape and summarize.

    Returns:
        Markdown summary text on success, or a human-readable error message
        when scraping fails or the page has too little text to summarize.
    """
    # NOTE(review): the "β"/"π" characters in the strings below look like
    # mojibake of original emoji — restore from the original source if known.
    try:
        extracted_text = scrape_website(url)
        # Match the exact prefix emitted by scrape_website() rather than the
        # substring "Error", which could legitimately appear in page content
        # and would misreport a successful scrape as a failure.
        if extracted_text.startswith("Scraping Error"):
            return "β " + extracted_text
        if len(extracted_text.split()) < 50:
            return "β οΈ Error: Insufficient content for summarization (minimum 50 words required)"
        # Trim to the model's input window before summarizing.
        truncated_text = truncate_text(extracted_text)
        summary = summarizer(
            truncated_text,
            max_length=250,  # upper bound on generated summary length
            min_length=80,   # force a reasonably detailed summary
            do_sample=False  # deterministic (non-sampling) decoding
        )
        return f"## π Summary\n\n{summary[0]['summary_text']}"
    except Exception as e:
        return f"β Summarization Error: {str(e)}"
# Custom CSS for better mobile experience.
# Applies only below a 600px viewport: tighter container padding, a larger
# input font (18px avoids mobile zoom-on-focus), and full-width buttons.
css = """
@media screen and (max-width: 600px) {
.container { padding: 10px !important; }
.input-box textarea { font-size: 18px !important; }
.gr-button { width: 100% !important; }
}
"""
# Mobile-optimized interface with real-time updates.
# NOTE(review): the "π" characters in labels below look like mojibake of
# original emoji — restore from the original source if known.
with gr.Blocks(theme=gr.themes.Soft(), css=css, title="Website Summarizer") as app:
    gr.Markdown("# π AI Website Summarizer")
    gr.Markdown("Paste any website URL below to get an instant AI-powered summary!")
    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="Enter full URL (https://...)",
            lines=1,
            max_lines=1,
            elem_id="input-box",
            # The mobile CSS targets the CLASS selector ".input-box textarea";
            # elem_id only sets the id attribute, so the rule never matched.
            # Adding the class makes the font-size override take effect.
            elem_classes=["input-box"]
        )
    with gr.Row():
        submit_btn = gr.Button("Generate Summary π", variant="primary")
        clear_btn = gr.Button("Clear π")
    status = gr.Markdown("π Ready for input...", elem_id="status-msg")
    output = gr.Markdown()
    gr.Examples(
        examples=[
            ["https://en.wikipedia.org/wiki/Large_language_model"],
            ["https://www.bbc.com/news/technology-66510295"]
        ],
        inputs=url_input,
        label="Try these examples:",
        examples_per_page=2
    )
    submit_btn.click(
        fn=summarize_website,
        inputs=url_input,
        outputs=[output],
        api_name="summarize"
    )
    # Clear must also blank the previous summary, not just the URL/status,
    # otherwise a stale summary stays on screen after clearing.
    clear_btn.click(
        fn=lambda: ("", "π Ready for input...", ""),
        inputs=None,
        outputs=[url_input, status, output],
        queue=False
    )
# Mobile-friendly deployment: bind to all interfaces so the app is reachable
# from outside a container/Space, on the conventional Gradio port.
app.launch(
    server_name="0.0.0.0",
    server_port=7860,
    # NOTE(review): gradio's favicon_path expects a local file path; passing a
    # remote URL may be ignored or raise depending on the version — confirm.
    favicon_path="https://www.svgrepo.com/show/355037/huggingface.svg"
)