File size: 2,524 Bytes
66a4d52
 
 
1936100
66a4d52
4a1e457
501ba1b
1936100
 
501ba1b
66a4d52
4a1e457
501ba1b
4a1e457
501ba1b
66a4d52
4a1e457
 
501ba1b
 
4a1e457
 
501ba1b
66a4d52
501ba1b
1936100
 
501ba1b
 
4a1e457
 
 
 
f175faf
4a1e457
 
 
501ba1b
4a1e457
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501ba1b
4a1e457
1936100
4a1e457
 
 
 
 
 
 
 
 
 
 
 
66a4d52
4a1e457
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# Use a more lightweight model for Hugging Face Spaces.
# NOTE: instantiating the pipeline at import time loads (and on first run
# downloads) the model weights, so module import may be slow.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

def scrape_website(url):
    """Fetch *url* and return its readable text content.

    Returns the concatenated text of content-bearing tags, the literal
    string "No content found" when nothing was extracted, or a string
    starting with "Scraping Error:" describing the failure (callers key
    off these sentinel strings).
    """
    try:
        # Browser-like User-Agent: many sites answer 403 to bare clients.
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # surface 4xx/5xx as exceptions

        soup = BeautifulSoup(response.text, "html.parser")

        # Bug fix: the original collected p/article/main/section in one
        # find_all, so text inside a <p> nested in <article>/<main>/<section>
        # was joined twice (containers include their children's text).
        # Prefer paragraphs; fall back to the container tags only when the
        # page has no <p> elements at all.
        elements = soup.find_all('p') or soup.find_all(['article', 'main', 'section'])
        text = " ".join(e.get_text(strip=True, separator=' ') for e in elements)

        return text if text.strip() else "No content found"

    except Exception as e:  # network, HTTP, and parsing failures all land here
        return f"Scraping Error: {str(e)}"

def summarize_website(url):
    """Scrape *url* and return a Markdown-formatted AI-generated summary.

    On failure returns the scraper's error/sentinel string unchanged, or an
    explanatory "Error: ..." / "Summarization Error: ..." string.
    """
    try:
        extracted_text = scrape_website(url)

        # Bug fix: the original tested `"Error" in extracted_text`, which
        # false-positives on any page whose *content* contains the word
        # "Error". Match the scraper's exact sentinel prefixes instead.
        if extracted_text.startswith(("Scraping Error:", "No content found")):
            return extracted_text

        words = extracted_text.split()

        # Too little text yields useless (or failing) summaries.
        if len(words) < 50:
            return "Error: Insufficient content for summarization (minimum 50 words required)"

        # Bug fix: the original sliced 1000 *characters* (roughly 200
        # tokens) while claiming to truncate to the model's ~1024-token
        # input limit, discarding most of the usable window. Truncate by
        # words instead: ~700 words stays safely under the token limit.
        max_input_words = 700
        truncated_text = " ".join(words[:max_input_words])

        summary = summarizer(
            truncated_text,
            max_length=200,    # upper bound on summary length (tokens)
            min_length=50,     # lower bound on summary length (tokens)
            do_sample=False,   # deterministic (greedy/beam) output
            truncation=True    # model-side safety net for over-long input
        )

        return f"**Summary:**\n\n{summary[0]['summary_text']}"

    except Exception as e:
        return f"Summarization Error: {str(e)}"

# Gradio UI wiring: a URL textbox in, a Markdown-rendered summary out.
_EXAMPLE_URLS = [
    ["https://en.wikipedia.org/wiki/Large_language_model"],
    ["https://www.bbc.com/news/technology-66510295"],
]

_url_input = gr.Textbox(
    label="Website URL",
    placeholder="Enter full URL (including https://)...",
)

iface = gr.Interface(
    fn=summarize_website,
    inputs=_url_input,
    outputs=gr.Markdown(),
    title="AI-Powered Website Summarizer",
    description="Enter a website URL to get an AI-generated summary of its content",
    examples=_EXAMPLE_URLS,
)

iface.launch()