import os
import io
import re
from flask import Flask, request, jsonify
from flask_cors import CORS
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from heapq import nlargest
from collections import defaultdict
app = Flask(__name__)
CORS(app) # Enable CORS for all routes
# Set NLTK data path to a directory included in the project
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)
# Ensure NLTK data is available (pre-downloaded)
try:
    stopwords.words('english')  # Test if stopwords are accessible
except LookupError:
    print("NLTK data not found. Please ensure 'punkt' and 'stopwords' are pre-downloaded in 'nltk_data'.")
    # Fallback will be used if this fails
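# One way to pre-download the required corpora into ./nltk_data (a sketch meant to
# be run once outside the app; nltk.download accepts a download_dir argument):
#   python -c "import nltk; nltk.download('punkt', download_dir='nltk_data'); nltk.download('stopwords', download_dir='nltk_data')"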
# Allowed file extensions
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
def allowed_file(filename):
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
@app.route("/", methods=["GET"])
def index():
return "Document Summarizer API is running! Use /summarize endpoint for POST requests."
@app.route("/summarize", methods=["POST"])
def summarize():
if "file" not in request.files:
return jsonify({"error": "No file uploaded"}), 400
file = request.files["file"]
if file.filename == "":
return jsonify({"error": "No selected file"}), 400
if not allowed_file(file.filename):
return jsonify({"error": "Unsupported file format"}), 400
filename = secure_filename(file.filename)
file_content = file.read()
# Process file based on type
text = None
file_ext = filename.rsplit(".", 1)[1].lower()
try:
if file_ext == "pdf":
text = extract_text_from_pdf(file_content)
elif file_ext == "docx":
text = extract_text_from_docx(file_content)
elif file_ext == "pptx":
text = extract_text_from_pptx(file_content)
elif file_ext == "txt":
text = extract_text_from_txt(file_content)
# Generate a summary of the text
try:
summary = generate_summary(text)
except LookupError as e:
print(f"NLTK summarization failed: {e}. Using fallback.")
summary = simple_summarize(text)
except Exception as e:
print(f"Summarization error: {e}")
summary = text[:1000] + "..." if len(text) > 1000 else text
# Include metadata
word_count = len(text.split())
return jsonify({
"filename": filename,
"summary": summary,
"original_word_count": word_count,
"summary_word_count": len(summary.split()) if summary else 0
})
except Exception as e:
return jsonify({"error": f"Error processing file: {str(e)}"}), 500
# Text extraction functions
def extract_text_from_pdf(file_content):
    reader = PdfReader(io.BytesIO(file_content))
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n\n"
    return clean_text(text)
def extract_text_from_docx(file_content):
    doc = Document(io.BytesIO(file_content))
    text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
    return clean_text(text)
def extract_text_from_pptx(file_content):
    ppt = Presentation(io.BytesIO(file_content))
    text = []
    for slide in ppt.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text") and shape.text.strip():
                text.append(shape.text)
    return clean_text("\n".join(text))
def extract_text_from_txt(file_content):
    text = file_content.decode("utf-8", errors="ignore")
    return clean_text(text)
def clean_text(text):
    # Collapse all whitespace (including newlines) into single spaces
    text = re.sub(r'\s+', ' ', text)
    # Drop characters other than word characters, whitespace, and basic punctuation
    text = re.sub(r'[^\w\s\.\,\!\?\:\;]', '', text)
    return text.strip()
def generate_summary(text, sentence_count=5):
    # Very short documents are returned as-is
    if len(text.split()) < 100:
        return text
    sentences = sent_tokenize(text)
    if len(sentences) <= sentence_count:
        return text

    # Normalize sentences: strip punctuation and lowercase before counting words
    clean_sentences = [s.translate(str.maketrans('', '', string.punctuation)).lower() for s in sentences]
    stop_words = set(stopwords.words('english'))

    # Count word frequencies, ignoring stopwords
    word_frequencies = defaultdict(int)
    for sentence in clean_sentences:
        for word in word_tokenize(sentence):
            if word not in stop_words:
                word_frequencies[word] += 1

    # Normalize frequencies to the range [0, 1]
    max_frequency = max(word_frequencies.values()) if word_frequencies else 1
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / max_frequency

    # Score each sentence as the sum of its words' normalized frequencies
    sentence_scores = defaultdict(int)
    for i, sentence in enumerate(clean_sentences):
        for word in word_tokenize(sentence):
            if word in word_frequencies:
                sentence_scores[i] += word_frequencies[word]

    # Keep the top-scoring sentences, restored to their original order
    top_indices = nlargest(sentence_count, sentence_scores, key=sentence_scores.get)
    top_indices.sort()
    return ' '.join([sentences[i] for i in top_indices])
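# Usage sketch for the extractive summarizer above (hypothetical variable name;
# texts under 100 words or with few sentences come back unchanged):
#   summary = generate_summary(article_text, sentence_count=3)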
def simple_summarize(text, max_chars=1000):
    # Fallback summarizer: start from the leading paragraphs and extend with
    # whole sentences until max_chars is reached
    paragraphs = text.split('\n\n')
    base_summary = ' '.join(paragraphs[:3])
    if len(text) <= max_chars:
        return text
    if len(base_summary) < max_chars:
        remaining_text = ' '.join(paragraphs[3:])
        sentences = re.split(r'(?<=[.!?])\s+', remaining_text)
        for sentence in sentences:
            if len(base_summary) + len(sentence) + 1 <= max_chars:
                base_summary += ' ' + sentence
            else:
                break
    if len(base_summary) > max_chars:
        base_summary = base_summary[:max_chars] + "..."
    return base_summary
if __name__ == "__main__":
    # For local testing only
    app.run(host="0.0.0.0", port=7860)
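# For a production deployment (an assumption, not part of this file), a WSGI server
# could replace the Flask development server, e.g. assuming this module is app.py:
#   gunicorn app:app --bind 0.0.0.0:7860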