File size: 4,713 Bytes
764d4f7
2a3fae3
80d0b8a
b7db40a
764d4f7
2a3fae3
764d4f7
2a3fae3
a911ba5
524f780
a911ba5
80d0b8a
 
 
dc17435
764d4f7
b7db40a
dc17435
 
012dc5b
 
a911ba5
012dc5b
dc17435
012dc5b
 
 
 
 
d2d0219
2a3fae3
d2d0219
2a3fae3
798ae00
2a3fae3
d2d0219
a911ba5
 
 
dc17435
 
 
a911ba5
 
 
 
 
 
 
 
 
 
 
 
 
 
dc17435
92d0377
98e82be
 
dc17435
80d0b8a
dc17435
98e82be
3b4df89
2a3fae3
dc17435
80d0b8a
dc17435
764d4f7
80d0b8a
d2d0219
798ae00
dc17435
764d4f7
80d0b8a
764d4f7
dc17435
2a3fae3
80d0b8a
2a3fae3
798ae00
764d4f7
2a3fae3
 
dc17435
98e82be
 
a911ba5
98e82be
a911ba5
98e82be
a911ba5
98e82be
a911ba5
dc17435
 
798ae00
dc17435
 
798ae00
dc17435
798ae00
92d0377
dc17435
98e82be
80d0b8a
98e82be
764d4f7
92d0377
dc17435
 
 
 
d2d0219
92d0377
dc17435
 
 
 
d2d0219
92d0377
dc17435
 
 
 
 
 
 
 
b7db40a
92d0377
dc17435
 
53425a8
9fd7d89
798ae00
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import io
import logging
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Flask app
app = Flask(__name__)

# Set Hugging Face cache directory.
# The Hugging Face libraries are already imported above and resolve HF_HOME at
# import time, so exporting the variable here cannot relocate the cache on its
# own. It is still exported for any child process, and the equivalent hub cache
# path is passed explicitly to from_pretrained() below.
os.environ["HF_HOME"] = "/app/hf_cache"
_HF_HUB_CACHE = os.path.join(os.environ["HF_HOME"], "hub")

# Load T5 model and tokenizer once at import so every request reuses them.
logger.info("Loading T5-Base model...")
try:
    tokenizer = T5Tokenizer.from_pretrained("t5-base", cache_dir=_HF_HUB_CACHE)
    model = T5ForConditionalGeneration.from_pretrained("t5-base", cache_dir=_HF_HUB_CACHE)
    logger.info("T5-Base model loaded successfully.")
except Exception as e:
    # The service is useless without the model: log with traceback and abort startup.
    logger.exception(f"Failed to load T5-Base: {str(e)}")
    raise

ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}


def allowed_file(filename):
    """Return True when the text after the last dot is a supported extension.

    The comparison is case-insensitive; a name without any dot is rejected.
    """
    _stem, dot, extension = filename.rpartition(".")
    if not dot:
        # No dot at all, so there is no extension to check.
        return False
    return extension.lower() in ALLOWED_EXTENSIONS

def summarize_text(text, max_length=150, min_length=30):
    """Produce a summary of ``text`` with the module-level T5 model.

    The input is prefixed with ``"summarize: "`` and truncated to 512 tokens
    before generation, which uses beam search with 4 beams.

    Args:
        text: Text to summarize.
        max_length: Upper bound passed to ``model.generate``.
        min_length: Lower bound passed to ``model.generate``.

    Returns:
        The decoded summary, a fixed notice when ``text`` is blank, or a fixed
        error string when anything in the pipeline raises (the error is logged).
    """
    try:
        # Blank input never reaches the model.
        if not text.strip():
            return "No text found in the document to summarize."

        encoded = tokenizer(
            "summarize: " + text,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        generation_options = {
            "max_length": max_length,
            "min_length": min_length,
            "length_penalty": 2.0,
            "num_beams": 4,
            "early_stopping": True,
        }
        generated = model.generate(encoded["input_ids"], **generation_options)
        # Only the first returned sequence is decoded.
        return tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as exc:
        logger.error(f"Error in T5 summarization: {str(exc)}")
        return "Error summarizing text."

@app.route("/", methods=["GET"])
def index():
    """Landing route: log the access and return a plain-text status banner."""
    banner = "Document Summarizer API with T5-Base is running! Use /summarize endpoint for POST requests."
    logger.info("Root endpoint accessed.")
    return banner

@app.route("/summarize", methods=["POST"])
def summarize():
    """Handle a document upload and respond with a JSON summary.

    Reads the multipart field ``file``, extracts its text with the extractor
    matching the file extension, and summarizes it with ``summarize_text``.

    Returns:
        ``{"filename": ..., "summary": ...}`` on success. ``{"error": ...}``
        with status 400 when the upload is missing, unnamed, of an unsupported
        type, or contains no extractable text, and with status 500 when
        extraction or summarization raises.
    """
    logger.info("Summarize endpoint called.")

    if "file" not in request.files:
        logger.error("No file uploaded.")
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files["file"]
    # Covers both an empty name and a missing (None) one.
    if not file.filename:
        logger.error("No file selected.")
        return jsonify({"error": "No selected file"}), 400

    if not allowed_file(file.filename):
        logger.error(f"Unsupported file format: {file.filename}")
        return jsonify({"error": "Unsupported file format"}), 400

    # Take the extension from the name allowed_file() just validated.
    # secure_filename() may strip the dot (".pdf" or an all-non-ASCII stem
    # becomes "pdf"), so splitting the sanitized name could raise IndexError.
    file_ext = file.filename.rsplit(".", 1)[1].lower()
    filename = secure_filename(file.filename)
    file_content = file.read()

    extractors = {
        "pdf": summarize_pdf,
        "docx": summarize_docx,
        "pptx": summarize_pptx,
        "txt": summarize_txt,
    }

    try:
        extract = extractors.get(file_ext)
        if extract is None:
            # Defensive: only reachable if ALLOWED_EXTENSIONS and the table diverge.
            return jsonify({"error": "Unsupported file format"}), 400

        text = extract(file_content)
        if not text.strip():
            return jsonify({"error": "No extractable text found in the document"}), 400

        summary = summarize_text(text)
        logger.info(f"File {filename} summarized successfully.")
        return jsonify({"filename": filename, "summary": summary})

    except Exception as e:
        # Request boundary: log with traceback and report a 500 to the client.
        logger.exception(f"Error processing file {filename}: {str(e)}")
        return jsonify({"error": f"Error processing file: {str(e)}"}), 500

def summarize_pdf(file_content):
    """Return the text of every page of a PDF given as bytes, newline-joined.

    Pages for which ``extract_text()`` yields nothing contribute an empty string.
    """
    pdf = PdfReader(io.BytesIO(file_content))
    page_texts = []
    for page in pdf.pages:
        extracted = page.extract_text()
        page_texts.append(extracted or "")
    return "\n".join(page_texts).strip()

def summarize_docx(file_content):
    """Return the non-blank paragraphs of a DOCX given as bytes, newline-joined."""
    document = Document(io.BytesIO(file_content))
    kept = []
    for paragraph in document.paragraphs:
        if not paragraph.text.strip():
            # Skip paragraphs that are empty or whitespace-only.
            continue
        kept.append(paragraph.text)
    return "\n".join(kept).strip()

def summarize_pptx(file_content):
    """Return the text of all text-bearing shapes in a PPTX, newline-joined.

    Shapes are visited slide by slide; shapes without a ``text`` attribute or
    with only whitespace are ignored, and each kept text is stripped.
    """
    deck = Presentation(io.BytesIO(file_content))
    fragments = [
        shape.text.strip()
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text") and shape.text.strip()
    ]
    return "\n".join(fragments).strip()

def summarize_txt(file_content):
    """Decode the bytes of a plain-text upload and return the stripped text.

    Args:
        file_content: Raw file bytes, expected to be UTF-8 encoded.

    Returns:
        The decoded text with surrounding whitespace removed.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading BOM,
    # which str.strip() would otherwise leave in the text handed to the model.
    return file_content.decode("utf-8-sig").strip()

if __name__ == "__main__":
    # The server binds to all interfaces, and Flask's debug mode exposes the
    # Werkzeug interactive debugger (arbitrary code execution from a browser).
    # Debug is therefore opt-in: set FLASK_DEBUG=1 for local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)