File size: 4,754 Bytes
764d4f7
2a3fae3
80d0b8a
b7db40a
764d4f7
2a3fae3
764d4f7
2a3fae3
a911ba5
524f780
a911ba5
80d0b8a
 
 
dc17435
764d4f7
b7db40a
dc17435
 
012dc5b
 
a911ba5
012dc5b
dc17435
012dc5b
 
 
 
 
d2d0219
2a3fae3
d2d0219
2a3fae3
798ae00
2a3fae3
d2d0219
a911ba5
 
 
dc17435
 
 
a911ba5
 
 
 
 
 
 
 
 
 
 
 
 
 
dc17435
92d0377
98e82be
 
dc17435
80d0b8a
dc17435
98e82be
3b4df89
2a3fae3
80d0b8a
3696c1f
 
764d4f7
80d0b8a
d2d0219
3696c1f
dc17435
3696c1f
 
764d4f7
80d0b8a
764d4f7
3696c1f
 
2a3fae3
80d0b8a
2a3fae3
798ae00
3696c1f
764d4f7
2a3fae3
 
3696c1f
98e82be
 
a911ba5
98e82be
a911ba5
98e82be
a911ba5
98e82be
a911ba5
dc17435
3696c1f
dc17435
798ae00
3696c1f
dc17435
3696c1f
798ae00
92d0377
3696c1f
98e82be
80d0b8a
98e82be
764d4f7
92d0377
dc17435
 
 
 
d2d0219
92d0377
dc17435
 
 
 
d2d0219
92d0377
dc17435
 
 
 
 
 
 
 
b7db40a
92d0377
dc17435
 
53425a8
9fd7d89
798ae00
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os
import io
import logging
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Flask app
app = Flask(__name__)

# Hugging Face cache directory.
# The Hugging Face libraries resolve HF_HOME when they are first imported,
# and `transformers` is imported at the top of this file. Assigning the
# variable here therefore cannot redirect the download cache by itself.
# It is still exported (for child processes and late readers), and the
# cache location is additionally passed explicitly to from_pretrained().
HF_HOME_DIR = "/app/hf_cache"
os.environ["HF_HOME"] = HF_HOME_DIR
# "<HF_HOME>/hub" is the layout the hub cache uses when HF_HOME is honored.
HF_HUB_CACHE_DIR = os.path.join(HF_HOME_DIR, "hub")

# Load T5 model and tokenizer once at import time so every request reuses them.
logger.info("Loading T5-Base model...")
try:
    tokenizer = T5Tokenizer.from_pretrained("t5-base", cache_dir=HF_HUB_CACHE_DIR)
    model = T5ForConditionalGeneration.from_pretrained(
        "t5-base", cache_dir=HF_HUB_CACHE_DIR
    )
    logger.info("T5-Base model loaded successfully.")
except Exception as e:
    # The service cannot work without the model: log, then abort start-up.
    logger.error("Failed to load T5-Base: %s", e)
    raise

# Lower-case file extensions (without the dot) accepted by /summarize.
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}

def allowed_file(filename):
    """Return True when *filename* ends in one of ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared
    case-insensitively. Names without any dot are rejected.
    """
    if "." not in filename:
        return False
    extension = filename.rsplit(".", 1)[-1]
    return extension.lower() in ALLOWED_EXTENSIONS

def summarize_text(text, max_length=150, min_length=30):
    """Produce a summary of *text* with the module-level T5 model.

    Args:
        text: Document text to summarize.
        max_length: Upper bound passed to ``model.generate``.
        min_length: Lower bound passed to ``model.generate``.

    Returns:
        The decoded summary, a fixed notice when *text* is blank, or a
        fixed error string when anything in the pipeline raises.
    """
    try:
        stripped = text.strip()
        if not stripped:
            return "No text found in the document to summarize."

        # T5 selects its task from a textual prefix; input is cut at 512 tokens.
        encoded = tokenizer(
            "summarize: " + text,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        generation_options = {
            "max_length": max_length,
            "min_length": min_length,
            "length_penalty": 2.0,
            "num_beams": 4,
            "early_stopping": True,
        }
        generated = model.generate(encoded["input_ids"], **generation_options)
        return tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as e:
        # Failures are reported as text rather than raised to the caller.
        logger.error(f"Error in T5 summarization: {str(e)}")
        return "Error summarizing text."

@app.route("/", methods=["GET"])
def index():
    """Return a plain-text banner saying the API is running."""
    banner = "Document Summarizer API with T5-Base is running! Use /summarize endpoint for POST requests."
    logger.info("Root endpoint accessed.")
    return banner

@app.route("/summarize", methods=["POST"])
def summarize():
    """Summarize an uploaded document and return the result as JSON.

    Reads the upload from the ``file`` field of the request, extracts its
    text with the extractor matching its extension, and summarizes that
    text with ``summarize_text``.

    Returns:
        ``{"filename": ..., "summary": ...}`` on success; otherwise
        ``{"error": ...}`` with status 400 (missing, unnamed or
        unsupported upload) or 500 (extraction/summarization raised).
    """
    logger.info("Summarize endpoint called.")

    # Check if a file is in the request
    if "file" not in request.files:
        logger.error("No file uploaded.")
        return jsonify({"error": "No file uploaded"}), 400

    upload = request.files["file"]

    # Truthiness also rejects a filename of None, which would otherwise
    # raise inside allowed_file() instead of producing a 400.
    if not upload.filename:
        logger.error("No file selected.")
        return jsonify({"error": "No selected file"}), 400

    # Check if file has an allowed extension
    if not allowed_file(upload.filename):
        logger.error(f"Unsupported file format: {upload.filename}")
        return jsonify({"error": "Unsupported file format"}), 400

    # Take the extension from the name that was just validated.
    # secure_filename() drops non-ASCII characters, so a name whose stem is
    # entirely non-ASCII collapses to just "pdf" (no dot); splitting the
    # sanitized name then raised IndexError before the try block.
    file_ext = upload.filename.rsplit(".", 1)[1].lower()
    filename = secure_filename(upload.filename)

    extractors = {
        "pdf": summarize_pdf,
        "docx": summarize_docx,
        "pptx": summarize_pptx,
        "txt": summarize_txt,
    }
    extractor = extractors.get(file_ext)
    if extractor is None:
        # Defensive: only reachable if ALLOWED_EXTENSIONS gains an entry
        # that has no extractor registered above.
        logger.error("Unsupported file format received.")
        return jsonify({"error": "Unsupported file format"}), 400

    try:
        text = extractor(upload.read())

        # Generate summary
        summary = summarize_text(text)

        logger.info(f"File {filename} summarized successfully.")
        return jsonify({"filename": filename, "summary": summary})

    except Exception as e:
        # Request boundary: report any parsing failure as a JSON 500.
        logger.exception(f"Error processing file {filename}: {str(e)}")
        return jsonify({"error": f"Error processing file: {str(e)}"}), 500

def summarize_pdf(file_content):
    """Return the text of every page of a PDF supplied as raw bytes.

    Page texts are joined with newlines and the result is stripped.
    """
    pdf = PdfReader(io.BytesIO(file_content))
    page_texts = []
    for page in pdf.pages:
        extracted = page.extract_text()
        # A falsy extraction result is recorded as an empty page.
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts).strip()

def summarize_docx(file_content):
    """Return the non-blank paragraph text of a DOCX supplied as raw bytes.

    Paragraphs are joined with newlines and the result is stripped.
    """
    document = Document(io.BytesIO(file_content))
    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return "\n".join(kept).strip()

def summarize_pptx(file_content):
    """Return the text of all shapes in a PPTX supplied as raw bytes.

    Every shape that exposes a non-blank ``text`` attribute contributes
    its stripped text; fragments are joined with newlines in slide order.
    """
    presentation = Presentation(io.BytesIO(file_content))
    fragments = []
    for slide in presentation.slides:
        fragments.extend(
            shape.text.strip()
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text.strip()
        )
    return "\n".join(fragments).strip()

def summarize_txt(file_content):
    """Decode a UTF-8 text upload and return it stripped of whitespace.

    Args:
        file_content: Raw bytes of the uploaded ``.txt`` file.

    Returns:
        The decoded text without leading/trailing whitespace.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading
    # byte-order mark. U+FEFF is not whitespace, so strip() would leave it
    # at the start of the text.
    return file_content.decode("utf-8-sig").strip()

if __name__ == "__main__":
    # The Werkzeug interactive debugger lets anyone who can reach the server
    # execute arbitrary Python. Because the app binds to all interfaces,
    # debug mode is opt-in through FLASK_DEBUG instead of always on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
        "1",
        "true",
        "yes",
        "on",
    }
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)