File size: 5,284 Bytes
764d4f7
2a3fae3
80d0b8a
b7db40a
764d4f7
2a3fae3
764d4f7
2a3fae3
a911ba5
524f780
a911ba5
80d0b8a
 
 
764d4f7
b7db40a
798ae00
 
012dc5b
 
a911ba5
012dc5b
798ae00
012dc5b
 
 
 
 
d2d0219
2a3fae3
d2d0219
2a3fae3
798ae00
2a3fae3
d2d0219
a911ba5
 
798ae00
 
 
a911ba5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
798ae00
92d0377
98e82be
 
798ae00
80d0b8a
798ae00
98e82be
3b4df89
2a3fae3
798ae00
80d0b8a
798ae00
764d4f7
80d0b8a
d2d0219
798ae00
764d4f7
798ae00
764d4f7
80d0b8a
764d4f7
798ae00
2a3fae3
80d0b8a
2a3fae3
798ae00
764d4f7
2a3fae3
 
798ae00
98e82be
 
a911ba5
98e82be
a911ba5
98e82be
a911ba5
98e82be
a911ba5
798ae00
a911ba5
798ae00
 
92d0377
798ae00
98e82be
80d0b8a
98e82be
764d4f7
92d0377
798ae00
 
 
 
 
 
 
 
d2d0219
92d0377
798ae00
 
 
 
 
 
 
 
d2d0219
92d0377
798ae00
 
 
 
 
 
 
 
 
 
 
 
b7db40a
92d0377
798ae00
 
 
 
 
53425a8
9fd7d89
798ae00
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import os
import io
import logging
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Writable directory for downloaded model files.
HF_CACHE_DIR = "/tmp/huggingface_cache"

# NOTE: `transformers` is already imported above, and the Hugging Face libraries
# resolve their default cache location from HF_HOME when they are imported.
# Setting the variable here is therefore too late to redirect the default cache;
# it is kept only for code that reads HF_HOME afterwards. The cache directory is
# passed explicitly to from_pretrained() below so the writable path is really used.
os.environ["HF_HOME"] = HF_CACHE_DIR
MODEL_CACHE_DIR = os.path.join(HF_CACHE_DIR, "hub")

# Load T5 model and tokenizer once at import time so every request reuses them.
logger.info("Loading T5-Base model...")
try:
    tokenizer = T5Tokenizer.from_pretrained(
        "t5-base", legacy=False, cache_dir=MODEL_CACHE_DIR
    )
    model = T5ForConditionalGeneration.from_pretrained(
        "t5-base", cache_dir=MODEL_CACHE_DIR
    )
    logger.info("T5-Base model loaded successfully.")
except Exception:
    # The service is useless without the model: log the full traceback and abort startup.
    logger.exception("Failed to load T5-Base")
    raise

ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}

def allowed_file(filename):
    """Return True when the text after the last dot is a supported extension.

    The comparison is case-insensitive. Names without any dot are rejected.
    """
    _stem, dot, extension = filename.rpartition(".")
    if not dot:
        # No dot at all, so there is no extension to check.
        return False
    return extension.lower() in ALLOWED_EXTENSIONS

def summarize_text(text, max_length=150, min_length=30):
    """Produce a summary of ``text`` with the module-level T5 model.

    Whitespace-only input short-circuits with a fixed message. Otherwise the
    text is prefixed with "summarize: ", tokenized with truncation at 512
    tokens and passed to ``model.generate``. ``max_length`` and ``min_length``
    are forwarded to ``generate`` unchanged. Any exception raised while
    tokenizing, generating or decoding is logged and replaced by a fixed
    error string, so this function always returns a string.
    """
    has_content = bool(text.strip())
    if not has_content:
        return "No meaningful text found in the document."

    generation_options = {
        "max_length": max_length,
        "min_length": min_length,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }

    try:
        prompt = "summarize: " + text
        encoded = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
        generated = model.generate(encoded["input_ids"], **generation_options)
        # Only the first generated sequence is decoded.
        return tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as e:
        logger.error(f"Error in T5 summarization: {str(e)}")
        return "Error in summarization process."

@app.route("/", methods=["GET"])
def index():
    """Status endpoint: log the access and return a plain-text banner."""
    status_message = "Document Summarizer API with T5-Base is running! Use /summarize for POST requests."
    logger.info("Root endpoint accessed.")
    return status_message

@app.route("/summarize", methods=["POST"])
def summarize():
    """Handle a file upload and return a summary of its content.

    Expects a multipart POST with the document in the ``file`` field.

    Returns:
        200 with ``{"filename": ..., "summary": ...}`` on success.
        400 with ``{"error": ...}`` when no file is sent, no file is selected,
        or the extension is not supported.
        500 with ``{"error": ...}`` when reading, extracting or summarizing raises.
    """
    logger.info("Summarize endpoint called.")

    if "file" not in request.files:
        logger.error("No file uploaded.")
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files["file"]

    # Covers both an empty string and a missing (None) filename.
    if not file.filename:
        logger.error("No file selected.")
        return jsonify({"error": "No selected file"}), 400

    if not allowed_file(file.filename):
        logger.error(f"Unsupported file format: {file.filename}")
        return jsonify({"error": "Unsupported file format"}), 400

    # Take the extension from the name that was just validated. secure_filename()
    # can strip characters (e.g. an all non-ASCII stem) and leave a result with no
    # dot, so splitting the sanitized name could raise IndexError.
    file_ext = file.filename.rsplit(".", 1)[1].lower()
    filename = secure_filename(file.filename) or f"upload.{file_ext}"

    # Extension -> text extractor. Names are resolved when the request runs.
    extractors = {
        "pdf": summarize_pdf,
        "docx": summarize_docx,
        "pptx": summarize_pptx,
        "txt": summarize_txt,
    }
    extractor = extractors.get(file_ext)
    if extractor is None:
        # Guards against ALLOWED_EXTENSIONS and the table above drifting apart.
        logger.error(f"Unsupported file format: {file.filename}")
        return jsonify({"error": "Unsupported file format"}), 400

    try:
        text = extractor(file.read())
        summary = summarize_text(text)

        logger.info(f"File {filename} summarized successfully.")
        return jsonify({"filename": filename, "summary": summary})

    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception(f"Error processing file {filename}: {str(e)}")
        return jsonify({"error": f"Error processing file: {str(e)}"}), 500

def summarize_pdf(file_content):
    """Return the text of every page of a PDF, joined by newlines.

    ``file_content`` is wrapped in a BytesIO and handed to ``PdfReader``.
    Pages that yield no text contribute an empty string. A fixed message is
    returned when the stripped result is empty, and a different fixed
    message when any exception is raised while reading.
    """
    try:
        pdf = PdfReader(io.BytesIO(file_content))
        page_texts = []
        for page in pdf.pages:
            extracted = page.extract_text()
            page_texts.append(extracted if extracted else "")
        joined = "\n".join(page_texts).strip()
        if joined:
            return joined
        return "No extractable text found in PDF."
    except Exception as e:
        logger.error(f"Error reading PDF: {str(e)}")
        return "Error extracting text from PDF."

def summarize_docx(file_content):
    """Return the paragraph text of a DOCX document, one paragraph per line.

    Only ``doc.paragraphs`` is read. A fixed message is returned when the
    stripped result is empty, and a different fixed message when any
    exception is raised while reading.
    """
    try:
        stream = io.BytesIO(file_content)
        document = Document(stream)
        body = "\n".join(paragraph.text for paragraph in document.paragraphs).strip()
        return body if body else "No extractable text found in DOCX."
    except Exception as e:
        logger.error(f"Error reading DOCX: {str(e)}")
        return "Error extracting text from DOCX."

def summarize_pptx(file_content):
    """Return the text of every shape that has a ``text`` attribute, in slide order.

    Shape texts are joined by newlines. A fixed message is returned when the
    stripped result is empty, and a different fixed message when any
    exception is raised while reading.
    """
    try:
        deck = Presentation(io.BytesIO(file_content))
        fragments = [
            shape.text
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        combined = "\n".join(fragments).strip()
        return combined or "No extractable text found in PPTX."
    except Exception as e:
        logger.error(f"Error reading PPTX: {str(e)}")
        return "Error extracting text from PPTX."

def summarize_txt(file_content):
    """Decode the bytes of a TXT upload into stripped text.

    Args:
        file_content: Raw bytes of the uploaded file.

    Returns:
        The decoded text with surrounding whitespace removed, or the fixed
        message "No extractable text found in TXT." when nothing remains.
    """
    try:
        # "utf-8-sig" decodes ordinary UTF-8 and also drops a leading byte-order
        # mark. With plain "utf-8" the BOM survives as U+FEFF, which strip() does
        # not remove, so it would pollute the text and make a BOM-only file
        # look non-empty.
        text = file_content.decode("utf-8-sig")
    except UnicodeDecodeError:
        # latin-1 maps every byte to a code point, so this fallback cannot fail.
        text = file_content.decode("latin-1")
    return text.strip() or "No extractable text found in TXT."

if __name__ == "__main__":
    # The server listens on all interfaces, so the Werkzeug interactive debugger
    # (which allows arbitrary code execution) must not be on by default.
    # Set FLASK_DEBUG=1 to opt in during local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)