Spaces:
Sleeping
Sleeping
File size: 4,754 Bytes
764d4f7 2a3fae3 80d0b8a b7db40a 764d4f7 2a3fae3 764d4f7 2a3fae3 a911ba5 524f780 a911ba5 80d0b8a dc17435 764d4f7 b7db40a dc17435 012dc5b a911ba5 012dc5b dc17435 012dc5b d2d0219 2a3fae3 d2d0219 2a3fae3 798ae00 2a3fae3 d2d0219 a911ba5 dc17435 a911ba5 dc17435 92d0377 98e82be dc17435 80d0b8a dc17435 98e82be 3b4df89 2a3fae3 80d0b8a 3696c1f 764d4f7 80d0b8a d2d0219 3696c1f dc17435 3696c1f 764d4f7 80d0b8a 764d4f7 3696c1f 2a3fae3 80d0b8a 2a3fae3 798ae00 3696c1f 764d4f7 2a3fae3 3696c1f 98e82be a911ba5 98e82be a911ba5 98e82be a911ba5 98e82be a911ba5 dc17435 3696c1f dc17435 798ae00 3696c1f dc17435 3696c1f 798ae00 92d0377 3696c1f 98e82be 80d0b8a 98e82be 764d4f7 92d0377 dc17435 d2d0219 92d0377 dc17435 d2d0219 92d0377 dc17435 b7db40a 92d0377 dc17435 53425a8 9fd7d89 798ae00 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import os
import io
import logging
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation
from transformers import T5Tokenizer, T5ForConditionalGeneration
# --- Logging ---------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Flask application -----------------------------------------------------
app = Flask(__name__)

# --- Hugging Face cache location -------------------------------------------
# NOTE(review): transformers is already imported at the top of this file by
# the time this variable is assigned; confirm the library still honours a
# value set this late, otherwise set it in the environment before start-up.
os.environ["HF_HOME"] = "/app/hf_cache"

# --- Model and tokenizer (loaded once, at import time) ---------------------
# Any failure is logged and re-raised so the process does not start without
# a usable model.
logger.info("Loading T5-Base model...")
try:
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
except Exception as e:
    logger.error(f"Failed to load T5-Base: {e}")
    raise
else:
    logger.info("T5-Base model loaded successfully.")
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}


def allowed_file(filename):
    """Return True when *filename* ends in one of ALLOWED_EXTENSIONS.

    The extension is whatever follows the last "." in the name and is
    compared case-insensitively.

    Args:
        filename: The client-supplied file name. ``None`` or any other
            non-string value is treated as "not allowed" instead of raising.

    Returns:
        bool: True if the name contains a "." and its final suffix is an
        allowed extension, otherwise False.
    """
    # An upload may arrive without a usable name; reject it rather than
    # letting the "in" test below raise TypeError.
    if not isinstance(filename, str):
        return False
    _, dot, extension = filename.rpartition(".")
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
def summarize_text(text, max_length=150, min_length=30):
    """Summarize *text* with the module-level T5-Base model.

    The text is prefixed with "summarize: " and tokenized with truncation at
    512 tokens, so only the beginning of a long document reaches the model.

    Args:
        text: Plain text to summarize. ``None``, empty, or whitespace-only
            input yields the "no text" placeholder.
        max_length: Maximum length passed to ``model.generate``.
        min_length: Minimum length passed to ``model.generate``.

    Returns:
        str: The decoded summary, or a fixed placeholder message when there
        is nothing to summarize or when summarization fails. This function
        does not raise; failures are logged with their traceback.
    """
    try:
        if not text or not text.strip():
            return "No text found in the document to summarize."

        inputs = tokenizer(
            "summarize: " + text,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        summary_ids = model.generate(
            inputs["input_ids"],
            # Forward the tokenizer's mask explicitly instead of leaving
            # generate() to infer which positions are real tokens.
            attention_mask=inputs.get("attention_mask"),
            max_length=max_length,
            min_length=min_length,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )
        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    except Exception:
        # Best-effort contract: callers always get a string back, but the
        # full traceback is kept in the log for diagnosis.
        logger.exception("Error in T5 summarization")
        return "Error summarizing text."
@app.route("/", methods=["GET"])
def index():
    """Serve the landing route: log the hit and return a plain status line."""
    logger.info("Root endpoint accessed.")
    status_message = (
        "Document Summarizer API with T5-Base is running! "
        "Use /summarize endpoint for POST requests."
    )
    return status_message
@app.route("/summarize", methods=["POST"])
def summarize():
    """Summarize an uploaded document.

    Expects a multipart upload under the form field "file" with a .pdf,
    .docx, .pptx or .txt name. The text is extracted with the matching
    ``summarize_*`` helper and passed to ``summarize_text``.

    Returns:
        A JSON response:
        * 200 ``{"filename": ..., "summary": ...}`` on success;
        * 400 ``{"error": ...}`` when the file is missing, unnamed, or has
          an unsupported extension;
        * 500 ``{"error": ...}`` when reading or extracting the file fails.
    """
    logger.info("Summarize endpoint called.")

    # Check if a file is in the request
    if "file" not in request.files:
        logger.error("No file uploaded.")
        return jsonify({"error": "No file uploaded"}), 400

    upload = request.files["file"]

    # Check if file is empty (covers both "" and a missing name)
    if not upload.filename:
        logger.error("No file selected.")
        return jsonify({"error": "No selected file"}), 400

    # Check if file has an allowed extension
    if not allowed_file(upload.filename):
        logger.error(f"Unsupported file format: {upload.filename}")
        return jsonify({"error": "Unsupported file format"}), 400

    # Take the extension from the raw name that was just validated.
    # secure_filename() can drop the dot entirely (e.g. ".pdf" or a
    # non-ASCII stem collapses to "pdf"), so splitting its result could
    # raise IndexError.
    file_ext = upload.filename.rsplit(".", 1)[1].lower()
    # The sanitized name is only used for logging and the response payload.
    filename = secure_filename(upload.filename)

    # Built per call because the extractor functions are defined further
    # down in the module.
    extractors = {
        "pdf": summarize_pdf,
        "docx": summarize_docx,
        "pptx": summarize_pptx,
        "txt": summarize_txt,
    }
    extract = extractors.get(file_ext)
    if extract is None:
        # Defensive: only reachable if ALLOWED_EXTENSIONS and the table
        # above ever disagree.
        logger.error("Unsupported file format received.")
        return jsonify({"error": "Unsupported file format"}), 400

    try:
        text = extract(upload.read())
        # Generate summary
        summary = summarize_text(text)
    except Exception as e:
        logger.exception("Error processing file %s", filename)
        return jsonify({"error": f"Error processing file: {str(e)}"}), 500
    else:
        logger.info(f"File {filename} summarized successfully.")
        return jsonify({"filename": filename, "summary": summary})
def summarize_pdf(file_content):
    """Return the text of every page of a PDF, joined by newlines.

    *file_content* is the raw bytes of the PDF. Pages for which the reader
    yields no text contribute an empty string. The joined result is stripped
    of leading and trailing whitespace.
    """
    pdf = PdfReader(io.BytesIO(file_content))
    page_texts = []
    for page in pdf.pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts).strip()
def summarize_docx(file_content):
    """Return the non-blank paragraphs of a DOCX file, joined by newlines.

    *file_content* is the raw bytes of the document. Only the entries of
    ``doc.paragraphs`` are read; whitespace-only paragraphs are skipped and
    the joined result is stripped.
    """
    document = Document(io.BytesIO(file_content))
    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return "\n".join(kept).strip()
def summarize_pptx(file_content):
    """Return the text of every text-bearing shape in a PPTX, one per line.

    *file_content* is the raw bytes of the presentation. Slides and shapes
    are visited in order; shapes without a ``text`` attribute or with only
    whitespace are skipped, and each kept text is stripped before joining.
    """
    deck = Presentation(io.BytesIO(file_content))
    snippets = [
        shape.text.strip()
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text") and shape.text.strip()
    ]
    return "\n".join(snippets).strip()
def summarize_txt(file_content):
    """Decode the bytes of a plain-text upload and strip outer whitespace.

    Args:
        file_content: Raw bytes of the uploaded file, expected to be UTF-8
            (with or without a leading byte-order mark).

    Returns:
        str: The decoded text without leading/trailing whitespace.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    # "utf-8-sig" behaves like "utf-8" but also drops a leading BOM, which
    # str.strip() would otherwise leave in place as U+FEFF.
    return file_content.decode("utf-8-sig").strip()
if __name__ == "__main__":
    # The server listens on all interfaces, so the Werkzeug interactive
    # debugger (which permits arbitrary code execution) must not be on by
    # default. Debug mode is opt-in via FLASK_DEBUG=1/true/yes/on. Keeping it
    # off also avoids the reloader importing this module, and loading the
    # model, a second time.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
        "1",
        "true",
        "yes",
        "on",
    }
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)
|