import os
import io
import logging
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation

# Set the Hugging Face cache to a writable directory *before* importing
# transformers, so the library picks up the cache location when it loads.
os.environ["HF_HOME"] = "/tmp/huggingface_cache"

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Load T5 model and tokenizer
logger.info("Loading T5-Base model...")
try:
    tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    logger.info("T5-Base model loaded successfully.")
except Exception as e:
    logger.error(f"Failed to load T5-Base: {str(e)}")
    raise
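
# The model and tokenizer are loaded once at startup and shared across all
# requests; on the first run, the t5-base weights (several hundred megabytes)
# are downloaded into the cache directory configured above.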

ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}


def allowed_file(filename):
    """Check if the uploaded file has an allowed extension."""
    return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS

def summarize_text(text, max_length=150, min_length=30):
    """Summarize text using T5-Base."""
    if not text.strip():
        return "No meaningful text found in the document."
    try:
        input_text = "summarize: " + text
        inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
        summary_ids = model.generate(
            inputs["input_ids"],
            max_length=max_length,
            min_length=min_length,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary
    except Exception as e:
        logger.error(f"Error in T5 summarization: {str(e)}")
        return "Error in summarization process."
@app.route("/", methods=["GET"])
def index():
"""Root endpoint for API status check."""
logger.info("Root endpoint accessed.")
return "Document Summarizer API with T5-Base is running! Use /summarize for POST requests."
@app.route("/summarize", methods=["POST"])
def summarize():
"""Handle file uploads and summarize the content."""
logger.info("Summarize endpoint called.")
if "file" not in request.files:
logger.error("No file uploaded.")
return jsonify({"error": "No file uploaded"}), 400
file = request.files["file"]
if file.filename == "":
logger.error("No file selected.")
return jsonify({"error": "No selected file"}), 400
if not allowed_file(file.filename):
logger.error(f"Unsupported file format: {file.filename}")
return jsonify({"error": "Unsupported file format"}), 400
filename = secure_filename(file.filename)
file_content = file.read()
file_ext = filename.rsplit(".", 1)[1].lower()
try:
if file_ext == "pdf":
text = summarize_pdf(file_content)
elif file_ext == "docx":
text = summarize_docx(file_content)
elif file_ext == "pptx":
text = summarize_pptx(file_content)
elif file_ext == "txt":
text = summarize_txt(file_content)
summary = summarize_text(text)
logger.info(f"File {filename} summarized successfully.")
return jsonify({"filename": filename, "summary": summary})
except Exception as e:
logger.error(f"Error processing file {filename}: {str(e)}")
return jsonify({"error": f"Error processing file: {str(e)}"}), 500

def summarize_pdf(file_content):
    """Extract text from a PDF file."""
    try:
        reader = PdfReader(io.BytesIO(file_content))
        text = "\n".join([page.extract_text() or "" for page in reader.pages])
        return text.strip() or "No extractable text found in PDF."
    except Exception as e:
        logger.error(f"Error reading PDF: {str(e)}")
        return "Error extracting text from PDF."

def summarize_docx(file_content):
    """Extract text from a DOCX file."""
    try:
        doc = Document(io.BytesIO(file_content))
        text = "\n".join([para.text for para in doc.paragraphs])
        return text.strip() or "No extractable text found in DOCX."
    except Exception as e:
        logger.error(f"Error reading DOCX: {str(e)}")
        return "Error extracting text from DOCX."

def summarize_pptx(file_content):
    """Extract text from a PPTX file."""
    try:
        ppt = Presentation(io.BytesIO(file_content))
        text = []
        for slide in ppt.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    text.append(shape.text)
        return "\n".join(text).strip() or "No extractable text found in PPTX."
    except Exception as e:
        logger.error(f"Error reading PPTX: {str(e)}")
        return "Error extracting text from PPTX."

def summarize_txt(file_content):
    """Extract text from a TXT file with safe decoding."""
    try:
        return file_content.decode("utf-8").strip() or "No extractable text found in TXT."
    except UnicodeDecodeError:
        return file_content.decode("latin-1").strip() or "No extractable text found in TXT."
if __name__ == "__main__":
app.run(host="0.0.0.0", port=7860, debug=True)