# Source: Hugging Face file view of app.py (uploaded by mike23415).
# Commit 764d4f7 "Update app.py" (verified); file size 3.19 kB.
import os
import uuid

import fitz # PyMuPDF for PDF
import pytesseract
from PIL import Image
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from transformers import T5Tokenizer, T5ForConditionalGeneration
from pptx import Presentation
from docx import Document
# Flask application object; the /summarize route in this file is registered on it.
app = Flask(__name__)
# Directory (relative to the working directory) where uploads are written
# while they are being processed.
app.config["UPLOAD_FOLDER"] = "uploads"
# Create the upload directory at import time; exist_ok=True makes this a
# no-op when the directory is already there.
os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)
# Load the T5 tokenizer and model once at import time; summarize_text() reuses
# these module-level objects on every call.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page text with leading and trailing whitespace
        stripped; an empty string when no page yields any text.
    """
    # Open through the context manager so the document is closed even when
    # extraction raises. The upload handler deletes the file right after this
    # call, and a lingering open handle can make that deletion fail.
    with fitz.open(pdf_path) as doc:
        pages = [page.get_text("text") for page in doc]
    return "\n".join(pages).strip()
def extract_text_from_pptx(pptx_path):
    """Return the text of all text-bearing shapes in a PowerPoint file.

    Slides are visited in order, and within each slide every shape exposing a
    ``text`` attribute contributes one entry; entries are joined by newlines
    and the result is stripped of surrounding whitespace.
    """
    deck = Presentation(pptx_path)
    chunks = []
    for slide in deck.slides:
        for shape in slide.shapes:
            # Shapes without a ``text`` attribute are skipped.
            if hasattr(shape, "text"):
                chunks.append(shape.text)
    return "\n".join(chunks).strip()
def extract_text_from_docx(docx_path):
    """Return the text of a Word document, one paragraph per line.

    Only the entries of ``Document(...).paragraphs`` are read; the joined
    text is stripped of surrounding whitespace before being returned.
    """
    paragraphs = Document(docx_path).paragraphs
    joined = "\n".join(para.text for para in paragraphs)
    return joined.strip()
def extract_text_from_image(image_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text reported by ``pytesseract.image_to_string`` with leading and
        trailing whitespace stripped.
    """
    # Image.open keeps the underlying file open; the context manager releases
    # it deterministically, including when OCR raises, so the upload handler
    # can delete the file afterwards.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)
    return text.strip()
def summarize_text(input_text):
    """Summarise ``input_text`` with the module-level T5 model.

    The text is prefixed with ``"summarize: "``, encoded with truncation at
    512 tokens, and decoded from the first sequence returned by beam search
    (4 beams, 30 to 100 tokens).
    """
    prompt = "summarize: " + input_text
    encoded_prompt = tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    generated = model.generate(
        encoded_prompt,
        max_length=100,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)
# Maps a lower-cased file extension to the function that extracts its text.
_TEXT_EXTRACTORS = {
    ".pdf": extract_text_from_pdf,
    ".pptx": extract_text_from_pptx,
    ".docx": extract_text_from_docx,
    ".png": extract_text_from_image,
    ".jpg": extract_text_from_image,
    ".jpeg": extract_text_from_image,
}


@app.route("/summarize", methods=["POST"])
def summarize_file():
    """Summarise an uploaded document (multipart form field ``file``).

    Supported extensions, matched case-insensitively: .pdf, .pptx, .docx,
    .png, .jpg, .jpeg.

    Returns:
        A JSON response, with a status code on errors:
        * ``{"summary": ...}`` on success (default 200).
        * ``{"error": ...}`` with 400 when no file was sent, the file name is
          empty, the extension is unsupported, or no text could be extracted.
        * ``{"error": ...}`` with 500 when saving, extraction or
          summarisation raises.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files["file"]
    if not file.filename:
        return jsonify({"error": "No selected file"}), 400

    # Decide whether the type is supported before anything is written to
    # disk. Only the extension of the client-supplied name is used, and only
    # after it matched the whitelist, so the name itself never reaches the
    # filesystem.
    extension = os.path.splitext(file.filename)[1].lower()
    extractor = _TEXT_EXTRACTORS.get(extension)
    if extractor is None:
        return jsonify({"error": "Unsupported file type"}), 400

    # A generated name keeps concurrent uploads that share a file name from
    # overwriting, or deleting, each other's temporary file.
    file_path = os.path.join(
        app.config["UPLOAD_FOLDER"], f"{uuid.uuid4().hex}{extension}"
    )
    try:
        file.save(file_path)
        text = extractor(file_path)
        if not text:
            return jsonify({"error": "No text found in the file"}), 400
        return jsonify({"summary": summarize_text(text)})
    except Exception as e:
        # Request boundary: report the failure as JSON instead of letting the
        # exception propagate out of the view.
        return jsonify({"error": str(e)}), 500
    finally:
        # save() may have failed before the file existed; only remove what is
        # actually there so cleanup cannot mask the real response.
        if os.path.exists(file_path):
            os.remove(file_path)
# Script entry point: when executed directly, serve the app with Flask's
# built-in server on all interfaces, port 7860.
if __name__ == "__main__":
    app.run(port=7860, host="0.0.0.0")