Spaces:
Sleeping
Sleeping
import os | |
import fitz # PyMuPDF for PDF | |
import pytesseract | |
from PIL import Image | |
from flask import Flask, request, jsonify | |
from werkzeug.utils import secure_filename | |
from transformers import T5Tokenizer, T5ForConditionalGeneration | |
from pptx import Presentation | |
from docx import Document | |
# Flask application; uploads are staged in a local "uploads" directory.
app = Flask(__name__)
app.config.update(UPLOAD_FOLDER="uploads")
# exist_ok=True makes this a no-op when the directory is already present.
os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)

# Load the T5 tokenizer and model once, at import time, so that every
# request reuses the same instances.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, newline-separated, with leading and
        trailing whitespace stripped.
    """
    # Open the document as a context manager so it is closed as soon as the
    # text has been collected; an open document keeps the file handle alive,
    # which can prevent the caller from deleting the file afterwards.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    return text.strip()
# Function to extract text from PowerPoint files
def extract_text_from_pptx(pptx_path):
    """Collect the text of every text-bearing shape in a presentation.

    Args:
        pptx_path: Path of the .pptx file to read.

    Returns:
        The shapes' text in slide order, newline-separated, with leading
        and trailing whitespace stripped.
    """
    deck = Presentation(pptx_path)
    pieces = []
    for slide in deck.slides:
        for shape in slide.shapes:
            # Only shapes exposing a ``text`` attribute contribute.
            if hasattr(shape, "text"):
                pieces.append(shape.text)
    return "\n".join(pieces).strip()
# Function to extract text from Word documents
def extract_text_from_docx(docx_path):
    """Return the paragraph text of a Word document as one string.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The text of each paragraph in ``document.paragraphs``,
        newline-separated, with leading and trailing whitespace stripped.
    """
    paragraphs = Document(docx_path).paragraphs
    lines = (para.text for para in paragraphs)
    return "\n".join(lines).strip()
# Function to extract text from images using OCR
def extract_text_from_image(image_path):
    """Run OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``, with
        leading and trailing whitespace stripped.
    """
    # Use the image as a context manager so its underlying file is closed
    # deterministically once OCR is done, instead of relying on garbage
    # collection; the caller deletes the file right after this returns.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)
    return text.strip()
# Summarization function
def summarize_text(input_text):
    """Summarize a piece of text with the module-level T5 model.

    Args:
        input_text: The text to summarize.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # The text is prefixed with "summarize: " and the encoded prompt is
    # truncated to at most 512 tokens.
    prompt = "summarize: " + input_text
    encoded_prompt = tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )

    # Beam search with 4 beams, producing between 30 and 100 tokens.
    generation_options = {
        "max_length": 100,
        "min_length": 30,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = model.generate(encoded_prompt, **generation_options)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
# API for file upload and summarization
# NOTE(review): the view was not registered with the app, so no endpoint was
# ever served. The "/summarize" path is an assumption -- confirm it against
# whatever client calls this service.
@app.route("/summarize", methods=["POST"])
def summarize_file():
    """Summarize an uploaded document.

    Expects a multipart form upload with the document in the ``file`` field.
    Supported extensions (matched case-insensitively): .pdf, .pptx, .docx,
    .png, .jpg, .jpeg.

    Returns:
        ``{"summary": ...}`` on success; otherwise ``{"error": ...}`` with
        status 400 for a missing, unnamed, unsupported or text-free upload,
        or status 500 when saving, extraction or summarization raises.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files["file"]
    # ``not`` covers both an empty and a missing (None) filename.
    if not file.filename:
        return jsonify({"error": "No selected file"}), 400

    filename = secure_filename(file.filename)
    # secure_filename() can strip a name down to nothing; saving would then
    # target the upload directory itself and fail.
    if not filename:
        return jsonify({"error": "Invalid file name"}), 400

    extractors = {
        ".pdf": extract_text_from_pdf,
        ".pptx": extract_text_from_pptx,
        ".docx": extract_text_from_docx,
        ".png": extract_text_from_image,
        ".jpg": extract_text_from_image,
        ".jpeg": extract_text_from_image,
    }
    # Lower-case the extension so "REPORT.PDF" is handled like "report.pdf",
    # consistently for every supported type.
    extension = os.path.splitext(filename)[1].lower()
    extractor = extractors.get(extension)
    # Reject unsupported types before anything is written to disk.
    if extractor is None:
        return jsonify({"error": "Unsupported file type"}), 400

    # NOTE(review): uploads that share a name map to the same path, so
    # overlapping requests could overwrite or delete each other's file.
    file_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
    try:
        # Saving happens inside the try so a failed or partial write is
        # reported as JSON and still cleaned up.
        file.save(file_path)
        text = extractor(file_path)
        if not text:
            return jsonify({"error": "No text found in the file"}), 400
        return jsonify({"summary": summarize_text(text)})
    except Exception as e:
        app.logger.exception("Failed to summarize upload %s", filename)
        return jsonify({"error": str(e)}), 500
    finally:
        # Clean up. A failed removal must not replace the response that is
        # already being returned.
        try:
            os.remove(file_path)
        except OSError:
            app.logger.warning("Could not remove uploaded file %s", file_path)
if __name__ == "__main__":
    # Start Flask's built-in server only when the file is executed directly;
    # host 0.0.0.0 listens on every network interface, on port 7860.
    app.run(port=7860, host="0.0.0.0")