Spaces:

mike23415
/

playwebit-t5-api

Sleeping

App Files Files Community

playwebit-t5-api / app.py

mike23415

Update app.py

764d4f7 verified about 1 month ago

raw

history blame

3.19 kB

	import os
	import fitz # PyMuPDF for PDF
	import pytesseract
	from PIL import Image
	from flask import Flask, request, jsonify
	from werkzeug.utils import secure_filename
	from transformers import T5Tokenizer, T5ForConditionalGeneration
	from pptx import Presentation
	from docx import Document

	# Flask application object; uploaded files are staged on disk under the
	# directory named by the UPLOAD_FOLDER config key, created here if missing.
	app = Flask(__name__)
	app.config.update(UPLOAD_FOLDER="uploads")
	os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)

	# Load the pretrained T5 tokenizer and model once, at import time; both are
	# module-level globals used by summarize_text().
	model_name = "t5-base"
	tokenizer = T5Tokenizer.from_pretrained(model_name)
	model = T5ForConditionalGeneration.from_pretrained(model_name)

	# Function to extract text from PDFs
	def extract_text_from_pdf(pdf_path):
	    """Return the plain text of every page of a PDF, joined by newlines.

	    Args:
	        pdf_path: Filesystem path of the PDF to read.

	    Returns:
	        The concatenated page text with leading/trailing whitespace
	        stripped; an empty string when no page yields any text.
	    """
	    # Open the document as a context manager so it is closed before we
	    # return; the caller deletes the file right after extraction, and an
	    # open handle would leak (and block deletion on some platforms).
	    with fitz.open(pdf_path) as doc:
	        text = "\n".join(page.get_text("text") for page in doc)
	    return text.strip()

	# Function to extract text from PowerPoint files
	def extract_text_from_pptx(pptx_path):
	    """Return the text of every text-bearing shape in a PowerPoint file.

	    Slides are visited in order and, within each slide, shapes in order.
	    Shapes that expose no ``text`` attribute are skipped. The collected
	    pieces are joined with newlines and the result is stripped.
	    """
	    deck = Presentation(pptx_path)
	    pieces = []
	    for slide in deck.slides:
	        for shape in slide.shapes:
	            if hasattr(shape, "text"):
	                pieces.append(shape.text)
	    return "\n".join(pieces).strip()

	# Function to extract text from Word documents
	def extract_text_from_docx(docx_path):
	    """Return the paragraph text of a Word document, one paragraph per line.

	    Only ``document.paragraphs`` is read; the joined text is stripped of
	    leading and trailing whitespace before being returned.
	    """
	    paragraphs = Document(docx_path).paragraphs
	    joined = "\n".join(para.text for para in paragraphs)
	    return joined.strip()

	# Function to extract text from images using OCR
	def extract_text_from_image(image_path):
	    """Run OCR on an image file and return the recognised text.

	    Args:
	        image_path: Filesystem path of the image to read.

	    Returns:
	        The text produced by ``pytesseract.image_to_string`` with
	        leading/trailing whitespace stripped (may be empty).
	    """
	    # Image.open keeps the underlying file open until the image is closed;
	    # the context manager releases it before the caller deletes the upload.
	    with Image.open(image_path) as img:
	        text = pytesseract.image_to_string(img)
	    return text.strip()

	# Summarization function
	def summarize_text(input_text):
	    """Generate a summary of ``input_text`` with the module-level T5 model.

	    The text is prefixed with ``"summarize: "`` and tokenised, truncating
	    the encoded input to 512 tokens. Generation uses beam search with four
	    beams and a length bound of 30 to 100. The first generated sequence is
	    decoded with special tokens removed and returned as a string.
	    """
	    prompt = "summarize: " + input_text
	    encoded_prompt = tokenizer.encode(
	        prompt,
	        return_tensors="pt",
	        max_length=512,
	        truncation=True,
	    )
	    generation_options = {
	        "max_length": 100,
	        "min_length": 30,
	        "length_penalty": 2.0,
	        "num_beams": 4,
	        "early_stopping": True,
	    }
	    generated = model.generate(encoded_prompt, **generation_options)
	    best_sequence = generated[0]
	    return tokenizer.decode(best_sequence, skip_special_tokens=True)

	# API for file upload and summarization
	@app.route("/summarize", methods=["POST"])
	def summarize_file():
	    """Summarize the text content of an uploaded file.

	    Expects a multipart form upload under the ``file`` field. Supported
	    types, matched case-insensitively on the sanitised filename, are
	    ``.pdf``, ``.pptx``, ``.docx``, ``.png``, ``.jpg`` and ``.jpeg``.

	    Returns:
	        ``{"summary": ...}`` on success.
	        ``{"error": ...}`` with status 400 when no file is supplied, the
	        filename is empty, the type is unsupported, or no text is found.
	        ``{"error": ...}`` with status 500 when saving, extraction or
	        summarization raises.
	    """
	    if "file" not in request.files:
	        return jsonify({"error": "No file uploaded"}), 400

	    file = request.files["file"]
	    if file.filename == "":
	        return jsonify({"error": "No selected file"}), 400

	    filename = secure_filename(file.filename)

	    # Pick the extractor before touching the disk. Matching on the
	    # lower-cased name makes "REPORT.PDF" behave like "report.pdf", and a
	    # name that sanitises to "" falls through to "unsupported" instead of
	    # being saved onto the upload directory itself.
	    extractors = (
	        ((".pdf",), extract_text_from_pdf),
	        ((".pptx",), extract_text_from_pptx),
	        ((".docx",), extract_text_from_docx),
	        ((".png", ".jpg", ".jpeg"), extract_text_from_image),
	    )
	    lowered_name = filename.lower()
	    extractor = next(
	        (func for suffixes, func in extractors if lowered_name.endswith(suffixes)),
	        None,
	    )
	    if extractor is None:
	        return jsonify({"error": "Unsupported file type"}), 400

	    # A random prefix keeps concurrent uploads that share a filename from
	    # overwriting (and then deleting) each other's temporary file.
	    unique_name = f"{os.urandom(8).hex()}_{filename}"
	    file_path = os.path.join(app.config["UPLOAD_FOLDER"], unique_name)

	    try:
	        # Saving happens inside the try so a partially written file is
	        # still cleaned up and the failure is reported as JSON.
	        file.save(file_path)
	        text = extractor(file_path)

	        if not text:
	            return jsonify({"error": "No text found in the file"}), 400

	        summary = summarize_text(text)
	        return jsonify({"summary": summary})

	    except Exception as e:
	        # Request boundary: log the traceback server-side, answer with JSON.
	        # NOTE(review): str(e) exposes internal error text to the client;
	        # kept for backward compatibility with existing consumers.
	        app.logger.exception("Failed to summarize uploaded file %r", filename)
	        return jsonify({"error": str(e)}), 500
	    finally:
	        # Best-effort cleanup; the file may not exist if saving failed.
	        try:
	            os.remove(file_path)
	        except FileNotFoundError:
	            pass

	if __name__ == "__main__":
	    # Script entry point: serve on all interfaces, port 7860.
	    app.run(port=7860, host="0.0.0.0")