import os
import io
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
app = Flask(__name__)
# Download the required NLTK data when the app starts
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # newer NLTK releases (3.9+) resolve tokenizers via punkt_tab
nltk.download('stopwords', quiet=True)
# Allowed file extensions
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
def allowed_file(filename):
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
# Extractive summarization function
def extractive_summary(text, num_sentences=5):
"""
Summarizes the given text by selecting the top N most important sentences.
Args:
text (str): The text to summarize.
num_sentences (int): Number of sentences to include in the summary (default: 5).
Returns:
str: The summarized text.
"""
# Get stop words (e.g., "the", "is") to ignore them
stop_words = set(stopwords.words('english'))
# Tokenize text into words and sentences
words = word_tokenize(text)
sentences = sent_tokenize(text)
# If the text has fewer sentences than requested, return the full text
if len(sentences) <= num_sentences:
return text
    # Build a word-frequency table, ignoring stop words and non-alphanumeric tokens
freq_table = {}
for word in words:
word = word.lower()
if word not in stop_words and word.isalnum():
freq_table[word] = freq_table.get(word, 0) + 1
    # Score sentences by summing the frequencies of the words they contain
    # (tokenizing each sentence avoids substring matches, e.g. "art" inside "start")
    sentence_scores = {}
    for sentence in sentences:
        for word in set(word_tokenize(sentence.lower())):
            if word in freq_table:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + freq_table[word]
    # Select the top N sentences with the highest scores, then restore
    # document order so the summary reads naturally
    top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
    summary = ' '.join(sorted(top_sentences, key=sentences.index))
return summary
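# Illustrative example (hypothetical input): with num_sentences=2,
#   extractive_summary("Cats purr. Cats nap often. Dogs bark.", 2)
# favours the two "Cats" sentences, since "cats" is the most frequent
# non-stopword and both of those sentences contain it.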
@app.route("/", methods=["GET"])
def index():
return "Document Summarizer API is running! Use /summarize endpoint for POST requests."
@app.route("/summarize", methods=["POST"])
def summarize():
if "file" not in request.files:
return jsonify({"error": "No file uploaded"}), 400
file = request.files["file"]
if file.filename == "":
return jsonify({"error": "No selected file"}), 400
if not allowed_file(file.filename):
return jsonify({"error": "Unsupported file format"}), 400
filename = secure_filename(file.filename)
file_content = file.read()
# Process file based on type
summary = None
file_ext = filename.rsplit(".", 1)[1].lower()
try:
if file_ext == "pdf":
summary = summarize_pdf(file_content)
elif file_ext == "docx":
summary = summarize_docx(file_content)
elif file_ext == "pptx":
summary = summarize_pptx(file_content)
elif file_ext == "txt":
summary = summarize_txt(file_content)
return jsonify({"filename": filename, "summary": summary})
except Exception as e:
return jsonify({"error": f"Error processing file: {str(e)}"}), 500
# Summarization functions
def summarize_pdf(file_content):
    reader = PdfReader(io.BytesIO(file_content))
    # Extract each page's text once, skipping pages with no extractable text
    page_texts = (page.extract_text() for page in reader.pages)
    text = "\n".join(t for t in page_texts if t)
    return extractive_summary(text, num_sentences=5)
def summarize_docx(file_content):
doc = Document(io.BytesIO(file_content))
text = "\n".join([para.text for para in doc.paragraphs])
return extractive_summary(text, num_sentences=5)
def summarize_pptx(file_content):
ppt = Presentation(io.BytesIO(file_content))
text = []
for slide in ppt.slides:
for shape in slide.shapes:
if hasattr(shape, "text"):
text.append(shape.text)
full_text = "\n".join(text)
return extractive_summary(full_text, num_sentences=5)
def summarize_txt(file_content):
    # Decode as UTF-8; replace undecodable bytes rather than raising on odd encodings
    text = file_content.decode("utf-8", errors="replace")
    return extractive_summary(text, num_sentences=5)
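# Each helper above differs only in how raw text is pulled out of the file;
# all of them delegate the actual summarization to extractive_summary.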
if __name__ == "__main__":
    # debug=True is convenient during development; disable it for any real deployment
    app.run(host="0.0.0.0", port=7860, debug=True)