import os
import io
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
app = Flask(__name__)
# Download the required NLTK data when the app starts
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # newer NLTK releases (3.9+) resolve tokenizers via punkt_tab
nltk.download('stopwords', quiet=True)
# Allowed file extensions
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
def allowed_file(filename):
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
# Extractive summarization function
def extractive_summary(text, num_sentences=5):
"""
Summarizes the given text by selecting the top N most important sentences.
Args:
text (str): The text to summarize.
num_sentences (int): Number of sentences to include in the summary (default: 5).
Returns:
str: The summarized text.
"""
# Get stop words (e.g., "the", "is") to ignore them
stop_words = set(stopwords.words('english'))
# Tokenize text into words and sentences
words = word_tokenize(text)
sentences = sent_tokenize(text)
# If the text has fewer sentences than requested, return the full text
if len(sentences) <= num_sentences:
return text
    # Build a word-frequency table, ignoring stop words and non-alphanumeric tokens
freq_table = {}
for word in words:
word = word.lower()
if word not in stop_words and word.isalnum():
freq_table[word] = freq_table.get(word, 0) + 1
    # Score sentences by summing the frequencies of the words they contain
    # (tokenizing each sentence avoids substring matches, e.g. "art" inside "start")
    sentence_scores = {}
    for sentence in sentences:
        for word in set(word_tokenize(sentence.lower())):
            if word in freq_table:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + freq_table[word]
    # Select the top N sentences with the highest scores, then restore
    # document order so the summary reads naturally
    top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
    summary = ' '.join(sorted(top_sentences, key=sentences.index))
return summary
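# Illustrative example (hypothetical input): with num_sentences=2,
#   extractive_summary("Cats purr. Cats nap often. Dogs bark.", 2)
# favours the two "Cats" sentences, since "cats" is the most frequent
# non-stopword and both of those sentences contain it.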
@app.route("/", methods=["GET"])
def index():
return "Document Summarizer API is running! Use /summarize endpoint for POST requests."
@app.route("/summarize", methods=["POST"])
def summarize():
if "file" not in request.files:
return jsonify({"error": "No file uploaded"}), 400
file = request.files["file"]
if file.filename == "":
return jsonify({"error": "No selected file"}), 400
if not allowed_file(file.filename):
return jsonify({"error": "Unsupported file format"}), 400
filename = secure_filename(file.filename)
file_content = file.read()
# Process file based on type
summary = None
file_ext = filename.rsplit(".", 1)[1].lower()
try:
if file_ext == "pdf":
summary = summarize_pdf(file_content)
elif file_ext == "docx":
summary = summarize_docx(file_content)
elif file_ext == "pptx":
summary = summarize_pptx(file_content)
elif file_ext == "txt":
summary = summarize_txt(file_content)
return jsonify({"filename": filename, "summary": summary})
except Exception as e:
return jsonify({"error": f"Error processing file: {str(e)}"}), 500
# Summarization functions
def summarize_pdf(file_content):
    reader = PdfReader(io.BytesIO(file_content))
    # Extract each page's text once, skipping pages with no extractable text
    page_texts = (page.extract_text() for page in reader.pages)
    text = "\n".join(t for t in page_texts if t)
    return extractive_summary(text, num_sentences=5)
def summarize_docx(file_content):
doc = Document(io.BytesIO(file_content))
text = "\n".join([para.text for para in doc.paragraphs])
return extractive_summary(text, num_sentences=5)
def summarize_pptx(file_content):
ppt = Presentation(io.BytesIO(file_content))
text = []
for slide in ppt.slides:
for shape in slide.shapes:
if hasattr(shape, "text"):
text.append(shape.text)
full_text = "\n".join(text)
return extractive_summary(full_text, num_sentences=5)
def summarize_txt(file_content):
    # Decode as UTF-8; replace undecodable bytes rather than raising on odd encodings
    text = file_content.decode("utf-8", errors="replace")
    return extractive_summary(text, num_sentences=5)
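# Each helper above differs only in how raw text is pulled out of the file;
# all of them delegate the actual summarization to extractive_summary.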
if __name__ == "__main__":
    # debug=True is convenient during development; disable it for any real deployment
    app.run(host="0.0.0.0", port=7860, debug=True)