import io
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
app = Flask(__name__)

# Download the NLTK data the tokenizers need when the app starts.
# Note: newer NLTK releases (3.9+) also expect the "punkt_tab" resource,
# so it is fetched here as well.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# Allowed file extensions
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}

def allowed_file(filename):
    """Return True if the filename has an extension in ALLOWED_EXTENSIONS."""
    return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
# Extractive summarization function
def extractive_summary(text, num_sentences=5):
    """
    Summarize the given text by selecting the top N most important sentences.

    Args:
        text (str): The text to summarize.
        num_sentences (int): Number of sentences to include in the summary (default: 5).

    Returns:
        str: The summarized text.
    """
    # Get stop words (e.g., "the", "is") so they can be ignored
    stop_words = set(stopwords.words('english'))

    # Tokenize the text into words and sentences
    words = word_tokenize(text)
    sentences = sent_tokenize(text)

    # If the text has fewer sentences than requested, return the full text
    if len(sentences) <= num_sentences:
        return text

    # Count word frequencies, excluding stop words and non-alphanumeric tokens
    freq_table = {}
    for word in words:
        word = word.lower()
        if word not in stop_words and word.isalnum():
            freq_table[word] = freq_table.get(word, 0) + 1

    # Score each sentence by summing the frequencies of the words it contains.
    # The sentence is tokenized so that only whole-word matches count, avoiding
    # false substring hits (e.g., "art" inside "start").
    sentence_scores = {}
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word in freq_table:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + freq_table[word]

    # Select the top N sentences with the highest scores
    summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
    summary = ' '.join(summary_sentences)
    return summary
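
# Illustrative behavior (hypothetical input): with num_sentences=1,
#   extractive_summary("Cats purr. Cats and dogs play. Dogs bark loudly.", 1)
# returns "Cats and dogs play.", since "cats" and "dogs" each occur twice in
# the text and give that sentence the highest cumulative frequency score.
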
@app.route("/", methods=["GET"])
def index():
return "Document Summarizer API is running! Use /summarize endpoint for POST requests."
@app.route("/summarize", methods=["POST"])
def summarize():
if "file" not in request.files:
return jsonify({"error": "No file uploaded"}), 400
file = request.files["file"]
if file.filename == "":
return jsonify({"error": "No selected file"}), 400
if not allowed_file(file.filename):
return jsonify({"error": "Unsupported file format"}), 400
filename = secure_filename(file.filename)
file_content = file.read()
# Process file based on type
summary = None
file_ext = filename.rsplit(".", 1)[1].lower()
try:
if file_ext == "pdf":
summary = summarize_pdf(file_content)
elif file_ext == "docx":
summary = summarize_docx(file_content)
elif file_ext == "pptx":
summary = summarize_pptx(file_content)
elif file_ext == "txt":
summary = summarize_txt(file_content)
return jsonify({"filename": filename, "summary": summary})
except Exception as e:
return jsonify({"error": f"Error processing file: {str(e)}"}), 500
# Per-format extraction + summarization helpers

def summarize_pdf(file_content):
    reader = PdfReader(io.BytesIO(file_content))
    # extract_text() can return None for image-only pages, so substitute ""
    text = "\n".join(page.extract_text() or "" for page in reader.pages)
    return extractive_summary(text, num_sentences=5)

def summarize_docx(file_content):
    doc = Document(io.BytesIO(file_content))
    text = "\n".join(para.text for para in doc.paragraphs)
    return extractive_summary(text, num_sentences=5)

def summarize_pptx(file_content):
    ppt = Presentation(io.BytesIO(file_content))
    # Collect text from every shape that exposes a "text" attribute
    text = []
    for slide in ppt.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text.append(shape.text)
    full_text = "\n".join(text)
    return extractive_summary(full_text, num_sentences=5)

def summarize_txt(file_content):
    # Replace undecodable bytes rather than failing on non-UTF-8 uploads
    text = file_content.decode("utf-8", errors="replace")
    return extractive_summary(text, num_sentences=5)

if __name__ == "__main__":
    # Port 7860 is the default for Hugging Face Spaces; debug=True should be
    # turned off outside local development.
    app.run(host="0.0.0.0", port=7860, debug=True)
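
# Example request once the server is running (hypothetical file name):
#   curl -X POST -F "file=@report.pdf" http://localhost:7860/summarize
# Expected response shape:
#   {"filename": "report.pdf", "summary": "..."}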