Spaces:
Sleeping
Sleeping
File size: 4,266 Bytes
764d4f7 2a3fae3 80d0b8a b7db40a 764d4f7 2a3fae3 764d4f7 2a3fae3 b7db40a 44d7238 92d0377 524f780 80d0b8a 764d4f7 b7db40a 80d0b8a d2d0219 2a3fae3 d2d0219 2a3fae3 d2d0219 92d0377 80d0b8a 92d0377 98e82be 80d0b8a 98e82be 3b4df89 2a3fae3 80d0b8a 764d4f7 80d0b8a d2d0219 764d4f7 80d0b8a 764d4f7 2a3fae3 80d0b8a 2a3fae3 764d4f7 2a3fae3 98e82be 92d0377 98e82be 92d0377 98e82be 92d0377 98e82be 92d0377 80d0b8a 92d0377 98e82be 80d0b8a 98e82be 764d4f7 92d0377 2a3fae3 92d0377 d2d0219 92d0377 2a3fae3 92d0377 d2d0219 92d0377 2a3fae3 98e82be 92d0377 98e82be 92d0377 b7db40a 92d0377 53425a8 9fd7d89 92d0377 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
import os
import io
import logging
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
# Configure logging for debugging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Use the NLTK data path set by the Dockerfile; falls back to /app/nltk_data
# when NLTK_DATA is unset (assumes the Docker image puts the corpora there).
nltk.data.path.append(os.getenv("NLTK_DATA", "/app/nltk_data"))

# Verify NLTK data is accessible (optional, for debugging).
# A missing corpus is only logged here; tokenization will still raise
# LookupError later at request time if the data truly is not present.
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
    logger.info("NLTK data loaded successfully.")
except LookupError:
    logger.error("NLTK data not found. Check Dockerfile setup.")
# File extensions the API accepts for summarization.
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}


def allowed_file(filename):
    """Return True when *filename* carries an extension we can summarize."""
    _, dot, extension = filename.rpartition(".")
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
def extractive_summary(text, num_sentences=5):
    """Return an extractive summary of *text*.

    Scores each sentence by the summed document-wide frequency of its
    non-stopword alphanumeric tokens and returns the *num_sentences*
    highest-scoring sentences, re-assembled in document order.

    Args:
        text: Raw document text (str).
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The summary string.  Texts with *num_sentences* or fewer
        sentences (including empty input) are returned verbatim.
    """
    sentences = sent_tokenize(text)
    # Nothing to trim: short documents come back unchanged.
    if len(sentences) <= num_sentences:
        return text

    stop_words = set(stopwords.words('english'))

    # Document-wide term frequencies over meaningful tokens only.
    freq_table = {}
    for word in word_tokenize(text):
        word = word.lower()
        if word not in stop_words and word.isalnum():
            freq_table[word] = freq_table.get(word, 0) + 1

    # Score sentences by the tokens they actually contain.  The original
    # used substring search ("word in sentence.lower()"), which let
    # partial matches ("art" inside "start") inflate scores; tokenizing
    # each sentence keeps the counts honest.  Keying by index (not the
    # sentence string) also keeps duplicate sentences distinct.
    sentence_scores = {}
    for index, sentence in enumerate(sentences):
        for word in word_tokenize(sentence):
            frequency = freq_table.get(word.lower())
            if frequency:
                sentence_scores[index] = sentence_scores.get(index, 0) + frequency

    # Take the top-scoring sentences, then emit them in their original
    # document order so the summary reads coherently.
    top = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
    return ' '.join(sentences[i] for i in sorted(top))
@app.route("/", methods=["GET"])
def index():
    """Health-check endpoint confirming the service is running."""
    logger.info("Root endpoint accessed.")
    greeting = "Document Summarizer API is running! Use /summarize endpoint for POST requests."
    return greeting
@app.route("/summarize", methods=["POST"])
def summarize():
    """Summarize an uploaded document.

    Expects a multipart/form-data POST with the document under the
    "file" key.  Supported extensions: pdf, docx, pptx, txt.

    Returns:
        200 with {"filename": ..., "summary": ...} on success,
        400 for a missing/empty/unsupported upload,
        500 when extraction or summarization fails.
    """
    logger.info("Summarize endpoint called.")
    if "file" not in request.files:
        logger.error("No file uploaded.")
        return jsonify({"error": "No file uploaded"}), 400
    file = request.files["file"]
    if file.filename == "":
        logger.error("No file selected.")
        return jsonify({"error": "No selected file"}), 400
    if not allowed_file(file.filename):
        logger.error("Unsupported file format: %s", file.filename)
        return jsonify({"error": "Unsupported file format"}), 400

    filename = secure_filename(file.filename)
    file_content = file.read()
    # secure_filename() can strip characters; guard against a lost dot
    # instead of letting rsplit(...)[1] raise IndexError.
    file_ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""

    # Dispatch table replaces the original if/elif chain and guarantees
    # `summary` is always bound (the chain had no else branch, so a
    # mismatch with allowed_file() would have raised NameError).
    handlers = {
        "pdf": summarize_pdf,
        "docx": summarize_docx,
        "pptx": summarize_pptx,
        "txt": summarize_txt,
    }
    handler = handlers.get(file_ext)
    if handler is None:
        logger.error("Unsupported file format: %s", filename)
        return jsonify({"error": "Unsupported file format"}), 400

    try:
        summary = handler(file_content)
        # Original f-strings had no placeholder and logged the literal
        # "(unknown)" -- include the actual filename.
        logger.info("File %s summarized successfully.", filename)
        return jsonify({"filename": filename, "summary": summary})
    except Exception as e:
        logger.error("Error processing file %s: %s", filename, e)
        return jsonify({"error": f"Error processing file: {str(e)}"}), 500
def summarize_pdf(file_content):
    """Extract the text of a PDF (raw bytes) and return its extractive summary.

    Pages that yield no text (e.g. scanned images) are skipped.
    """
    reader = PdfReader(io.BytesIO(file_content))
    # Call extract_text() once per page; the original called it twice
    # (once to filter, once to join), doubling the extraction work.
    page_texts = (page.extract_text() for page in reader.pages)
    text = "\n".join(t for t in page_texts if t)
    return extractive_summary(text, num_sentences=5)
def summarize_docx(file_content):
    """Extract the text of a .docx document (raw bytes) and summarize it."""
    document = Document(io.BytesIO(file_content))
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return extractive_summary("\n".join(paragraph_texts), num_sentences=5)
def summarize_pptx(file_content):
    """Extract the text of a .pptx presentation (raw bytes) and summarize it.

    Collects the text of every shape (on every slide) that exposes a
    ``text`` attribute; shapes without text are skipped.
    """
    presentation = Presentation(io.BytesIO(file_content))
    fragments = [
        shape.text
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    ]
    return extractive_summary("\n".join(fragments), num_sentences=5)
def summarize_txt(file_content):
    """Decode UTF-8 text bytes and return their extractive summary.

    A non-UTF-8 upload raises UnicodeDecodeError, which the /summarize
    handler converts into a 500 response.
    """
    decoded = file_content.decode("utf-8")
    return extractive_summary(decoded, num_sentences=5)
if __name__ == "__main__":
    # Hard-coding debug=True ships the Werkzeug interactive debugger,
    # which allows arbitrary code execution on the server -- make debug
    # opt-in via the FLASK_DEBUG environment variable instead.
    debug = os.getenv("FLASK_DEBUG", "").lower() in ("1", "true", "yes")
    app.run(host="0.0.0.0", port=7860, debug=debug)