# Spaces: Sleeping
import os | |
import io | |
import logging | |
from flask import Flask, request, jsonify | |
from werkzeug.utils import secure_filename | |
from PyPDF2 import PdfReader | |
from docx import Document | |
from pptx import Presentation | |
import nltk | |
from nltk.corpus import stopwords | |
from nltk.tokenize import word_tokenize, sent_tokenize | |
# Emit INFO-level (and above) log records so request handling can be traced.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Extend NLTK's search path with the data directory: the NLTK_DATA environment
# variable when it is set, otherwise /app/nltk_data (the location the
# Dockerfile is expected to populate).
nltk.data.path.append(os.getenv("NLTK_DATA", "/app/nltk_data"))

# Startup sanity check (debugging aid): confirm the tokenizer models and the
# stop-word corpus can be located. A missing resource is only logged here; the
# application keeps starting.
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    logger.error("NLTK data not found. Check Dockerfile setup.")
else:
    logger.info("NLTK data loaded successfully.")
# Lower-case file extensions (without the dot) that the service can summarize.
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}


def allowed_file(filename):
    """Return True when *filename* ends in a supported extension.

    The extension is the text after the last dot, compared case-insensitively
    against ALLOWED_EXTENSIONS.

    Args:
        filename: Name supplied by the client. May be empty or None when the
            upload carried no file name.

    Returns:
        bool: True if the extension is supported, False otherwise (including
        for empty/None names and names without a dot).
    """
    # Guard first: `"." in None` would raise TypeError instead of rejecting.
    if not filename or "." not in filename:
        return False
    extension = filename.rsplit(".", 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
def extractive_summary(text, num_sentences=5):
    """Build an extractive summary from the highest-scoring sentences of *text*.

    Every non-stop-word, alphanumeric token is counted across the whole text.
    A sentence's score is the sum of the counts of the distinct counted words
    it contains. Words are matched as whole tokens, so a short word such as
    "art" no longer scores for a sentence that merely contains "party".

    Args:
        text: The document text to summarize.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        str: *text* unchanged when it has at most *num_sentences* sentences;
        otherwise the selected sentences joined by single spaces, ordered from
        highest to lowest score (ties keep their order of appearance).
        Sentences containing no counted word are never selected, so fewer than
        *num_sentences* sentences may be returned.
    """
    sentences = sent_tokenize(text)
    # Short inputs are returned verbatim; skip the word-level work entirely.
    if len(sentences) <= num_sentences:
        return text

    stop_words = set(stopwords.words('english'))

    # Document-wide frequency of each meaningful (lower-cased) word.
    freq_table = {}
    for token in word_tokenize(text):
        token = token.lower()
        if token not in stop_words and token.isalnum():
            freq_table[token] = freq_table.get(token, 0) + 1

    sentence_scores = {}
    for sentence in sentences:
        # Distinct tokens only: each vocabulary word contributes its frequency
        # once per sentence, however often it repeats inside that sentence.
        sentence_words = {token.lower() for token in word_tokenize(sentence)}
        score = sum(freq_table[word] for word in sentence_words if word in freq_table)
        if score:
            # Identical sentences share one entry and accumulate their scores.
            sentence_scores[sentence] = sentence_scores.get(sentence, 0) + score

    ranked = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
    return ' '.join(ranked[:num_sentences])
# Registered on the application root; without a route the view is never served.
@app.route("/")
def index():
    """Health-check style landing endpoint.

    Returns:
        str: A plain-text message stating that the API is running and pointing
        callers at the /summarize endpoint.
    """
    logger.info("Root endpoint accessed.")
    return "Document Summarizer API is running! Use /summarize endpoint for POST requests."
# Registered for POST only, matching the usage message returned by the root
# endpoint; without a route the view is never served.
@app.route("/summarize", methods=["POST"])
def summarize():
    """Summarize an uploaded document.

    Expects a multipart form upload under the field name "file" whose name
    ends in one of the supported extensions (pdf, docx, pptx, txt).

    Returns:
        A JSON response:
        * 200 with {"filename": ..., "summary": ...} on success;
        * 400 with {"error": ...} when no file was sent, no file name was
          given, or the extension is not supported;
        * 500 with {"error": ...} when extracting or summarizing the content
          raises.
    """
    logger.info("Summarize endpoint called.")

    if "file" not in request.files:
        logger.error("No file uploaded.")
        return jsonify({"error": "No file uploaded"}), 400

    upload = request.files["file"]
    # Covers both an empty name and a missing (None) name.
    if not upload.filename:
        logger.error("No file selected.")
        return jsonify({"error": "No selected file"}), 400

    if not allowed_file(upload.filename):
        logger.error(f"Unsupported file format: {upload.filename}")
        return jsonify({"error": "Unsupported file format"}), 400

    # Take the extension from the name that was just validated.
    # secure_filename() strips leading dots and non-ASCII characters, so names
    # such as ".pdf" come back as "pdf" with no dot left to split on.
    file_ext = upload.filename.rsplit(".", 1)[1].lower()
    filename = secure_filename(upload.filename)
    file_content = upload.read()

    # Built per call: the summarizer functions are defined later in the module.
    summarizers = {
        "pdf": summarize_pdf,
        "docx": summarize_docx,
        "pptx": summarize_pptx,
        "txt": summarize_txt,
    }

    try:
        summary = summarizers[file_ext](file_content)
    except Exception as e:
        # Request boundary: report any failure as a JSON 500 and keep the
        # traceback in the server log.
        logger.exception(f"Error processing file {filename}: {str(e)}")
        return jsonify({"error": f"Error processing file: {str(e)}"}), 500

    logger.info(f"File {filename} summarized successfully.")
    return jsonify({"filename": filename, "summary": summary})
def summarize_pdf(file_content):
    """Summarize the text of a PDF document.

    Args:
        file_content: Raw bytes of the PDF file.

    Returns:
        str: Extractive summary (up to 5 sentences) of the text extracted from
        all pages; pages that yield no text are skipped.
    """
    reader = PdfReader(io.BytesIO(file_content))
    # Extract each page exactly once; the previous form ran the extraction
    # twice per page (once to filter, once to collect).
    page_texts = (page.extract_text() for page in reader.pages)
    text = "\n".join(page_text for page_text in page_texts if page_text)
    return extractive_summary(text, num_sentences=5)
def summarize_docx(file_content):
    """Summarize the paragraph text of a Word (.docx) document.

    Args:
        file_content: Raw bytes of the .docx file.

    Returns:
        str: Extractive summary (up to 5 sentences) of the document's
        paragraphs, which are joined with newlines before summarizing.
    """
    document = Document(io.BytesIO(file_content))
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return extractive_summary("\n".join(paragraph_texts), num_sentences=5)
def summarize_pptx(file_content):
    """Summarize the text found on the slides of a PowerPoint (.pptx) file.

    Args:
        file_content: Raw bytes of the .pptx file.

    Returns:
        str: Extractive summary (up to 5 sentences) of the text of every shape
        that exposes a ``text`` attribute, taken slide by slide and joined
        with newlines.
    """
    presentation = Presentation(io.BytesIO(file_content))
    shape_texts = [
        shape.text
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    ]
    return extractive_summary("\n".join(shape_texts), num_sentences=5)
def summarize_txt(file_content):
    """Summarize a plain-text file.

    Args:
        file_content: Raw bytes of the text file, expected to be UTF-8.

    Returns:
        str: Extractive summary (up to 5 sentences) of the decoded text.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    # "utf-8-sig" decodes exactly like UTF-8 but drops a leading byte-order
    # mark, which would otherwise stick to the first word and appear in the
    # summary.
    text = file_content.decode("utf-8-sig")
    return extractive_summary(text, num_sentences=5)
if __name__ == "__main__":
    # The server listens on all interfaces, so the interactive debugger must
    # not be on by default: it allows arbitrary code execution from the
    # browser. Debug mode is opt-in through the FLASK_DEBUG environment
    # variable.
    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)