import os
import io
import re
from flask import Flask, request, jsonify
from flask_cors import CORS
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from heapq import nlargest
from collections import defaultdict
app = Flask(__name__)
CORS(app) # Enable CORS for all routes
# Set NLTK data path to a directory included in the project
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)
# Ensure NLTK data is available (pre-downloaded)
try:
    stopwords.words('english')  # Test if stopwords are accessible
except LookupError:
    print("NLTK data not found. Please ensure 'punkt' and 'stopwords' are pre-downloaded in 'nltk_data'.")
    # Fallback will be used if this fails
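# One way to pre-download the required corpora into ./nltk_data (a sketch meant to
# be run once outside the app; nltk.download accepts a download_dir argument):
#   python -c "import nltk; nltk.download('punkt', download_dir='nltk_data'); nltk.download('stopwords', download_dir='nltk_data')"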
# Allowed file extensions
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
def allowed_file(filename):
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
@app.route("/", methods=["GET"])
def index():
return "Document Summarizer API is running! Use /summarize endpoint for POST requests."
@app.route("/summarize", methods=["POST"])
def summarize():
if "file" not in request.files:
return jsonify({"error": "No file uploaded"}), 400
file = request.files["file"]
if file.filename == "":
return jsonify({"error": "No selected file"}), 400
if not allowed_file(file.filename):
return jsonify({"error": "Unsupported file format"}), 400
filename = secure_filename(file.filename)
file_content = file.read()
# Process file based on type
text = None
file_ext = filename.rsplit(".", 1)[1].lower()
try:
if file_ext == "pdf":
text = extract_text_from_pdf(file_content)
elif file_ext == "docx":
text = extract_text_from_docx(file_content)
elif file_ext == "pptx":
text = extract_text_from_pptx(file_content)
elif file_ext == "txt":
text = extract_text_from_txt(file_content)
# Generate a summary of the text
try:
summary = generate_summary(text)
except LookupError as e:
print(f"NLTK summarization failed: {e}. Using fallback.")
summary = simple_summarize(text)
except Exception as e:
print(f"Summarization error: {e}")
summary = text[:1000] + "..." if len(text) > 1000 else text
# Include metadata
word_count = len(text.split())
return jsonify({
"filename": filename,
"summary": summary,
"original_word_count": word_count,
"summary_word_count": len(summary.split()) if summary else 0
})
except Exception as e:
return jsonify({"error": f"Error processing file: {str(e)}"}), 500
# Text extraction functions
def extract_text_from_pdf(file_content):
    reader = PdfReader(io.BytesIO(file_content))
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n\n"
    return clean_text(text)
def extract_text_from_docx(file_content):
    doc = Document(io.BytesIO(file_content))
    text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
    return clean_text(text)
def extract_text_from_pptx(file_content):
    ppt = Presentation(io.BytesIO(file_content))
    text = []
    for slide in ppt.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text") and shape.text.strip():
                text.append(shape.text)
    return clean_text("\n".join(text))
def extract_text_from_txt(file_content):
    text = file_content.decode("utf-8", errors="ignore")
    return clean_text(text)
def clean_text(text):
    # Collapse all whitespace (including newlines) into single spaces
    text = re.sub(r'\s+', ' ', text)
    # Drop characters other than word characters, whitespace, and basic punctuation
    text = re.sub(r'[^\w\s\.\,\!\?\:\;]', '', text)
    return text.strip()
def generate_summary(text, sentence_count=5):
    # Very short documents are returned as-is
    if len(text.split()) < 100:
        return text
    sentences = sent_tokenize(text)
    if len(sentences) <= sentence_count:
        return text

    # Normalize sentences: strip punctuation and lowercase before counting words
    clean_sentences = [s.translate(str.maketrans('', '', string.punctuation)).lower() for s in sentences]
    stop_words = set(stopwords.words('english'))

    # Count word frequencies, ignoring stopwords
    word_frequencies = defaultdict(int)
    for sentence in clean_sentences:
        for word in word_tokenize(sentence):
            if word not in stop_words:
                word_frequencies[word] += 1

    # Normalize frequencies to the range [0, 1]
    max_frequency = max(word_frequencies.values()) if word_frequencies else 1
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / max_frequency

    # Score each sentence as the sum of its words' normalized frequencies
    sentence_scores = defaultdict(int)
    for i, sentence in enumerate(clean_sentences):
        for word in word_tokenize(sentence):
            if word in word_frequencies:
                sentence_scores[i] += word_frequencies[word]

    # Keep the top-scoring sentences, restored to their original order
    top_indices = nlargest(sentence_count, sentence_scores, key=sentence_scores.get)
    top_indices.sort()
    return ' '.join([sentences[i] for i in top_indices])
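# Usage sketch for the extractive summarizer above (hypothetical variable name;
# texts under 100 words or with few sentences come back unchanged):
#   summary = generate_summary(article_text, sentence_count=3)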
def simple_summarize(text, max_chars=1000):
    # Fallback summarizer: start from the leading paragraphs and extend with
    # whole sentences until max_chars is reached
    paragraphs = text.split('\n\n')
    base_summary = ' '.join(paragraphs[:3])
    if len(text) <= max_chars:
        return text
    if len(base_summary) < max_chars:
        remaining_text = ' '.join(paragraphs[3:])
        sentences = re.split(r'(?<=[.!?])\s+', remaining_text)
        for sentence in sentences:
            if len(base_summary) + len(sentence) + 1 <= max_chars:
                base_summary += ' ' + sentence
            else:
                break
    if len(base_summary) > max_chars:
        base_summary = base_summary[:max_chars] + "..."
    return base_summary
if __name__ == "__main__":
    # For local testing only
    app.run(host="0.0.0.0", port=7860)
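# For a production deployment (an assumption, not part of this file), a WSGI server
# could replace the Flask development server, e.g. assuming this module is app.py:
#   gunicorn app:app --bind 0.0.0.0:7860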