import os
import io
import re
from flask import Flask, request, jsonify
from flask_cors import CORS
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from heapq import nlargest
from collections import defaultdict

app = Flask(__name__)
CORS(app)  # Enable CORS for all routes

# Set NLTK data path to a directory included in the project
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)
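# The required data can be pre-downloaded into this directory during setup,
# e.g.: python -m nltk.downloader -d nltk_data punkt stopwords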

# Ensure NLTK data is available (pre-downloaded)
try:
    stopwords.words('english')      # needs the 'stopwords' corpus
    sent_tokenize("NLTK check.")    # needs the 'punkt' tokenizer models
except LookupError:
    print("NLTK data not found. Please ensure 'punkt' and 'stopwords' are pre-downloaded in 'nltk_data'.")
    # simple_summarize() is used as a fallback when tokenization fails

# Allowed file extensions
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}

def allowed_file(filename):
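    # Extension-based check only; the file's contents are not inspected.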
    return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS

@app.route("/", methods=["GET"])
def index():
    return "Document Summarizer API is running! Use /summarize endpoint for POST requests."

@app.route("/summarize", methods=["POST"])
def summarize():
    if "file" not in request.files:
        return jsonify({"error": "No file uploaded"}), 400
        
    file = request.files["file"]
    
    if file.filename == "":
        return jsonify({"error": "No selected file"}), 400
        
    if not allowed_file(file.filename):
        return jsonify({"error": "Unsupported file format"}), 400
        
    filename = secure_filename(file.filename)
    file_content = file.read()
    
    # Process the file based on its type. Use the original (already validated)
    # filename for the extension: secure_filename() can strip leading dots,
    # which would make the rsplit() here fail.
    text = None
    file_ext = file.filename.rsplit(".", 1)[1].lower()
    
    try:
        if file_ext == "pdf":
            text = extract_text_from_pdf(file_content)
        elif file_ext == "docx":
            text = extract_text_from_docx(file_content)
        elif file_ext == "pptx":
            text = extract_text_from_pptx(file_content)
        elif file_ext == "txt":
            text = extract_text_from_txt(file_content)
        
        # Generate a summary of the text
        try:
            summary = generate_summary(text)
        except LookupError as e:
            print(f"NLTK summarization failed: {e}. Using fallback.")
            summary = simple_summarize(text)
        except Exception as e:
            print(f"Summarization error: {e}")
            summary = (text[:1000] + "...") if len(text) > 1000 else text
        
        # Include metadata
        word_count = len(text.split())
        
        return jsonify({
            "filename": filename,
            "summary": summary,
            "original_word_count": word_count,
            "summary_word_count": len(summary.split()) if summary else 0
        })
    except Exception as e:
        return jsonify({"error": f"Error processing file: {str(e)}"}), 500

# Text extraction functions
def extract_text_from_pdf(file_content):
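    # PyPDF2's extract_text() can return None or "" (e.g. for image-only
    # pages); those pages are skipped below.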
    reader = PdfReader(io.BytesIO(file_content))
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n\n"
    return clean_text(text)

def extract_text_from_docx(file_content):
    doc = Document(io.BytesIO(file_content))
    text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
    return clean_text(text)

def extract_text_from_pptx(file_content):
    ppt = Presentation(io.BytesIO(file_content))
    text = []
    for slide in ppt.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text") and shape.text.strip():
                text.append(shape.text)
    return clean_text("\n".join(text))

def extract_text_from_txt(file_content):
    text = file_content.decode("utf-8", errors="ignore")
    return clean_text(text)

def clean_text(text):
    # Collapse runs of spaces/tabs but preserve blank-line paragraph breaks,
    # which simple_summarize() uses to split the text into paragraphs.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\s*\n\s*\n\s*', '\n\n', text)
    text = re.sub(r'[^\w\s.,!?:;]', '', text)
    return text.strip()

def generate_summary(text, sentence_count=5):
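    """Frequency-based extractive summary.

    Scores each sentence by the normalized frequencies of its non-stopword
    words and returns the top `sentence_count` sentences in document order;
    very short texts are returned unchanged.
    """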
    if len(text.split()) < 100:
        return text
    
    sentences = sent_tokenize(text)
    if len(sentences) <= sentence_count:
        return text
    
    clean_sentences = [s.translate(str.maketrans('', '', string.punctuation)).lower() for s in sentences]
    stop_words = set(stopwords.words('english'))
    
    word_frequencies = defaultdict(int)
    for sentence in clean_sentences:
        for word in word_tokenize(sentence):
            if word not in stop_words:
                word_frequencies[word] += 1
    
    max_frequency = max(word_frequencies.values()) if word_frequencies else 1
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / max_frequency
    
    sentence_scores = defaultdict(float)
    for i, sentence in enumerate(clean_sentences):
        for word in word_tokenize(sentence):
            if word in word_frequencies:
                sentence_scores[i] += word_frequencies[word]
    
    top_indices = nlargest(sentence_count, sentence_scores, key=sentence_scores.get)
    top_indices.sort()
    
    return ' '.join([sentences[i] for i in top_indices])

def simple_summarize(text, max_chars=1000):
    # Fallback summarizer for when NLTK data is unavailable: keep the first
    # few paragraphs, then top up with whole sentences within the budget.
    if len(text) <= max_chars:
        return text
    
    paragraphs = text.split('\n\n')
    base_summary = ' '.join(paragraphs[:3])
    
    if len(base_summary) < max_chars:
        remaining_text = ' '.join(paragraphs[3:])
        sentences = re.split(r'(?<=[.!?])\s+', remaining_text)
        for sentence in sentences:
            if len(base_summary) + len(sentence) + 1 <= max_chars:
                base_summary += ' ' + sentence
            else:
                break
    
    if len(base_summary) > max_chars:
        base_summary = base_summary[:max_chars] + "..."
        
    return base_summary

if __name__ == "__main__":
    # For local testing only
    app.run(host="0.0.0.0", port=7860)
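
# Quick manual test once the server is running (sample.pdf is a placeholder
# file name):
#   curl -X POST -F "file=@sample.pdf" http://localhost:7860/summarize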