File size: 4,408 Bytes
764d4f7
2a3fae3
b7db40a
764d4f7
2a3fae3
764d4f7
2a3fae3
b7db40a
44d7238
92d0377
524f780
764d4f7
b7db40a
92d0377
 
 
d2d0219
2a3fae3
 
d2d0219
2a3fae3
 
d2d0219
92d0377
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98e82be
 
 
 
3b4df89
2a3fae3
764d4f7
d2d0219
98e82be
764d4f7
2a3fae3
764d4f7
 
98e82be
2a3fae3
 
98e82be
764d4f7
2a3fae3
98e82be
2a3fae3
92d0377
2a3fae3
98e82be
 
 
92d0377
98e82be
92d0377
98e82be
92d0377
98e82be
92d0377
 
 
98e82be
 
764d4f7
92d0377
 
2a3fae3
92d0377
 
d2d0219
92d0377
2a3fae3
92d0377
 
d2d0219
92d0377
2a3fae3
98e82be
 
 
92d0377
98e82be
92d0377
 
b7db40a
92d0377
 
 
53425a8
9fd7d89
92d0377
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import io
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from PyPDF2 import PdfReader
from docx import Document
from pptx import Presentation
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Flask application instance; created at module level so the route
# decorators below can register their endpoints against it.
app = Flask(__name__)

# Fetch NLTK tokenizer ('punkt') and stopword corpora at import time so the
# first request doesn't pay the download cost. nltk.download is a no-op when
# the data is already cached; quiet=True suppresses progress output.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# File extensions the summarizer knows how to parse.
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}

def allowed_file(filename):
    """Return True if *filename* carries an extension we can summarize.

    The check is case-insensitive and uses the text after the LAST dot,
    so "archive.tar.txt" is accepted while "README" (no dot) is not.
    """
    if "." not in filename:
        return False
    extension = filename.rsplit(".", 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS

# Extractive summarization function
# Extractive summarization function
def extractive_summary(text, num_sentences=5):
    """
    Summarize *text* by selecting the top N most important sentences.

    Sentences are scored by summing the document-wide frequencies of the
    (non-stopword, alphanumeric) words they contain, and the selected
    sentences are returned in their original document order.

    Args:
        text (str): The text to summarize.
        num_sentences (int): Number of sentences to include in the summary
            (default: 5).

    Returns:
        str: The summarized text (the full text if it has <= num_sentences
        sentences).
    """
    # Stop words (e.g., "the", "is") carry no topical signal — ignore them.
    stop_words = set(stopwords.words('english'))

    sentences = sent_tokenize(text)

    # Nothing to trim: the text is already short enough.
    if len(sentences) <= num_sentences:
        return text

    # Document-wide word frequencies, excluding stop words and punctuation.
    freq_table = {}
    for word in word_tokenize(text):
        word = word.lower()
        if word not in stop_words and word.isalnum():
            freq_table[word] = freq_table.get(word, 0) + 1

    # Score each sentence by the frequencies of the words it actually
    # contains. The previous version used a substring test
    # (`word in sentence.lower()`), which wrongly credited e.g. "art"
    # against "start"; tokenizing the sentence fixes that. Keying by index
    # (not by sentence text) also keeps duplicate sentences distinct.
    # NOTE(review): longer sentences still tend to score higher — no length
    # normalization is applied, matching the original scoring intent.
    sentence_scores = {}
    for idx, sentence in enumerate(sentences):
        sentence_scores[idx] = sum(
            freq_table.get(token.lower(), 0) for token in word_tokenize(sentence)
        )

    # Take the N highest-scoring sentences ...
    top_indices = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
    # ... and emit them in original document order for readability.
    return ' '.join(sentences[i] for i in sorted(top_indices))

@app.route("/", methods=["GET"])
def index():
    return "Document Summarizer API is running! Use /summarize endpoint for POST requests."

@app.route("/summarize", methods=["POST"])
def summarize():
    if "file" not in request.files:
        return jsonify({"error": "No file uploaded"}), 400
        
    file = request.files["file"]
    
    if file.filename == "":
        return jsonify({"error": "No selected file"}), 400
        
    if not allowed_file(file.filename):
        return jsonify({"error": "Unsupported file format"}), 400
        
    filename = secure_filename(file.filename)
    file_content = file.read()
    
    # Process file based on type
    summary = None
    file_ext = filename.rsplit(".", 1)[1].lower()
    
    try:
        if file_ext == "pdf":
            summary = summarize_pdf(file_content)
        elif file_ext == "docx":
            summary = summarize_docx(file_content)
        elif file_ext == "pptx":
            summary = summarize_pptx(file_content)
        elif file_ext == "txt":
            summary = summarize_txt(file_content)
            
        return jsonify({"filename": filename, "summary": summary})
    except Exception as e:
        return jsonify({"error": f"Error processing file: {str(e)}"}), 500

# Summarization functions
def summarize_pdf(file_content):
    reader = PdfReader(io.BytesIO(file_content))
    text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
    return extractive_summary(text, num_sentences=5)

def summarize_docx(file_content):
    """Extract paragraph text from a .docx given as raw bytes and summarize it."""
    document = Document(io.BytesIO(file_content))
    paragraphs = [paragraph.text for paragraph in document.paragraphs]
    return extractive_summary("\n".join(paragraphs), num_sentences=5)

def summarize_pptx(file_content):
    """Collect text from every shape on every slide of a .pptx and summarize it.

    Only shapes exposing a .text attribute (text frames, placeholders)
    contribute; pictures and other shape types are skipped.
    """
    presentation = Presentation(io.BytesIO(file_content))
    fragments = [
        shape.text
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    ]
    return extractive_summary("\n".join(fragments), num_sentences=5)

def summarize_txt(file_content):
    """Decode a plain-text upload (assumed UTF-8) and summarize it.

    A non-UTF-8 payload raises UnicodeDecodeError, which the /summarize
    endpoint reports as a 500 error.
    """
    return extractive_summary(file_content.decode("utf-8"), num_sentences=5)

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860, debug=True)