Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -8,23 +8,32 @@ from PyPDF2 import PdfReader
|
|
8 |
from docx import Document
|
9 |
from pptx import Presentation
|
10 |
import nltk
|
11 |
-
from nltk.corpus import stopwords
|
12 |
-
from nltk.tokenize import sent_tokenize, word_tokenize
|
13 |
-
from nltk.probability import FreqDist
|
14 |
-
from heapq import nlargest
|
15 |
-
from collections import defaultdict
|
16 |
import string
|
17 |
|
18 |
app = Flask(__name__)
|
19 |
CORS(app) # Enable CORS for all routes
|
20 |
|
21 |
-
#
|
|
|
|
|
|
|
|
|
|
|
22 |
try:
|
23 |
-
nltk.
|
24 |
-
nltk.
|
25 |
-
|
26 |
-
nltk.
|
27 |
-
nltk.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
# Allowed file extensions
|
30 |
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
|
@@ -67,7 +76,17 @@ def summarize():
|
|
67 |
text = extract_text_from_txt(file_content)
|
68 |
|
69 |
# Generate a summary of the text
|
70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
# Include metadata
|
73 |
word_count = len(text.split())
|
@@ -76,12 +95,12 @@ def summarize():
|
|
76 |
"filename": filename,
|
77 |
"summary": summary,
|
78 |
"original_word_count": word_count,
|
79 |
-
"summary_word_count": len(summary.split())
|
80 |
})
|
81 |
except Exception as e:
|
82 |
return jsonify({"error": f"Error processing file: {str(e)}"}), 500
|
83 |
|
84 |
-
#
|
85 |
def extract_text_from_pdf(file_content):
|
86 |
reader = PdfReader(io.BytesIO(file_content))
|
87 |
text = ""
|
@@ -162,5 +181,33 @@ def generate_summary(text, sentence_count=5):
|
|
162 |
|
163 |
return summary
|
164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
if __name__ == "__main__":
|
166 |
app.run(host="0.0.0.0", port=7860, debug=True)
|
|
|
8 |
from docx import Document
|
9 |
from pptx import Presentation
|
10 |
import nltk
|
|
|
|
|
|
|
|
|
|
|
11 |
import string
|
12 |
|
13 |
app = Flask(__name__)
|
14 |
CORS(app) # Enable CORS for all routes
|
15 |
|
16 |
+
# Set NLTK data path to a directory where we have write access
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# These standard-library imports cannot fail because of missing NLTK data,
# so they are kept outside the try block.
from heapq import nlargest
from collections import defaultdict

# Download necessary NLTK data to our custom directory
try:
    nltk.download('punkt', download_dir=nltk_data_dir, quiet=True)
    nltk.download('stopwords', download_dir=nltk_data_dir, quiet=True)

    # nltk.download() signals failure through its return value instead of
    # raising, so a failed download would otherwise go unnoticed here.
    # nltk.data.find() raises LookupError when a resource is missing, which
    # also lets previously downloaded data be used when the network is down.
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')

    # Import only after the data is confirmed present: the request handler
    # checks whether these names exist in globals() to decide between the
    # NLTK-based summarizer and the simple fallback.
    from nltk.corpus import stopwords
    from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.probability import FreqDist
except Exception as e:
    # Best-effort setup: the app must still start without NLTK data.
    # The fallback summarizer `simple_summarize` is defined unconditionally
    # further down in this module, so nothing needs to be defined here.
    print(f"NLTK download error: {e}")
|
37 |
|
38 |
# Allowed file extensions
|
39 |
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
|
|
|
76 |
text = extract_text_from_txt(file_content)
|
77 |
|
78 |
# Generate a summary of the text
|
79 |
+
try:
|
80 |
+
# Try using the advanced summarizer
|
81 |
+
if 'stopwords' in globals() and 'sent_tokenize' in globals():
|
82 |
+
summary = generate_summary(text)
|
83 |
+
else:
|
84 |
+
# Fallback to simple summarizer if NLTK isn't available
|
85 |
+
summary = simple_summarize(text)
|
86 |
+
except Exception as e:
|
87 |
+
print(f"Summarization error: {e}")
|
88 |
+
# Another fallback if something goes wrong with summarization
|
89 |
+
summary = text[:1000] + "..." if len(text) > 1000 else text
|
90 |
|
91 |
# Include metadata
|
92 |
word_count = len(text.split())
|
|
|
95 |
"filename": filename,
|
96 |
"summary": summary,
|
97 |
"original_word_count": word_count,
|
98 |
+
"summary_word_count": len(summary.split()) if summary else 0
|
99 |
})
|
100 |
except Exception as e:
|
101 |
return jsonify({"error": f"Error processing file: {str(e)}"}), 500
|
102 |
|
103 |
+
# Text extraction functions
|
104 |
def extract_text_from_pdf(file_content):
|
105 |
reader = PdfReader(io.BytesIO(file_content))
|
106 |
text = ""
|
|
|
181 |
|
182 |
return summary
|
183 |
|
184 |
+
def simple_summarize(text, max_chars=1000):
    """Return a naive extractive summary of ``text`` of at most ``max_chars`` characters.

    The summary starts with the first three paragraphs (split on blank
    lines, joined with single spaces). If that is shorter than the limit,
    whole sentences from the remaining text are appended in order until the
    next one would not fit. If the result is still too long it is cut and
    terminated with "..." so that the total length never exceeds the limit.

    Args:
        text: The text to summarize. ``None`` or an empty string yields "".
        max_chars: Maximum length of the returned summary. Negative values
            are treated as 0.

    Returns:
        The original text when it already fits within the limit, otherwise
        the shortened summary string.
    """
    # Local import so this function does not depend on a module-level
    # `import re` being present.
    import re

    if not text:
        return ""

    limit = max(max_chars, 0)

    # If we have a very short text, just return it unchanged.
    if len(text) <= limit:
        return text

    # Take the first three paragraphs as the base summary.
    paragraphs = text.split('\n\n')
    base_summary = ' '.join(paragraphs[:3])

    # If the base summary is too short, add more sentences up to the limit.
    if len(base_summary) < limit:
        remaining_text = ' '.join(paragraphs[3:])
        for sentence in re.split(r'(?<=[.!?])\s+', remaining_text):
            if not sentence:
                # An empty remainder splits into [''], which would otherwise
                # append a stray trailing space.
                continue
            # +1 accounts for the joining space.
            if len(base_summary) + len(sentence) + 1 > limit:
                break
            base_summary += ' ' + sentence

    # Truncate if still too long, keeping the ellipsis inside the limit.
    if len(base_summary) > limit:
        ellipsis = "..."
        if limit > len(ellipsis):
            base_summary = base_summary[:limit - len(ellipsis)] + ellipsis
        else:
            # No room for an ellipsis; a hard cut is the best we can do.
            base_summary = base_summary[:limit]

    return base_summary
|
211 |
+
|
212 |
if __name__ == "__main__":
    import os

    # Flask's debug mode enables the Werkzeug interactive debugger, which
    # lets anyone who can reach the server execute arbitrary Python code.
    # Because the server binds to all interfaces (0.0.0.0), debug mode is
    # off by default and must be requested explicitly via FLASK_DEBUG.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)
|