mike23415 committed on
Commit
53425a8
·
verified ·
1 Parent(s): b7db40a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -14
app.py CHANGED
@@ -8,23 +8,32 @@ from PyPDF2 import PdfReader
8
  from docx import Document
9
  from pptx import Presentation
10
  import nltk
11
- from nltk.corpus import stopwords
12
- from nltk.tokenize import sent_tokenize, word_tokenize
13
- from nltk.probability import FreqDist
14
- from heapq import nlargest
15
- from collections import defaultdict
16
  import string
17
 
18
  app = Flask(__name__)
19
  CORS(app) # Enable CORS for all routes
20
 
21
- # Download necessary NLTK data
 
 
 
 
 
22
  try:
23
- nltk.data.find('tokenizers/punkt')
24
- nltk.data.find('corpora/stopwords')
25
- except LookupError:
26
- nltk.download('punkt')
27
- nltk.download('stopwords')
 
 
 
 
 
 
 
 
 
28
 
29
  # Allowed file extensions
30
  ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
@@ -67,7 +76,17 @@ def summarize():
67
  text = extract_text_from_txt(file_content)
68
 
69
  # Generate a summary of the text
70
- summary = generate_summary(text)
 
 
 
 
 
 
 
 
 
 
71
 
72
  # Include metadata
73
  word_count = len(text.split())
@@ -76,12 +95,12 @@ def summarize():
76
  "filename": filename,
77
  "summary": summary,
78
  "original_word_count": word_count,
79
- "summary_word_count": len(summary.split())
80
  })
81
  except Exception as e:
82
  return jsonify({"error": f"Error processing file: {str(e)}"}), 500
83
 
84
- # Improved text extraction functions
85
  def extract_text_from_pdf(file_content):
86
  reader = PdfReader(io.BytesIO(file_content))
87
  text = ""
@@ -162,5 +181,33 @@ def generate_summary(text, sentence_count=5):
162
 
163
  return summary
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  if __name__ == "__main__":
166
  app.run(host="0.0.0.0", port=7860, debug=True)
 
8
  from docx import Document
9
  from pptx import Presentation
10
  import nltk
 
 
 
 
 
11
  import string
12
 
13
  app = Flask(__name__)
14
  CORS(app) # Enable CORS for all routes
15
 
16
+ # Set NLTK data path to a directory where we have write access
17
+ nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
18
+ os.makedirs(nltk_data_dir, exist_ok=True)
19
+ nltk.data.path.append(nltk_data_dir)
20
+
21
+ # Download necessary NLTK data to our custom directory
22
  try:
23
+ nltk.download('punkt', download_dir=nltk_data_dir, quiet=True)
24
+ nltk.download('stopwords', download_dir=nltk_data_dir, quiet=True)
25
+ from nltk.corpus import stopwords
26
+ from nltk.tokenize import sent_tokenize, word_tokenize
27
+ from nltk.probability import FreqDist
28
+ from heapq import nlargest
29
+ from collections import defaultdict
30
+ except Exception as e:
31
+ print(f"NLTK download error: {e}")
32
+ # Fallback function if NLTK data cannot be downloaded
33
+ def simple_summarize(text, max_chars=1000):
34
+ # Simple summary: first few paragraphs
35
+ paragraphs = text.split('\n\n')
36
+ return ' '.join(paragraphs[:3])[:max_chars]
37
 
38
  # Allowed file extensions
39
  ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
 
76
  text = extract_text_from_txt(file_content)
77
 
78
  # Generate a summary of the text
79
+ try:
80
+ # Try using the advanced summarizer
81
+ if 'stopwords' in globals() and 'sent_tokenize' in globals():
82
+ summary = generate_summary(text)
83
+ else:
84
+ # Fallback to simple summarizer if NLTK isn't available
85
+ summary = simple_summarize(text)
86
+ except Exception as e:
87
+ print(f"Summarization error: {e}")
88
+ # Another fallback if something goes wrong with summarization
89
+ summary = text[:1000] + "..." if len(text) > 1000 else text
90
 
91
  # Include metadata
92
  word_count = len(text.split())
 
95
  "filename": filename,
96
  "summary": summary,
97
  "original_word_count": word_count,
98
+ "summary_word_count": len(summary.split()) if summary else 0
99
  })
100
  except Exception as e:
101
  return jsonify({"error": f"Error processing file: {str(e)}"}), 500
102
 
103
+ # Text extraction functions
104
  def extract_text_from_pdf(file_content):
105
  reader = PdfReader(io.BytesIO(file_content))
106
  text = ""
 
181
 
182
  return summary
183
 
184
+ def simple_summarize(text, max_chars=1000):
185
+ # Simple summary: first few paragraphs plus additional key sentences
186
+ paragraphs = text.split('\n\n')
187
+
188
+ # Take first 2-3 paragraphs as a base summary
189
+ base_summary = ' '.join(paragraphs[:3])
190
+
191
+ # If we have a very short text, just return it
192
+ if len(text) <= max_chars:
193
+ return text
194
+
195
+ # If base summary is too short, add more content up to max_chars
196
+ if len(base_summary) < max_chars:
197
+ remaining_text = ' '.join(paragraphs[3:])
198
+ sentences = re.split(r'(?<=[.!?])\s+', remaining_text)
199
+
200
+ for sentence in sentences:
201
+ if len(base_summary) + len(sentence) + 1 <= max_chars:
202
+ base_summary += ' ' + sentence
203
+ else:
204
+ break
205
+
206
+ # Truncate if still too long
207
+ if len(base_summary) > max_chars:
208
+ base_summary = base_summary[:max_chars] + "..."
209
+
210
+ return base_summary
211
+
212
  if __name__ == "__main__":
213
  app.run(host="0.0.0.0", port=7860, debug=True)