mike23415 committed
Commit 44d7238 · verified · 1 Parent(s): 53425a8

Update app.py

Files changed (1):
  app.py (+19 -49)
app.py CHANGED
@@ -9,31 +9,26 @@ from docx import Document
 from pptx import Presentation
 import nltk
 import string
+from nltk.corpus import stopwords
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.probability import FreqDist
+from heapq import nlargest
+from collections import defaultdict
 
 app = Flask(__name__)
 CORS(app)  # Enable CORS for all routes
 
-# Set NLTK data path to a directory where we have write access
+# Set NLTK data path to a directory included in the project
 nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
 os.makedirs(nltk_data_dir, exist_ok=True)
 nltk.data.path.append(nltk_data_dir)
 
-# Download necessary NLTK data to our custom directory
+# Ensure NLTK data is available (pre-downloaded)
 try:
-    nltk.download('punkt', download_dir=nltk_data_dir, quiet=True)
-    nltk.download('stopwords', download_dir=nltk_data_dir, quiet=True)
-    from nltk.corpus import stopwords
-    from nltk.tokenize import sent_tokenize, word_tokenize
-    from nltk.probability import FreqDist
-    from heapq import nlargest
-    from collections import defaultdict
-except Exception as e:
-    print(f"NLTK download error: {e}")
-    # Fallback function if NLTK data cannot be downloaded
-    def simple_summarize(text, max_chars=1000):
-        # Simple summary: first few paragraphs
-        paragraphs = text.split('\n\n')
-        return ' '.join(paragraphs[:3])[:max_chars]
+    stopwords.words('english')  # Test if the stopwords corpus is accessible
+except LookupError:
+    print("NLTK data not found. Please ensure 'punkt' and 'stopwords' are pre-downloaded in 'nltk_data'.")
+    # Fallback summarizer will be used if NLTK lookups fail at runtime
 
 # Allowed file extensions
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
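
With the runtime nltk.download calls removed, the corpora now have to be staged before the app starts. A minimal bootstrap sketch, assuming a one-off script run at build time (the script and its invocation point are assumptions, not part of this commit):

import os
import nltk

# Hypothetical one-off bootstrap (e.g., a build step), so that 'punkt' and
# 'stopwords' are already present in ./nltk_data when app.py starts.
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)
for pkg in ('punkt', 'stopwords'):
    nltk.download(pkg, download_dir=nltk_data_dir, quiet=True)
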
@@ -77,15 +72,12 @@ def summarize():
 
     # Generate a summary of the text
     try:
-        # Try using the advanced summarizer
-        if 'stopwords' in globals() and 'sent_tokenize' in globals():
-            summary = generate_summary(text)
-        else:
-            # Fallback to simple summarizer if NLTK isn't available
-            summary = simple_summarize(text)
+        summary = generate_summary(text)
+    except LookupError as e:
+        print(f"NLTK summarization failed: {e}. Using fallback.")
+        summary = simple_summarize(text)
     except Exception as e:
         print(f"Summarization error: {e}")
-        # Another fallback if something goes wrong with summarization
         summary = text[:1000] + "..." if len(text) > 1000 else text
 
     # Include metadata
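
Note on the new handler: LookupError, which NLTK raises when a corpus is missing, is a subclass of Exception, so the specific clause must come before the generic one or it would never run. A standalone illustration of the ordering, not taken from app.py:

# Illustration only: Python takes the first matching except clause, top to bottom.
try:
    raise LookupError("Resource 'punkt' not found")  # simulates missing NLTK data
except LookupError:
    print("specific handler: fall back to simple_summarize")
except Exception:
    print("generic handler: truncate the text instead")
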
@@ -129,85 +121,63 @@ def extract_text_from_txt(file_content):
     return clean_text(text)
 
 def clean_text(text):
-    # Remove excess whitespace
     text = re.sub(r'\s+', ' ', text)
-    # Remove special characters but keep sentence punctuation
     text = re.sub(r'[^\w\s\.\,\!\?\:\;]', '', text)
     return text.strip()
 
 def generate_summary(text, sentence_count=5):
-    # If text is very short, return it as is
     if len(text.split()) < 100:
         return text
 
-    # Tokenize the text into sentences
     sentences = sent_tokenize(text)
 
-    # If too few sentences, return all
     if len(sentences) <= sentence_count:
         return text
 
-    # Remove punctuation and convert to lowercase for processing
     clean_sentences = [s.translate(str.maketrans('', '', string.punctuation)).lower() for s in sentences]
 
-    # Get stop words
     stop_words = set(stopwords.words('english'))
 
-    # Calculate word frequencies excluding stop words
     word_frequencies = defaultdict(int)
     for sentence in clean_sentences:
         for word in word_tokenize(sentence):
             if word not in stop_words:
                 word_frequencies[word] += 1
 
-    # Normalize frequencies
     max_frequency = max(word_frequencies.values()) if word_frequencies else 1
     for word in word_frequencies:
         word_frequencies[word] = word_frequencies[word] / max_frequency
 
-    # Calculate sentence scores based on word frequencies
     sentence_scores = defaultdict(int)
     for i, sentence in enumerate(clean_sentences):
         for word in word_tokenize(sentence):
             if word in word_frequencies:
                 sentence_scores[i] += word_frequencies[word]
 
-    # Get top sentences
     top_indices = nlargest(sentence_count, sentence_scores, key=sentence_scores.get)
-    top_indices.sort()  # Sort to maintain original order
+    top_indices.sort()
 
-    # Combine top sentences to form summary
-    summary = ' '.join([sentences[i] for i in top_indices])
-
-    return summary
+    return ' '.join([sentences[i] for i in top_indices])
 
 def simple_summarize(text, max_chars=1000):
-    # Simple summary: first few paragraphs plus additional key sentences
     paragraphs = text.split('\n\n')
 
-    # Take first 2-3 paragraphs as a base summary
     base_summary = ' '.join(paragraphs[:3])
 
-    # If we have a very short text, just return it
     if len(text) <= max_chars:
         return text
 
-    # If base summary is too short, add more content up to max_chars
    if len(base_summary) < max_chars:
         remaining_text = ' '.join(paragraphs[3:])
         sentences = re.split(r'(?<=[.!?])\s+', remaining_text)
-
         for sentence in sentences:
             if len(base_summary) + len(sentence) + 1 <= max_chars:
                 base_summary += ' ' + sentence
             else:
                 break
 
-    # Truncate if still too long
     if len(base_summary) > max_chars:
         base_summary = base_summary[:max_chars] + "..."
 
     return base_summary
 
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=7860, debug=True)
+    # For local testing only
+    app.run(host="0.0.0.0", port=7860)
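
For readers skimming the diff, generate_summary is classic frequency-based extractive summarization: count non-stopword word frequencies, normalize by the most frequent word, score each sentence as the sum of its word weights, then keep the top sentence_count sentences in document order. A dependency-free toy version of the same scoring idea (illustrative only; not part of the commit):

from collections import defaultdict
from heapq import nlargest

def toy_summary(sentences, k=2):
    # Count raw word frequencies across all sentences.
    freq = defaultdict(int)
    for s in sentences:
        for w in s.lower().split():
            freq[w] += 1
    peak = max(freq.values())
    # Score each sentence by the sum of its normalized word frequencies.
    scores = {i: sum(freq[w] / peak for w in s.lower().split())
              for i, s in enumerate(sentences)}
    top = sorted(nlargest(k, scores, key=scores.get))  # restore document order
    return ' '.join(sentences[i] for i in top)

print(toy_summary([
    "Cats sleep most of the day.",
    "Dogs chase cats around the yard.",
    "Cats and dogs can share a home.",
    "The weather was pleasant yesterday.",
]))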