ApsidalSolid4 committed on
Commit
dd69342
·
verified ·
1 Parent(s): 17941c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -67
app.py CHANGED
@@ -176,8 +176,8 @@ class TextClassifier:
176
  'num_windows': len(predictions)
177
  }
178
 
179
- def predict_with_local_context(self, text: str) -> Dict:
180
- """Enhanced prediction that maintains high confidence while preventing bleeding"""
181
  if self.model is None or self.tokenizer is None:
182
  self.load_model()
183
 
@@ -186,19 +186,21 @@ class TextClassifier:
186
  if not sentences:
187
  return {}
188
 
189
- # Initialize scores for each sentence
190
- sentence_predictions = []
191
-
192
- # First pass: Get base predictions for each sentence
193
- for i in range(len(sentences)):
194
- # Get a small window around the current sentence
195
- start_idx = max(0, i - 1)
196
- end_idx = min(len(sentences), i + 2)
197
- window = sentences[start_idx:end_idx]
198
-
199
- # Get model prediction for this window
 
 
200
  inputs = self.tokenizer(
201
- " ".join(window),
202
  truncation=True,
203
  padding=True,
204
  max_length=MAX_LENGTH,
@@ -208,11 +210,51 @@ class TextClassifier:
208
  with torch.no_grad():
209
  outputs = self.model(**inputs)
210
  probs = F.softmax(outputs.logits, dim=-1)
211
-
212
- # Extract probabilities
213
- human_prob = probs[0][1].item()
214
- ai_prob = probs[0][0].item()
215
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  sentence_predictions.append({
217
  'sentence': sentences[i],
218
  'human_prob': human_prob,
@@ -221,55 +263,11 @@ class TextClassifier:
221
  'confidence': max(human_prob, ai_prob)
222
  })
223
 
224
- del inputs, outputs, probs
225
- if torch.cuda.is_available():
226
- torch.cuda.empty_cache()
227
-
228
- # Second pass: Minimal smoothing only at significant prediction boundaries
229
- smoothed_predictions = []
230
- for i in range(len(sentence_predictions)):
231
- pred = sentence_predictions[i].copy()
232
-
233
- # Only apply smoothing if this sentence is at a prediction boundary
234
- if i > 0 and i < len(sentence_predictions) - 1:
235
- prev_pred = sentence_predictions[i-1]
236
- next_pred = sentence_predictions[i+1]
237
-
238
- # Check if we're at a prediction boundary
239
- at_boundary = (
240
- pred['prediction'] != prev_pred['prediction'] or
241
- pred['prediction'] != next_pred['prediction']
242
- )
243
-
244
- if at_boundary:
245
- # Calculate average confidence of neighbors
246
- neighbor_conf = (prev_pred['confidence'] + next_pred['confidence']) / 2
247
-
248
- # If neighbors are very confident and different from current prediction,
249
- # slightly adjust current prediction
250
- if neighbor_conf > 0.85 and pred['confidence'] < 0.75:
251
- # Adjust probabilities slightly toward neighbors
252
- weight = 0.15 # Small adjustment weight
253
- pred['human_prob'] = (
254
- pred['human_prob'] * (1 - weight) +
255
- ((prev_pred['human_prob'] + next_pred['human_prob']) / 2) * weight
256
- )
257
- pred['ai_prob'] = (
258
- pred['ai_prob'] * (1 - weight) +
259
- ((prev_pred['ai_prob'] + next_pred['ai_prob']) / 2) * weight
260
- )
261
-
262
- # Update prediction and confidence
263
- pred['prediction'] = 'human' if pred['human_prob'] > pred['ai_prob'] else 'ai'
264
- pred['confidence'] = max(pred['human_prob'], pred['ai_prob'])
265
-
266
- smoothed_predictions.append(pred)
267
-
268
  return {
269
- 'sentence_predictions': smoothed_predictions,
270
- 'highlighted_text': self.format_predictions_html(smoothed_predictions),
271
  'full_text': text,
272
- 'overall_prediction': self.aggregate_predictions(smoothed_predictions)
273
  }
274
 
275
  def detailed_scan(self, text: str) -> Dict:
@@ -436,7 +434,7 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
436
  quick_analysis
437
  )
438
  else:
439
- analysis = classifier.predict_with_local_context(text)
440
 
441
  detailed_analysis = []
442
  for pred in analysis['sentence_predictions']:
 
176
  'num_windows': len(predictions)
177
  }
178
 
179
+ def detailed_scan(self, text: str) -> Dict:
180
+ """Original prediction method with modified window handling"""
181
  if self.model is None or self.tokenizer is None:
182
  self.load_model()
183
 
 
186
  if not sentences:
187
  return {}
188
 
189
+ # Create centered windows for each sentence
190
+ windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
191
+
192
+ # Track scores for each sentence
193
+ sentence_appearances = {i: 0 for i in range(len(sentences))}
194
+ sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
195
+
196
+ # Process windows in batches
197
+ batch_size = 16
198
+ for i in range(0, len(windows), batch_size):
199
+ batch_windows = windows[i:i + batch_size]
200
+ batch_indices = window_sentence_indices[i:i + batch_size]
201
+
202
  inputs = self.tokenizer(
203
+ batch_windows,
204
  truncation=True,
205
  padding=True,
206
  max_length=MAX_LENGTH,
 
210
  with torch.no_grad():
211
  outputs = self.model(**inputs)
212
  probs = F.softmax(outputs.logits, dim=-1)
213
+
214
+ # Attribute predictions more carefully
215
+ for window_idx, indices in enumerate(batch_indices):
216
+ center_idx = len(indices) // 2
217
+ center_weight = 0.7 # Higher weight for center sentence
218
+ edge_weight = 0.3 / (len(indices) - 1) # Distribute remaining weight
219
+
220
+ for pos, sent_idx in enumerate(indices):
221
+ # Apply higher weight to center sentence
222
+ weight = center_weight if pos == center_idx else edge_weight
223
+ sentence_appearances[sent_idx] += weight
224
+ sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
225
+ sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
226
+
227
+ del inputs, outputs, probs
228
+ if torch.cuda.is_available():
229
+ torch.cuda.empty_cache()
230
+
231
+ # Calculate final predictions
232
+ sentence_predictions = []
233
+ for i in range(len(sentences)):
234
+ if sentence_appearances[i] > 0:
235
+ human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
236
+ ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
237
+
238
+ # Only apply minimal smoothing at prediction boundaries
239
+ if i > 0 and i < len(sentences) - 1:
240
+ prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
241
+ prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
242
+ next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
243
+ next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
244
+
245
+ # Check if we're at a prediction boundary
246
+ current_pred = 'human' if human_prob > ai_prob else 'ai'
247
+ prev_pred = 'human' if prev_human > prev_ai else 'ai'
248
+ next_pred = 'human' if next_human > next_ai else 'ai'
249
+
250
+ if current_pred != prev_pred or current_pred != next_pred:
251
+ # Small adjustment at boundaries
252
+ smooth_factor = 0.1
253
+ human_prob = (human_prob * (1 - smooth_factor) +
254
+ (prev_human + next_human) * smooth_factor / 2)
255
+ ai_prob = (ai_prob * (1 - smooth_factor) +
256
+ (prev_ai + next_ai) * smooth_factor / 2)
257
+
258
  sentence_predictions.append({
259
  'sentence': sentences[i],
260
  'human_prob': human_prob,
 
263
  'confidence': max(human_prob, ai_prob)
264
  })
265
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  return {
267
+ 'sentence_predictions': sentence_predictions,
268
+ 'highlighted_text': self.format_predictions_html(sentence_predictions),
269
  'full_text': text,
270
+ 'overall_prediction': self.aggregate_predictions(sentence_predictions)
271
  }
272
 
273
  def detailed_scan(self, text: str) -> Dict:
 
434
  quick_analysis
435
  )
436
  else:
437
+ analysis = classifier.predict_with_local_context(text)
438
 
439
  detailed_analysis = []
440
  for pred in analysis['sentence_predictions']: