ApsidalSolid4 committed on
Commit
33fd63d
·
verified ·
1 Parent(s): 9a1a827

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -103
app.py CHANGED
@@ -176,112 +176,112 @@ class TextClassifier:
176
  'num_windows': len(predictions)
177
  }
178
 
179
- def detailed_scan(self, text: str) -> Dict:
180
- """Original prediction method with modified window handling"""
181
- # Clean up trailing whitespace
182
- text = text.rstrip()
183
-
184
- if not text.strip():
185
- return {
186
- 'sentence_predictions': [],
187
- 'highlighted_text': '',
188
- 'full_text': '',
189
- 'overall_prediction': {
190
- 'prediction': 'unknown',
191
- 'confidence': 0.0,
192
- 'num_sentences': 0
193
- }
194
- }
195
-
196
- self.model.eval()
197
- sentences = self.processor.split_into_sentences(text)
198
- if not sentences:
199
- return {}
200
-
201
- # Create centered windows for each sentence
202
- windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
203
-
204
- # Track scores for each sentence
205
- sentence_appearances = {i: 0 for i in range(len(sentences))}
206
- sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
207
-
208
- # Process windows in batches
209
- batch_size = 16
210
- for i in range(0, len(windows), batch_size):
211
- batch_windows = windows[i:i + batch_size]
212
- batch_indices = window_sentence_indices[i:i + batch_size]
213
-
214
- inputs = self.tokenizer(
215
- batch_windows,
216
- truncation=True,
217
- padding=True,
218
- max_length=MAX_LENGTH,
219
- return_tensors="pt"
220
- ).to(self.device)
221
-
222
- with torch.no_grad():
223
- outputs = self.model(**inputs)
224
- probs = F.softmax(outputs.logits, dim=-1)
225
-
226
- # Attribute predictions with weighted scoring
227
- for window_idx, indices in enumerate(batch_indices):
228
- center_idx = len(indices) // 2
229
- center_weight = 0.7 # Higher weight for center sentence
230
- edge_weight = 0.3 / (len(indices) - 1) # Distribute remaining weight
231
-
232
- for pos, sent_idx in enumerate(indices):
233
- # Apply higher weight to center sentence
234
- weight = center_weight if pos == center_idx else edge_weight
235
- sentence_appearances[sent_idx] += weight
236
- sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
237
- sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
238
-
239
- # Clean up memory
240
- del inputs, outputs, probs
241
- if torch.cuda.is_available():
242
- torch.cuda.empty_cache()
243
-
244
- # Calculate final predictions with boundary smoothing
245
- sentence_predictions = []
246
- for i in range(len(sentences)):
247
- if sentence_appearances[i] > 0:
248
- human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
249
- ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
250
-
251
- # Only apply minimal smoothing at prediction boundaries
252
- if i > 0 and i < len(sentences) - 1:
253
- prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
254
- prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
255
- next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
256
- next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
257
-
258
- # Check if we're at a prediction boundary
259
- current_pred = 'human' if human_prob > ai_prob else 'ai'
260
- prev_pred = 'human' if prev_human > prev_ai else 'ai'
261
- next_pred = 'human' if next_human > next_ai else 'ai'
262
-
263
- if current_pred != prev_pred or current_pred != next_pred:
264
- # Small adjustment at boundaries
265
- smooth_factor = 0.1
266
- human_prob = (human_prob * (1 - smooth_factor) +
267
- (prev_human + next_human) * smooth_factor / 2)
268
- ai_prob = (ai_prob * (1 - smooth_factor) +
269
- (prev_ai + next_ai) * smooth_factor / 2)
270
-
271
- sentence_predictions.append({
272
- 'sentence': sentences[i],
273
- 'human_prob': human_prob,
274
- 'ai_prob': ai_prob,
275
- 'prediction': 'human' if human_prob > ai_prob else 'ai',
276
- 'confidence': max(human_prob, ai_prob)
277
- })
278
 
 
279
  return {
280
- 'sentence_predictions': sentence_predictions,
281
- 'highlighted_text': self.format_predictions_html(sentence_predictions),
282
- 'full_text': text,
283
- 'overall_prediction': self.aggregate_predictions(sentence_predictions)
 
 
 
 
284
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
 
286
 
287
  def format_predictions_html(self, sentence_predictions: List[Dict]) -> str:
 
176
  'num_windows': len(predictions)
177
  }
178
 
179
+ def detailed_scan(self, text: str) -> Dict:
180
+ """Original prediction method with modified window handling"""
181
+ # Clean up trailing whitespace
182
+ text = text.rstrip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
+ if not text.strip():
185
  return {
186
+ 'sentence_predictions': [],
187
+ 'highlighted_text': '',
188
+ 'full_text': '',
189
+ 'overall_prediction': {
190
+ 'prediction': 'unknown',
191
+ 'confidence': 0.0,
192
+ 'num_sentences': 0
193
+ }
194
  }
195
+
196
+ self.model.eval()
197
+ sentences = self.processor.split_into_sentences(text)
198
+ if not sentences:
199
+ return {}
200
+
201
+ # Create centered windows for each sentence
202
+ windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
203
+
204
+ # Track scores for each sentence
205
+ sentence_appearances = {i: 0 for i in range(len(sentences))}
206
+ sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
207
+
208
+ # Process windows in batches
209
+ batch_size = 16
210
+ for i in range(0, len(windows), batch_size):
211
+ batch_windows = windows[i:i + batch_size]
212
+ batch_indices = window_sentence_indices[i:i + batch_size]
213
+
214
+ inputs = self.tokenizer(
215
+ batch_windows,
216
+ truncation=True,
217
+ padding=True,
218
+ max_length=MAX_LENGTH,
219
+ return_tensors="pt"
220
+ ).to(self.device)
221
+
222
+ with torch.no_grad():
223
+ outputs = self.model(**inputs)
224
+ probs = F.softmax(outputs.logits, dim=-1)
225
+
226
+ # Attribute predictions with weighted scoring
227
+ for window_idx, indices in enumerate(batch_indices):
228
+ center_idx = len(indices) // 2
229
+ center_weight = 0.7 # Higher weight for center sentence
230
+ edge_weight = 0.3 / (len(indices) - 1) # Distribute remaining weight
231
+
232
+ for pos, sent_idx in enumerate(indices):
233
+ # Apply higher weight to center sentence
234
+ weight = center_weight if pos == center_idx else edge_weight
235
+ sentence_appearances[sent_idx] += weight
236
+ sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
237
+ sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
238
+
239
+ # Clean up memory
240
+ del inputs, outputs, probs
241
+ if torch.cuda.is_available():
242
+ torch.cuda.empty_cache()
243
+
244
+ # Calculate final predictions with boundary smoothing
245
+ sentence_predictions = []
246
+ for i in range(len(sentences)):
247
+ if sentence_appearances[i] > 0:
248
+ human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
249
+ ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
250
+
251
+ # Only apply minimal smoothing at prediction boundaries
252
+ if i > 0 and i < len(sentences) - 1:
253
+ prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
254
+ prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
255
+ next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
256
+ next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
257
+
258
+ # Check if we're at a prediction boundary
259
+ current_pred = 'human' if human_prob > ai_prob else 'ai'
260
+ prev_pred = 'human' if prev_human > prev_ai else 'ai'
261
+ next_pred = 'human' if next_human > next_ai else 'ai'
262
+
263
+ if current_pred != prev_pred or current_pred != next_pred:
264
+ # Small adjustment at boundaries
265
+ smooth_factor = 0.1
266
+ human_prob = (human_prob * (1 - smooth_factor) +
267
+ (prev_human + next_human) * smooth_factor / 2)
268
+ ai_prob = (ai_prob * (1 - smooth_factor) +
269
+ (prev_ai + next_ai) * smooth_factor / 2)
270
+
271
+ sentence_predictions.append({
272
+ 'sentence': sentences[i],
273
+ 'human_prob': human_prob,
274
+ 'ai_prob': ai_prob,
275
+ 'prediction': 'human' if human_prob > ai_prob else 'ai',
276
+ 'confidence': max(human_prob, ai_prob)
277
+ })
278
+
279
+ return {
280
+ 'sentence_predictions': sentence_predictions,
281
+ 'highlighted_text': self.format_predictions_html(sentence_predictions),
282
+ 'full_text': text,
283
+ 'overall_prediction': self.aggregate_predictions(sentence_predictions)
284
+ }
285
 
286
 
287
  def format_predictions_html(self, sentence_predictions: List[Dict]) -> str: