ApsidalSolid4 committed on
Commit
59eee68
·
verified ·
1 Parent(s): 53f5f55

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -2
app.py CHANGED
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
18
  # Constants
19
  MAX_LENGTH = 512
20
  MODEL_NAME = "microsoft/deberta-v3-small"
21
- WINDOW_SIZE = 17
22
  WINDOW_OVERLAP = 2
23
  CONFIDENCE_THRESHOLD = 0.65
24
  BATCH_SIZE = 8 # Reduced batch size for CPU
@@ -176,6 +176,102 @@ class TextClassifier:
176
  'num_windows': len(predictions)
177
  }
178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  def detailed_scan(self, text: str) -> Dict:
180
  """Perform a detailed scan with improved sentence-level analysis."""
181
  if not text.strip():
@@ -340,7 +436,7 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
340
  quick_analysis
341
  )
342
  else:
343
- analysis = classifier.detailed_scan(text)
344
 
345
  detailed_analysis = []
346
  for pred in analysis['sentence_predictions']:
 
18
  # Constants
19
  MAX_LENGTH = 512
20
  MODEL_NAME = "microsoft/deberta-v3-small"
21
+ WINDOW_SIZE = 6
22
  WINDOW_OVERLAP = 2
23
  CONFIDENCE_THRESHOLD = 0.65
24
  BATCH_SIZE = 8 # Reduced batch size for CPU
 
176
  'num_windows': len(predictions)
177
  }
178
 
179
def predict_with_local_context(self, text: str) -> Dict:
    """Classify every sentence of *text* using a small neighbouring window.

    Each sentence is scored by running the model on the sentence together
    with at most one sentence before and one after it. A second pass then
    lightly blends a low-confidence sentence toward its neighbours when it
    sits on a human/AI prediction boundary.

    Args:
        text: Raw input text to analyse.

    Returns:
        A dict with the keys ``sentence_predictions`` (list of per-sentence
        dicts holding ``sentence``, ``human_prob``, ``ai_prob``,
        ``prediction`` and ``confidence``), ``highlighted_text``,
        ``full_text`` and ``overall_prediction``. When no sentences are
        found the same keys are returned with empty values, so callers can
        index the result unconditionally.
    """
    if self.model is None or self.tokenizer is None:
        self.load_model()

    self.model.eval()
    sentences = self.processor.split_into_sentences(text)
    if not sentences:
        # Keep the key set identical to the success path: callers index
        # result['sentence_predictions'] directly, so an empty dict would
        # raise KeyError on blank or unsplittable input.
        # NOTE(review): the 'overall_prediction' placeholder shape is an
        # assumption -- confirm it matches what aggregate_predictions returns.
        return {
            'sentence_predictions': [],
            'highlighted_text': '',
            'full_text': text,
            'overall_prediction': {
                'prediction': 'unknown',
                'confidence': 0.0,
                'num_sentences': 0,
            },
        }

    # CUDA availability does not change during the call; query it once.
    cuda_available = torch.cuda.is_available()

    # First pass: base prediction for each sentence from its local window.
    sentence_predictions = []
    for i, sentence in enumerate(sentences):
        # One sentence of context on each side; slicing clamps the upper
        # bound at the end of the list on its own.
        window = sentences[max(0, i - 1):i + 2]

        inputs = self.tokenizer(
            " ".join(window),
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
            return_tensors="pt",
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = F.softmax(outputs.logits, dim=-1)

        # Class index 1 is read as "human" and index 0 as "ai"
        # (assumes this matches the model's label order -- TODO confirm).
        human_prob = probs[0][1].item()
        ai_prob = probs[0][0].item()

        sentence_predictions.append({
            'sentence': sentence,
            'human_prob': human_prob,
            'ai_prob': ai_prob,
            'prediction': 'human' if human_prob > ai_prob else 'ai',
            'confidence': max(human_prob, ai_prob),
        })

        # Release per-window tensors before the next iteration.
        del inputs, outputs, probs
        if cuda_available:
            torch.cuda.empty_cache()

    # Second pass: minimal smoothing at prediction boundaries. Neighbours are
    # always read from the unsmoothed list so adjustments never cascade.
    weight = 0.15  # Small adjustment weight
    last_index = len(sentence_predictions) - 1
    smoothed_predictions = []
    for i, base_pred in enumerate(sentence_predictions):
        pred = base_pred.copy()

        # The first and last sentences lack a neighbour on one side and are
        # left untouched.
        if 0 < i < last_index:
            prev_pred = sentence_predictions[i - 1]
            next_pred = sentence_predictions[i + 1]

            at_boundary = (
                pred['prediction'] != prev_pred['prediction']
                or pred['prediction'] != next_pred['prediction']
            )

            if at_boundary:
                neighbor_conf = (
                    prev_pred['confidence'] + next_pred['confidence']
                ) / 2

                # Only nudge when the neighbours are confident on average
                # and this sentence is not.
                if neighbor_conf > 0.85 and pred['confidence'] < 0.75:
                    neighbor_human = (
                        prev_pred['human_prob'] + next_pred['human_prob']
                    ) / 2
                    neighbor_ai = (
                        prev_pred['ai_prob'] + next_pred['ai_prob']
                    ) / 2

                    pred['human_prob'] = (
                        pred['human_prob'] * (1 - weight)
                        + neighbor_human * weight
                    )
                    pred['ai_prob'] = (
                        pred['ai_prob'] * (1 - weight)
                        + neighbor_ai * weight
                    )

                    # Re-derive label and confidence from the blended values.
                    pred['prediction'] = (
                        'human' if pred['human_prob'] > pred['ai_prob'] else 'ai'
                    )
                    pred['confidence'] = max(pred['human_prob'], pred['ai_prob'])

        smoothed_predictions.append(pred)

    return {
        'sentence_predictions': smoothed_predictions,
        'highlighted_text': self.format_predictions_html(smoothed_predictions),
        'full_text': text,
        'overall_prediction': self.aggregate_predictions(smoothed_predictions),
    }
274
+
275
  def detailed_scan(self, text: str) -> Dict:
276
  """Perform a detailed scan with improved sentence-level analysis."""
277
  if not text.strip():
 
436
  quick_analysis
437
  )
438
  else:
439
+ analysis = classifier.predict_with_local_context(text)
440
 
441
  detailed_analysis = []
442
  for pred in analysis['sentence_predictions']: