ApsidalSolid4 committed
Commit 13fd1cb · verified · Parent: 97a3e71

Update app.py

Files changed (1): app.py (+51, -26)
app.py CHANGED
@@ -51,25 +51,22 @@ class TextWindowProcessor:
             windows.append(" ".join(window))
         return windows
 
-    def create_centered_windows(self, sentences: List[str], window_size: int) -> tuple[List[str], List[List[int]]]:
-        """Create centered windows for detailed analysis mode."""
+    def create_centered_windows(self, sentences: List[str], window_size: int) -> Tuple[List[str], List[List[int]]]:
+        """Create windows with better boundary handling"""
         windows = []
         window_sentence_indices = []
-
+
         for i in range(len(sentences)):
+            # Calculate window boundaries centered on current sentence
             half_window = window_size // 2
             start_idx = max(0, i - half_window)
             end_idx = min(len(sentences), i + half_window + 1)
-
-            if start_idx == 0:
-                end_idx = min(len(sentences), window_size)
-            elif end_idx == len(sentences):
-                start_idx = max(0, len(sentences) - window_size)
-
+
+            # Create the window
             window = sentences[start_idx:end_idx]
             windows.append(" ".join(window))
             window_sentence_indices.append(list(range(start_idx, end_idx)))
-
+
         return windows, window_sentence_indices
 
 class TextClassifier:
@@ -166,7 +163,7 @@ class TextClassifier:
         }
 
     def detailed_scan(self, text: str) -> Dict:
-        """Perform a detailed scan with sentence-level analysis."""
+        """Perform a detailed scan with sentence-level analysis and improved boundary handling."""
         if not text.strip():
             return {
                 'sentence_predictions': [],
@@ -178,23 +175,23 @@ class TextClassifier:
                     'num_sentences': 0
                 }
             }
-
+
         sentences = self.processor.split_into_sentences(text)
         if not sentences:
             return {}
-
+
         # Create centered windows for each sentence
         windows, window_sentence_indices = self.processor.create_centered_windows(sentences, WINDOW_SIZE)
-
+
         # Track scores for each sentence
         sentence_appearances = {i: 0 for i in range(len(sentences))}
         sentence_scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(len(sentences))}
-
+
         # Process windows in batches
         for i in range(0, len(windows), BATCH_SIZE):
             batch_windows = windows[i:i + BATCH_SIZE]
             batch_indices = window_sentence_indices[i:i + BATCH_SIZE]
-
+
             inputs = self.tokenizer(
                 batch_windows,
                 truncation=True,
@@ -202,23 +199,51 @@ class TextClassifier:
                 max_length=MAX_LENGTH,
                 return_tensors="pt"
             ).to(self.device)
-
+
             with torch.no_grad():
                 outputs = self.model(**inputs)
                 probs = F.softmax(outputs.logits, dim=-1)
-
+
+            # Attribute predictions with center-weighted approach
             for window_idx, indices in enumerate(batch_indices):
-                for sent_idx in indices:
-                    sentence_appearances[sent_idx] += 1
-                    sentence_scores[sent_idx]['human_prob'] += probs[window_idx][1].item()
-                    sentence_scores[sent_idx]['ai_prob'] += probs[window_idx][0].item()
-
-        # Average the scores and create final sentence-level predictions
+                center_idx = indices.index(i + window_idx)  # Position of the window's target sentence (windows are built one per sentence)
+                center_weight = 0.7  # Higher weight for center sentence
+                edge_weight = 0.3 / (len(indices) - 1) if len(indices) > 1 else 0.0  # Distribute remaining weight (guarded for single-sentence windows)
+
+                for pos, sent_idx in enumerate(indices):
+                    # Apply higher weight to center sentence
+                    weight = center_weight if pos == center_idx else edge_weight
+                    sentence_appearances[sent_idx] += weight
+                    sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
+                    sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
+
+        # Calculate final predictions with boundary smoothing
         sentence_predictions = []
         for i in range(len(sentences)):
             if sentence_appearances[i] > 0:
                 human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
                 ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
+
+                # Apply minimal smoothing at prediction boundaries
+                if i > 0 and i < len(sentences) - 1:
+                    prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
+                    prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
+                    next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
+                    next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
+
+                    # Check if we're at a prediction boundary
+                    current_pred = 'human' if human_prob > ai_prob else 'ai'
+                    prev_pred = 'human' if prev_human > prev_ai else 'ai'
+                    next_pred = 'human' if next_human > next_ai else 'ai'
+
+                    if current_pred != prev_pred or current_pred != next_pred:
+                        # Small adjustment at boundaries
+                        smooth_factor = 0.1
+                        human_prob = (human_prob * (1 - smooth_factor) +
+                                      (prev_human + next_human) * smooth_factor / 2)
+                        ai_prob = (ai_prob * (1 - smooth_factor) +
+                                   (prev_ai + next_ai) * smooth_factor / 2)
+
                 sentence_predictions.append({
                     'sentence': sentences[i],
                     'human_prob': human_prob,
@@ -226,7 +251,7 @@ class TextClassifier:
                     'prediction': 'human' if human_prob > ai_prob else 'ai',
                     'confidence': max(human_prob, ai_prob)
                 })
-
+
         return {
             'sentence_predictions': sentence_predictions,
             'highlighted_text': self.format_predictions_html(sentence_predictions),
 
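For reference, the reworked `create_centered_windows` lets windows shrink at document edges rather than snapping them out to a full `window_size` as the old branch did. A minimal standalone sketch of the same logic (the toy `sentences` list is illustrative, not from the app):

```python
from typing import List, Tuple

def create_centered_windows(sentences: List[str], window_size: int) -> Tuple[List[str], List[List[int]]]:
    """One window per sentence, centered on it; windows shrink at the edges."""
    windows, window_sentence_indices = [], []
    half_window = window_size // 2
    for i in range(len(sentences)):
        start_idx = max(0, i - half_window)
        end_idx = min(len(sentences), i + half_window + 1)
        windows.append(" ".join(sentences[start_idx:end_idx]))
        window_sentence_indices.append(list(range(start_idx, end_idx)))
    return windows, window_sentence_indices

sentences = ["S0.", "S1.", "S2.", "S3.", "S4."]  # toy input
_, indices = create_centered_windows(sentences, window_size=3)
print(indices)  # [[0, 1], [0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4]]
```

Under the old branch the first and last windows would have been padded out to `[0, 1, 2]` and `[2, 3, 4]`; the new version keeps each window genuinely centered on its target sentence, at the cost of smaller windows at the edges.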
 
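The center-weighted attribution added to `detailed_scan` can be read in isolation: each window contributes a total weight of 1.0, with 0.7 going to its target sentence and the remaining 0.3 split evenly over the neighbors. A sketch of that accounting (the probabilities and the small dicts here are made-up stand-ins for the real accumulators):

```python
def attribute_window(indices, target, human_prob, ai_prob, appearances, scores):
    """Spread one window's probabilities over its sentences, favoring the target."""
    center_pos = indices.index(target)
    center_weight = 0.7
    edge_weight = 0.3 / (len(indices) - 1) if len(indices) > 1 else 0.0
    for pos, sent_idx in enumerate(indices):
        weight = center_weight if pos == center_pos else edge_weight
        appearances[sent_idx] += weight  # total weight seen; used later as the divisor
        scores[sent_idx]['human_prob'] += weight * human_prob
        scores[sent_idx]['ai_prob'] += weight * ai_prob

appearances = {i: 0.0 for i in range(3)}
scores = {i: {'human_prob': 0.0, 'ai_prob': 0.0} for i in range(3)}
attribute_window([0, 1, 2], target=1, human_prob=0.9, ai_prob=0.1,
                 appearances=appearances, scores=scores)
print(appearances)  # {0: 0.15, 1: 0.7, 2: 0.15} -- weights sum to 1.0 per window
```

Because per-window weights sum to 1.0, `sentence_appearances` accumulates the total weight each sentence has received, so the later division produces a weighted average rather than a plain mean.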
 
199
  max_length=MAX_LENGTH,
200
  return_tensors="pt"
201
  ).to(self.device)
202
+
203
  with torch.no_grad():
204
  outputs = self.model(**inputs)
205
  probs = F.softmax(outputs.logits, dim=-1)
206
+
207
+ # Attribute predictions with center-weighted approach
208
  for window_idx, indices in enumerate(batch_indices):
209
+ center_idx = len(indices) // 2
210
+ center_weight = 0.7 # Higher weight for center sentence
211
+ edge_weight = 0.3 / (len(indices) - 1) # Distribute remaining weight
212
+
213
+ for pos, sent_idx in enumerate(indices):
214
+ # Apply higher weight to center sentence
215
+ weight = center_weight if pos == center_idx else edge_weight
216
+ sentence_appearances[sent_idx] += weight
217
+ sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
218
+ sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
219
+
220
+ # Calculate final predictions with boundary smoothing
221
  sentence_predictions = []
222
  for i in range(len(sentences)):
223
  if sentence_appearances[i] > 0:
224
  human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
225
  ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
226
+
227
+ # Apply minimal smoothing at prediction boundaries
228
+ if i > 0 and i < len(sentences) - 1:
229
+ prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
230
+ prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
231
+ next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
232
+ next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
233
+
234
+ # Check if we're at a prediction boundary
235
+ current_pred = 'human' if human_prob > ai_prob else 'ai'
236
+ prev_pred = 'human' if prev_human > prev_ai else 'ai'
237
+ next_pred = 'human' if next_human > next_ai else 'ai'
238
+
239
+ if current_pred != prev_pred or current_pred != next_pred:
240
+ # Small adjustment at boundaries
241
+ smooth_factor = 0.1
242
+ human_prob = (human_prob * (1 - smooth_factor) +
243
+ (prev_human + next_human) * smooth_factor / 2)
244
+ ai_prob = (ai_prob * (1 - smooth_factor) +
245
+ (prev_ai + next_ai) * smooth_factor / 2)
246
+
247
  sentence_predictions.append({
248
  'sentence': sentences[i],
249
  'human_prob': human_prob,
 
251
  'prediction': 'human' if human_prob > ai_prob else 'ai',
252
  'confidence': max(human_prob, ai_prob)
253
  })
254
+
255
  return {
256
  'sentence_predictions': sentence_predictions,
257
  'highlighted_text': self.format_predictions_html(sentence_predictions),
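The boundary smoothing at the end of `detailed_scan` is a convex blend: with `smooth_factor = 0.1`, a sentence sitting at a prediction flip gets `p' = 0.9 * p + 0.05 * (p_prev + p_next)`, nudging isolated label flips toward their neighbors without overriding a confident score. A small numeric check (the probabilities are invented):

```python
smooth_factor = 0.1

def smooth(p_prev, p_cur, p_next):
    """Blend a boundary sentence's probability with its two neighbors'."""
    return p_cur * (1 - smooth_factor) + (p_prev + p_next) * smooth_factor / 2

# An 'ai'-leaning sentence (human_prob 0.4) between two 'human' neighbors
# is pulled slightly toward them:
print(smooth(0.8, 0.4, 0.7))  # 0.9*0.4 + 0.05*(0.8 + 0.7) = 0.435
```

Since the blend only fires where the hard label disagrees with a neighbor, long uniform runs of 'human' or 'ai' sentences are left untouched.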