ApsidalSolid4 committed
Commit 25f2b88 · verified · 1 Parent(s): 1bb7d9d

Update app.py

Files changed (1): app.py +73 -28
app.py CHANGED
@@ -3,11 +3,14 @@ import numpy as np
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch.nn.functional as F
 import spacy
-from typing import List, Dict
+from typing import List, Dict, Tuple
 import logging
 import os
 import gradio as gr
 from fastapi.middleware.cors import CORSMiddleware
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -18,7 +21,8 @@ MODEL_NAME = "microsoft/deberta-v3-small"
 WINDOW_SIZE = 17
 WINDOW_OVERLAP = 2
 CONFIDENCE_THRESHOLD = 0.65
-BATCH_SIZE = 16
+BATCH_SIZE = 8  # Reduced batch size for CPU
+MAX_WORKERS = 4  # Number of worker threads for processing
 
 class TextWindowProcessor:
     def __init__(self):
@@ -34,13 +38,15 @@ class TextWindowProcessor:
 
         disabled_pipes = [pipe for pipe in self.nlp.pipe_names if pipe != 'sentencizer']
         self.nlp.disable_pipes(*disabled_pipes)
+
+        # Initialize thread pool for parallel processing
+        self.executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
 
     def split_into_sentences(self, text: str) -> List[str]:
         doc = self.nlp(text)
         return [str(sent).strip() for sent in doc.sents]
 
     def create_windows(self, sentences: List[str], window_size: int, overlap: int) -> List[str]:
-        """Create overlapping windows for quick scan mode."""
        if len(sentences) < window_size:
            return [" ".join(sentences)]
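Note: the commit creates self.executor here, but no changed hunk submits work to it, and the new functools.partial import is likewise unused in this diff. A hypothetical use, assuming the pool is meant for preprocessing several texts at once (spaCy's own nlp.pipe may be the more idiomatic route for batching):

    # Hypothetical sketch, not part of this commit: split several documents
    # in parallel. executor.map preserves input order.
    def split_many(processor: TextWindowProcessor, texts: List[str]) -> List[List[str]]:
        return list(processor.executor.map(processor.split_into_sentences, texts))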
 
@@ -51,21 +57,18 @@ class TextWindowProcessor:
             windows.append(" ".join(window))
         return windows
 
-    def create_centered_windows(self, sentences: List[str], window_size: int) -> tuple[List[str], List[List[int]]]:
-        """Create centered windows for detailed analysis mode."""
+    def create_centered_windows(self, sentences: List[str], window_size: int) -> Tuple[List[str], List[List[int]]]:
+        """Create windows with better boundary handling"""
         windows = []
         window_sentence_indices = []
 
         for i in range(len(sentences)):
+            # Calculate window boundaries centered on current sentence
             half_window = window_size // 2
             start_idx = max(0, i - half_window)
             end_idx = min(len(sentences), i + half_window + 1)
 
-            if start_idx == 0:
-                end_idx = min(len(sentences), window_size)
-            elif end_idx == len(sentences):
-                start_idx = max(0, len(sentences) - window_size)
-
+            # Create the window
             window = sentences[start_idx:end_idx]
             windows.append(" ".join(window))
             window_sentence_indices.append(list(range(start_idx, end_idx)))
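The deleted branch padded edge windows out to the full window_size; after this change edge windows simply shrink. A hand-worked trace of the new logic (window_size=3 for brevity, five sentences):

    sentences = ["s0", "s1", "s2", "s3", "s4"]
    window_size = 3
    for i in range(len(sentences)):
        half = window_size // 2
        start, end = max(0, i - half), min(len(sentences), i + half + 1)
        print(i, list(range(start, end)))
    # 0 [0, 1]      <- previously padded to [0, 1, 2]
    # 1 [0, 1, 2]
    # 2 [1, 2, 3]
    # 3 [2, 3, 4]
    # 4 [3, 4]      <- previously padded to [2, 3, 4]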
@@ -75,12 +78,17 @@ class TextWindowProcessor:
 class TextClassifier:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        if self.device.type == 'cpu':
+            # Enable CPU optimizations
+            torch.set_num_threads(MAX_WORKERS)
+            torch.set_num_interop_threads(MAX_WORKERS)
+
         self.model_name = MODEL_NAME
         self.tokenizer = None
         self.model = None
         self.processor = TextWindowProcessor()
         self.initialize_model()
-
+
     def initialize_model(self):
         """Initialize the model and tokenizer."""
         logger.info("Initializing model and tokenizer...")
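One PyTorch caveat on the new CPU block: torch.set_num_interop_threads() may only be called once, and only before any inter-op parallel work has started; a second call (for example on a Space reload) raises a RuntimeError. A defensive variant, as a sketch:

    import torch

    MAX_WORKERS = 4  # mirrors the constant added above

    torch.set_num_threads(MAX_WORKERS)  # intra-op; safe to call at any time
    if torch.get_num_interop_threads() != MAX_WORKERS:
        try:
            torch.set_num_interop_threads(MAX_WORKERS)
        except RuntimeError:
            pass  # inter-op pool already started; keep the existing setting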
@@ -90,15 +98,19 @@ class TextClassifier:
         self.tokenizer = DebertaV2TokenizerFast.from_pretrained(
             self.model_name,
             model_max_length=MAX_LENGTH,
-            use_fast=False,
-            from_slow=True
+            use_fast=True
         )
 
         self.model = AutoModelForSequenceClassification.from_pretrained(
             self.model_name,
-            num_labels=2
+            num_labels=2,
+            torchscript=True  # Enable TorchScript optimization
         ).to(self.device)
 
+        if self.device.type == 'cpu':
+            self.model.eval()  # Ensure model is in eval mode for optimization
+            self.model = torch.jit.optimize_for_inference(torch.jit.script(self.model))
+
         model_path = "model_20250209_184929_acc1.0000.pt"
         if os.path.exists(model_path):
             logger.info(f"Loading custom model from {model_path}")
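A caution on the new TorchScript path: torchscript=True mainly switches the model to tuple outputs so it can be exported, and transformers models are normally exported with torch.jit.trace, since torch.jit.script tends to fail on their Python control flow. Tuple outputs would also break the later outputs.logits accesses, and the custom checkpoint is loaded after scripting, which is worth double-checking. A traced variant, as a sketch (the example input is arbitrary and only shapes the trace):

    example = self.tokenizer("example input", return_tensors="pt",
                             truncation=True, max_length=MAX_LENGTH)
    self.model.eval()
    with torch.no_grad():
        traced = torch.jit.trace(
            self.model, (example["input_ids"], example["attention_mask"])
        )
    self.model = torch.jit.optimize_for_inference(traced)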
@@ -123,7 +135,7 @@ class TextClassifier:
 
         predictions = []
 
-        # Process windows in batches
+        # Process windows in smaller batches for CPU efficiency
         for i in range(0, len(windows), BATCH_SIZE):
             batch_windows = windows[i:i + BATCH_SIZE]
 
@@ -148,7 +160,11 @@ class TextClassifier:
             }
             predictions.append(prediction)
 
-        # Calculate aggregate prediction
+            # Clean up GPU memory if available
+            del inputs, outputs, probs
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
         if not predictions:
             return {
                 'prediction': 'unknown',
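On the new cleanup block: del plus torch.cuda.empty_cache() releases cached blocks after the fact (and is correctly guarded, since empty_cache() only affects CUDA), but most of the memory comes from the autograd graph, which torch.inference_mode() avoids building at all. A sketch of the loop head under inference mode, assuming tokenizer arguments like those used elsewhere in the file:

    with torch.inference_mode():
        for i in range(0, len(windows), BATCH_SIZE):
            batch_windows = windows[i:i + BATCH_SIZE]
            inputs = self.tokenizer(batch_windows, return_tensors="pt",
                                    truncation=True, padding=True).to(self.device)
            outputs = self.model(**inputs)
            probs = F.softmax(outputs.logits, dim=-1)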
@@ -166,7 +182,7 @@ class TextClassifier:
         }
 
     def detailed_scan(self, text: str) -> Dict:
-        """Perform a detailed scan with sentence-level analysis."""
+        """Perform a detailed scan with improved sentence-level analysis."""
         if not text.strip():
             return {
                 'sentence_predictions': [],
@@ -207,18 +223,51 @@ class TextClassifier:
                 outputs = self.model(**inputs)
                 probs = F.softmax(outputs.logits, dim=-1)
 
+            # Attribute predictions with weighted scoring
             for window_idx, indices in enumerate(batch_indices):
-                for sent_idx in indices:
-                    sentence_appearances[sent_idx] += 1
-                    sentence_scores[sent_idx]['human_prob'] += probs[window_idx][1].item()
-                    sentence_scores[sent_idx]['ai_prob'] += probs[window_idx][0].item()
-
-        # Average the scores and create final sentence-level predictions
+                center_idx = len(indices) // 2
+                center_weight = 0.7  # Higher weight for center sentence
+                edge_weight = 0.3 / (len(indices) - 1)  # Distribute remaining weight
+
+                for pos, sent_idx in enumerate(indices):
+                    # Apply higher weight to center sentence
+                    weight = center_weight if pos == center_idx else edge_weight
+                    sentence_appearances[sent_idx] += weight
+                    sentence_scores[sent_idx]['human_prob'] += weight * probs[window_idx][1].item()
+                    sentence_scores[sent_idx]['ai_prob'] += weight * probs[window_idx][0].item()
+
+            # Clean up memory
+            del inputs, outputs, probs
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+        # Calculate final predictions with boundary smoothing
         sentence_predictions = []
         for i in range(len(sentences)):
             if sentence_appearances[i] > 0:
                 human_prob = sentence_scores[i]['human_prob'] / sentence_appearances[i]
                 ai_prob = sentence_scores[i]['ai_prob'] / sentence_appearances[i]
+
+                # Apply minimal smoothing at prediction boundaries
+                if i > 0 and i < len(sentences) - 1:
+                    prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
+                    prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
+                    next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
+                    next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
+
+                    # Check if we're at a prediction boundary
+                    current_pred = 'human' if human_prob > ai_prob else 'ai'
+                    prev_pred = 'human' if prev_human > prev_ai else 'ai'
+                    next_pred = 'human' if next_human > next_ai else 'ai'
+
+                    if current_pred != prev_pred or current_pred != next_pred:
+                        # Small adjustment at boundaries
+                        smooth_factor = 0.1
+                        human_prob = (human_prob * (1 - smooth_factor) +
+                                      (prev_human + next_human) * smooth_factor / 2)
+                        ai_prob = (ai_prob * (1 - smooth_factor) +
+                                   (prev_ai + next_ai) * smooth_factor / 2)
+
                 sentence_predictions.append({
                     'sentence': sentences[i],
                     'human_prob': human_prob,
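Two edge cases in the new weighting, as read from the diff: edge_weight = 0.3 / (len(indices) - 1) divides by zero for a single-sentence text (one window containing one index), and center_idx = len(indices) // 2 stops pointing at the anchor sentence once edge windows shrink (for the first sentence the window starts at index 0, so the midpoint of the truncated window, not sentence 0, receives the 0.7 weight). A guarded helper, as a sketch:

    def window_weights(indices: List[int], anchor_pos: int) -> List[float]:
        # anchor_pos should be i - start_idx, the position of the sentence the
        # window was centered on; len(indices) // 2 drifts at text boundaries.
        if len(indices) == 1:
            return [1.0]  # avoid 0.3 / 0 on one-sentence inputs
        edge = 0.3 / (len(indices) - 1)
        return [0.7 if pos == anchor_pos else edge for pos in range(len(indices))]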
@@ -282,7 +331,6 @@ class TextClassifier:
 def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
     """Analyze text using specified mode and return formatted results."""
     if mode == "quick":
-        # Quick scan
         result = classifier.quick_scan(text)
 
         quick_analysis = f"""
@@ -297,10 +345,8 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
             quick_analysis
         )
     else:
-        # Detailed scan
         analysis = classifier.detailed_scan(text)
 
-        # Format sentence-by-sentence analysis
         detailed_analysis = []
         for pred in analysis['sentence_predictions']:
             confidence = pred['confidence'] * 100
@@ -309,7 +355,6 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
             detailed_analysis.append(f"Confidence: {confidence:.1f}%")
             detailed_analysis.append("-" * 50)
 
-        # Format overall prediction
         final_pred = analysis['overall_prediction']
         overall_result = f"""
 FINAL PREDICTION: {final_pred['prediction'].upper()}
@@ -354,7 +399,7 @@ demo = gr.Interface(
         ["This is a sample text written by a human. It contains multiple sentences with different ideas. The analysis will show how each sentence is classified.", "detailed"],
     ],
     api_name="predict",
-    flagging_mode="never"  # Updated from allow_flagging
+    flagging_mode="never"
 )
 
 app = demo.app
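For completeness, a minimal local driver (hypothetical; the commit wires analyze_text through gr.Interface rather than calling it directly):

    classifier = TextClassifier()
    result = analyze_text(
        "This is a sample text. It has two sentences.", "detailed", classifier
    )
    print(result)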
 