JanviMl committed on
Commit 1e828e9 · verified · 1 Parent(s): 7351455

Update classifier.py

Files changed (1)
  1. classifier.py +42 -79
classifier.py CHANGED
@@ -4,7 +4,14 @@ import time
  from model_loader import classifier_model
  from paraphraser import paraphrase_comment
  from metrics import compute_semantic_similarity, compute_empathy_score, compute_bias_score, compute_hallucination_score
+ from metrics import compute_reward_scores
+ import numpy as np

+ def softmax(logits):
+     exp_logits = np.exp(logits - np.max(logits))
+     return exp_logits / exp_logits.sum()
+
+
  def compute_reward_scores(original, paraphrased):
      """
      Compute all reward scores for a paraphrase.
@@ -43,88 +50,44 @@ def compute_reward_scores(original, paraphrased):

  def classify_toxic_comment(comment):
      """
-     Classify a comment as toxic or non-toxic using the fine-tuned XLM-RoBERTa model.
-     If toxic, paraphrase the comment, re-evaluate, and compute essential metrics.
-     Returns the prediction label, confidence, color, toxicity score, bias score, paraphrased comment (if applicable), and its metrics.
+     Classify a comment for toxicity and compute additional metrics.
+     Returns a dictionary with classification results and scores.
      """
-     start_total = time.time()
-     print("Starting classification...")
-
-     if not comment.strip():
-         return "Error: Please enter a comment.", None, None, None, None, None, None, None, None, None, None, None, None
-
-     # Access the model and tokenizer
-     model = classifier_model.model
-     tokenizer = classifier_model.tokenizer
-
-     # Tokenize the input comment
-     start_classification = time.time()
-     inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=512)
-
-     # Run inference
-     with torch.no_grad():
-         outputs = model(**inputs)
-         logits = outputs.logits
-
-     # Get the predicted class (0 = non-toxic, 1 = toxic)
-     predicted_class = torch.argmax(logits, dim=1).item()
-     label = "Toxic" if predicted_class == 1 else "Non-Toxic"
-     confidence = torch.softmax(logits, dim=1)[0][predicted_class].item()
-     label_color = "red" if label == "Toxic" else "green"
-
-     # Compute Toxicity Score (approximated as the probability of the toxic class)
-     toxicity_score = torch.softmax(logits, dim=1)[0][1].item()
-     toxicity_score = round(toxicity_score, 2)
-
-     # Simulate Bias Score (placeholder)
-     bias_score = 0.01 if label == "Non-Toxic" else 0.15
-     bias_score = round(bias_score, 2)
-     print(f"Classification took {time.time() - start_classification:.2f} seconds")
-
-     # If the comment is toxic, paraphrase it and compute essential metrics
-     paraphrased_comment = None
-     paraphrased_prediction = None
-     paraphrased_confidence = None
-     paraphrased_color = None
-     paraphrased_toxicity_score = None
-     paraphrased_bias_score = None
-     semantic_similarity = None
-     empathy_score = None
-
-     if label == "Toxic":
-         # Paraphrase the comment
-         start_paraphrase = time.time()
-         paraphrased_comment = paraphrase_comment(comment)
-         print(f"Paraphrasing took {time.time() - start_paraphrase:.2f} seconds")
-
-         # Re-evaluate the paraphrased comment
-         start_reclassification = time.time()
-         paraphrased_inputs = tokenizer(paraphrased_comment, return_tensors="pt", truncation=True, padding=True, max_length=512)
+     try:
+         start_time = time.time()
+         print("Starting classification...")
+
+         # Tokenize the comment
+         inputs = classifier_model.tokenizer(
+             comment,
+             return_tensors="pt",
+             truncation=True,
+             padding=True,
+             max_length=512
+         ).to(classifier_model.device)
+
+         # Classify using the toxicity classifier
          with torch.no_grad():
-             paraphrased_outputs = model(**paraphrased_inputs)
-             paraphrased_logits = paraphrased_outputs.logits
+             outputs = classifier_model.model(**inputs)
+             logits = outputs.logits.cpu().numpy()[0]
+             probs = softmax(logits)

-             paraphrased_predicted_class = torch.argmax(paraphrased_logits, dim=1).item()
-             paraphrased_label = "Toxic" if paraphrased_predicted_class == 1 else "Non-Toxic"
-             paraphrased_confidence = torch.softmax(paraphrased_logits, dim=1)[0][paraphrased_predicted_class].item()
-             paraphrased_color = "red" if paraphrased_label == "Toxic" else "green"
-             paraphrased_toxicity_score = torch.softmax(paraphrased_logits, dim=1)[0][1].item()
-             paraphrased_toxicity_score = round(paraphrased_toxicity_score, 2)
-             paraphrased_bias_score = 0.01 if paraphrased_label == "Non-Toxic" else 0.15  # Placeholder
-             paraphrased_bias_score = round(paraphrased_bias_score, 2)
-             print(f"Reclassification of paraphrased comment took {time.time() - start_reclassification:.2f} seconds")
+         toxicity = probs[1]  # Assuming label 1 is toxic
+         print(f"Classification took {time.time() - start_time:.2f} seconds")

-         # Compute essential metrics
-         start_metrics = time.time()
-         semantic_similarity = compute_semantic_similarity(comment, paraphrased_comment)
-         empathy_score = compute_empathy_score(paraphrased_comment)
-         print(f"Metrics computation took {time.time() - start_metrics:.2f} seconds")
+         # Compute additional metrics (empathy, bias, hallucination, reward)
+         scores = compute_reward_scores(comment, comment)  # Use comment as both original and paraphrase for classification
+         scores["toxicity"] = toxicity  # Override toxicity with classifier result

-     print(f"Total processing time: {time.time() - start_total:.2f} seconds")
+         print(f"Total processing time: {time.time() - start_time:.2f} seconds")
+         return scores

-     return (
-         f"Prediction: {label}", confidence, label_color, toxicity_score, bias_score,
-         paraphrased_comment, f"Prediction: {paraphrased_label}" if paraphrased_comment else None,
-         paraphrased_confidence, paraphrased_color, paraphrased_toxicity_score, paraphrased_bias_score,
-         semantic_similarity, empathy_score
-     )
+     except Exception as e:
+         print(f"Error during classification: {str(e)}")
+         return {
+             "empathy": 0.0,
+             "toxicity": 1.0,
+             "bias": 1.0,
+             "hallucination": 1.0,
+             "reward": 0.0
+         }
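
For reference, the NumPy softmax helper introduced in this commit behaves the same as the torch.softmax call it replaces, and classify_toxic_comment now returns a flat score dictionary rather than the old 13-element tuple. The sketch below is a minimal, self-contained illustration of both points, not code from the repo: the logits values are made up, and the non-toxicity entries in the example dictionary are placeholder values whose keys simply mirror the error fallback above.

import numpy as np
import torch

def softmax(logits):
    # Numerically stable softmax, same as the helper added in this commit.
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / exp_logits.sum()

# Hypothetical two-class logits, standing in for outputs.logits.cpu().numpy()[0]
logits = np.array([1.2, -0.7], dtype=np.float32)

np_probs = softmax(logits)
torch_probs = torch.softmax(torch.from_numpy(logits), dim=0).numpy()
assert np.allclose(np_probs, torch_probs, atol=1e-6)  # NumPy and torch agree

# Shape of the dictionary classify_toxic_comment now returns; only "toxicity"
# is derived here (probability of label 1), the other values are placeholders.
example_scores = {
    "empathy": 0.0,
    "toxicity": float(np_probs[1]),
    "bias": 0.0,
    "hallucination": 0.0,
    "reward": 0.0,
}
print(example_scores)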