# classifier.py
import time

import numpy as np
import torch

from model_loader import classifier_model
from paraphraser import paraphrase_comment
from metrics import (
    compute_semantic_similarity,
    compute_empathy_score,
    compute_bias_score,
    compute_hallucination_score,
)


def softmax(logits):
    """Convert raw classifier logits into probabilities (numerically stable)."""
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / exp_logits.sum()
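
# Illustrative sanity check for softmax (approximate values, documentation only):
#   softmax(np.array([2.0, 0.5])) ≈ array([0.818, 0.182])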

def compute_reward_scores(original, paraphrased):
    """
    Compute all reward scores for a paraphrase.
    Returns a dictionary with empathy, toxicity, bias, hallucination, and overall reward.
    """
    try:
        # Score the paraphrase for toxicity with the classifier
        inputs = classifier_model.tokenizer(
            paraphrased,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        ).to(classifier_model.device)
        with torch.no_grad():
            logits = classifier_model.model(**inputs).logits.cpu().numpy()[0]
        toxicity = float(softmax(logits)[1])  # Label 1 is assumed to be "toxic"

        # Compute the remaining metrics, defaulting to a neutral 0.5 when a score is unavailable
        empathy = compute_empathy_score(paraphrased) or 0.5
        bias = compute_bias_score(paraphrased) or 0.5
        hallucination = compute_hallucination_score(original, paraphrased) or 0.5

        # Overall reward: weighted combination (adjust weights as needed), clamped to [0.0, 1.0]
        reward = (0.4 * empathy) - (0.2 * toxicity) - (0.2 * bias) - (0.2 * hallucination)
        reward = max(0.0, min(1.0, round(reward, 2)))

        return {
            "empathy": empathy,
            "toxicity": toxicity,
            "bias": bias,
            "hallucination": hallucination,
            "reward": reward,
        }
    except Exception as e:
        print(f"Error computing reward scores: {str(e)}")
        # Neutral fallback scores when any metric or the classifier fails
        return {
            "empathy": 0.5,
            "toxicity": 0.5,
            "bias": 0.5,
            "hallucination": 0.5,
            "reward": 0.5,
        }
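
# Worked example of the reward weighting above (hypothetical metric values):
#   empathy=0.9, toxicity=0.1, bias=0.1, hallucination=0.2
#   reward = 0.4*0.9 - 0.2*0.1 - 0.2*0.1 - 0.2*0.2 = 0.36 - 0.02 - 0.02 - 0.04 = 0.28
#   which already lies inside the [0.0, 1.0] clamp.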

def classify_toxic_comment(comment):
    """
    Classify a comment for toxicity and compute additional metrics.
    Returns a dictionary with classification results and scores.
    """
    try:
        start_time = time.time()
        print("Starting classification...")

        # Tokenize the comment
        inputs = classifier_model.tokenizer(
            comment,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        ).to(classifier_model.device)

        # Classify using the toxicity classifier
        with torch.no_grad():
            outputs = classifier_model.model(**inputs)
        logits = outputs.logits.cpu().numpy()[0]
        probs = softmax(logits)
        toxicity = float(probs[1])  # Label 1 is assumed to be "toxic"
        print(f"Classification took {time.time() - start_time:.2f} seconds")

        # Compute the additional metrics (empathy, bias, hallucination, reward),
        # using the comment as both original and paraphrase since no paraphrase exists yet
        scores = compute_reward_scores(comment, comment)
        scores["toxicity"] = toxicity  # Override toxicity with the classifier result above
        print(f"Total processing time: {time.time() - start_time:.2f} seconds")
        return scores
    except Exception as e:
        print(f"Error during classification: {str(e)}")
        # Worst-case fallback: treat the comment as fully toxic with zero reward
        return {
            "empathy": 0.0,
            "toxicity": 1.0,
            "bias": 1.0,
            "hallucination": 1.0,
            "reward": 0.0,
        }
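
# Minimal usage sketch, assuming model_loader and metrics resolve in this
# environment; the sample comment string below is purely illustrative.
if __name__ == "__main__":
    sample = "Nobody cares about your opinion, just stop posting."
    scores = classify_toxic_comment(sample)
    print("Toxicity:", scores["toxicity"])
    print("Overall reward:", scores["reward"])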