# classifier.py
import time

import numpy as np
import torch

from model_loader import classifier_model
from paraphraser import paraphrase_comment
# compute_reward_scores is defined locally in this module (see below)
from metrics import (
    compute_semantic_similarity,
    compute_empathy_score,
    compute_bias_score,
    compute_hallucination_score,
)


def softmax(logits):
    """Numerically stable softmax over a 1-D array of logits."""
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / exp_logits.sum()
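
# Illustrative sanity check (approximate values, not tied to any particular model):
#   softmax(np.array([2.0, 0.0])) -> array([0.88079708, 0.11920292])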


# Shared classifier call used by both compute_reward_scores and classify_toxic_comment
def _classifier_toxicity(text):
    """Run the toxicity classifier on `text` and return the probability of the toxic class."""
    # Tokenize the text
    inputs = classifier_model.tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    ).to(classifier_model.device)
    # Classify using the toxicity classifier
    with torch.no_grad():
        outputs = classifier_model.model(**inputs)
    logits = outputs.logits.cpu().numpy()[0]
    probs = softmax(logits)
    return float(probs[1])  # Assuming label 1 is toxic


def compute_reward_scores(original, paraphrased):
    """
    Compute all reward scores for a paraphrase.
    Returns a dictionary with empathy, toxicity, bias, hallucination, and overall reward.
    """
    try:
        # Get toxicity of the paraphrase from the classifier
        toxicity = _classifier_toxicity(paraphrased)
        # Compute other metrics, falling back to a neutral 0.5 when a scorer returns None
        empathy = compute_empathy_score(paraphrased) or 0.5
        bias = compute_bias_score(paraphrased) or 0.5
        hallucination = compute_hallucination_score(original, paraphrased) or 0.5
        # Overall reward: weighted combination clamped to [0, 1] (adjust weights as needed)
        reward = (0.4 * empathy) - (0.2 * toxicity) - (0.2 * bias) - (0.2 * hallucination)
        reward = max(0.0, min(1.0, round(reward, 2)))
        return {
            "empathy": empathy,
            "toxicity": toxicity,
            "bias": bias,
            "hallucination": hallucination,
            "reward": reward
        }
    except Exception as e:
        print(f"Error computing reward scores: {e}")
        # Neutral fallback so callers always receive a complete score dictionary
        return {
            "empathy": 0.5,
            "toxicity": 0.5,
            "bias": 0.5,
            "hallucination": 0.5,
            "reward": 0.5
        }
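
# Example return value (illustrative numbers only; real scores depend on the loaded
# classifier and metric models, though the "reward" value follows the weighted formula above):
#   compute_reward_scores("you are an idiot", "I think this could be phrased more kindly")
#   -> {"empathy": 0.7, "toxicity": 0.1, "bias": 0.2, "hallucination": 0.3, "reward": 0.16}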


def classify_toxic_comment(comment):
    """
    Classify a comment for toxicity and compute additional metrics.
    Returns a dictionary with classification results and scores.
    """
    try:
        start_time = time.time()
        print("Starting classification...")
        # Classify using the toxicity classifier (tokenization happens inside the helper)
        toxicity = _classifier_toxicity(comment)
        print(f"Classification took {time.time() - start_time:.2f} seconds")
        # Compute additional metrics (empathy, bias, hallucination, reward);
        # the comment serves as both original and paraphrase for classification
        scores = compute_reward_scores(comment, comment)
        scores["toxicity"] = toxicity  # Override toxicity with the classifier result
        print(f"Total processing time: {time.time() - start_time:.2f} seconds")
        return scores
    except Exception as e:
        print(f"Error during classification: {e}")
        # Worst-case fallback: maximum toxicity/bias/hallucination, zero reward
        return {
            "empathy": 0.0,
            "toxicity": 1.0,
            "bias": 1.0,
            "hallucination": 1.0,
            "reward": 0.0
        }
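

if __name__ == "__main__":
    # Minimal smoke test, assuming model_loader exposes a loaded classifier_model and the
    # metrics module is importable; the sample comment below is illustrative only.
    sample = "This is a perfectly reasonable comment."
    print(classify_toxic_comment(sample))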