import time

import numpy as np
import torch

from model_loader import classifier_model
from paraphraser import paraphrase_comment
from metrics import (
    compute_semantic_similarity,
    compute_empathy_score,
    compute_bias_score,
    compute_hallucination_score,
)


def softmax(logits):
    """Numerically stable softmax over a 1-D array of logits."""
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / exp_logits.sum()
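

# Small helper (name introduced here) used by compute_reward_scores: it runs the
# toxicity classifier on a single text and returns P(toxic). It mirrors the
# forward pass in classify_toxic_comment below and, like that function, assumes
# class index 1 is the "toxic" label.
def _toxicity_probability(text):
    """Return the classifier's probability that `text` is toxic."""
    inputs = classifier_model.tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    ).to(classifier_model.device)
    with torch.no_grad():
        outputs = classifier_model.model(**inputs)
    probs = softmax(outputs.logits.cpu().numpy()[0])
    return float(probs[1])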


def compute_reward_scores(original, paraphrased):
    """
    Compute all reward scores for a paraphrase.
    Returns a dictionary with empathy, toxicity, bias, hallucination, and overall reward.
    """
    try:
        # Toxicity of the paraphrase comes straight from the classifier.
        toxicity = _toxicity_probability(paraphrased)

        # Fall back to a neutral 0.5 only when a metric returns None, so that a
        # genuine score of 0.0 is not silently replaced.
        empathy = compute_empathy_score(paraphrased)
        empathy = 0.5 if empathy is None else empathy
        bias = compute_bias_score(paraphrased)
        bias = 0.5 if bias is None else bias
        hallucination = compute_hallucination_score(original, paraphrased)
        hallucination = 0.5 if hallucination is None else hallucination

        # Reward empathy, penalize toxicity, bias, and hallucination, then clamp to [0, 1].
        reward = (0.4 * empathy) - (0.2 * toxicity) - (0.2 * bias) - (0.2 * hallucination)
        reward = max(0.0, min(1.0, round(reward, 2)))

        return {
            "empathy": empathy,
            "toxicity": toxicity,
            "bias": bias,
            "hallucination": hallucination,
            "reward": reward
        }
    except Exception as e:
        print(f"Error computing reward scores: {str(e)}")
        return {
            "empathy": 0.5,
            "toxicity": 0.5,
            "bias": 0.5,
            "hallucination": 0.5,
            "reward": 0.5
        }


def classify_toxic_comment(comment):
    """
    Classify a comment for toxicity and compute additional metrics.
    Returns a dictionary with classification results and scores.
    """
    try:
        start_time = time.time()
        print("Starting classification...")

        inputs = classifier_model.tokenizer(
            comment,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        ).to(classifier_model.device)

        with torch.no_grad():
            outputs = classifier_model.model(**inputs)
        logits = outputs.logits.cpu().numpy()[0]
        probs = softmax(logits)

        toxicity = probs[1]
        print(f"Classification took {time.time() - start_time:.2f} seconds")

        # Reuse the reward pipeline with the comment treated as its own paraphrase,
        # then keep the toxicity probability computed above.
        scores = compute_reward_scores(comment, comment)
        scores["toxicity"] = toxicity

        print(f"Total processing time: {time.time() - start_time:.2f} seconds")
        return scores
    except Exception as e:
        print(f"Error during classification: {str(e)}")
        return {
            "empathy": 0.0,
            "toxicity": 1.0,
            "bias": 1.0,
            "hallucination": 1.0,
            "reward": 0.0
        }
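

if __name__ == "__main__":
    # Minimal usage sketch; the example strings below are illustrative only.
    sample = "Nobody cares what you think, just stop posting."
    rewrite = "I don't find this argument convincing; could you share your reasoning?"
    print(classify_toxic_comment(sample))
    print(compute_reward_scores(sample, rewrite))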