# classifier.py
import time

import numpy as np
import torch

from model_loader import classifier_model
from paraphraser import paraphrase_comment
from metrics import (
    compute_semantic_similarity,
    compute_empathy_score,
    compute_bias_score,
    compute_hallucination_score,
)


def softmax(logits):
    """Numerically stable softmax over a 1-D array of logits."""
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / exp_logits.sum()


def _toxicity_probability(text):
    """Run the toxicity classifier on a single text and return P(toxic)."""
    inputs = classifier_model.tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    ).to(classifier_model.device)

    with torch.no_grad():
        outputs = classifier_model.model(**inputs)

    logits = outputs.logits.cpu().numpy()[0]
    probs = softmax(logits)
    return float(probs[1])  # Assuming label index 1 is "toxic"


def compute_reward_scores(original, paraphrased):
    """
    Compute all reward scores for a paraphrase.
    Returns a dictionary with empathy, toxicity, bias, hallucination, and overall reward.
    """
    try:
        # Get toxicity of the paraphrase directly from the classifier
        toxicity = _toxicity_probability(paraphrased)

        # Compute the other metrics, falling back to a neutral 0.5 when a metric returns None
        empathy = compute_empathy_score(paraphrased) or 0.5
        bias = compute_bias_score(paraphrased) or 0.5
        hallucination = compute_hallucination_score(original, paraphrased) or 0.5

        # Overall reward: weighted combination (adjust weights as needed), clamped to [0, 1]
        reward = (0.4 * empathy) - (0.2 * toxicity) - (0.2 * bias) - (0.2 * hallucination)
        reward = max(0.0, min(1.0, round(reward, 2)))

        return {
            "empathy": empathy,
            "toxicity": toxicity,
            "bias": bias,
            "hallucination": hallucination,
            "reward": reward
        }
    except Exception as e:
        print(f"Error computing reward scores: {str(e)}")
        return {
            "empathy": 0.5,
            "toxicity": 0.5,
            "bias": 0.5,
            "hallucination": 0.5,
            "reward": 0.5
        }


def classify_toxic_comment(comment):
    """
    Classify a comment for toxicity and compute additional metrics.
    Returns a dictionary with classification results and scores.
    """
    try:
        start_time = time.time()
        print("Starting classification...")

        # Classify using the toxicity classifier
        toxicity = _toxicity_probability(comment)
        print(f"Classification took {time.time() - start_time:.2f} seconds")

        # Compute additional metrics (empathy, bias, hallucination, reward),
        # using the comment as both original and paraphrase
        scores = compute_reward_scores(comment, comment)
        scores["toxicity"] = toxicity  # Override toxicity with the classifier result

        print(f"Total processing time: {time.time() - start_time:.2f} seconds")
        return scores
    except Exception as e:
        print(f"Error during classification: {str(e)}")
        return {
            "empathy": 0.0,
            "toxicity": 1.0,
            "bias": 1.0,
            "hallucination": 1.0,
            "reward": 0.0
        }
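

# Minimal usage sketch (an assumption, not part of the module's API): run as a
# script with the real `model_loader` and `metrics` modules available. The
# sample comment text below is purely illustrative.
if __name__ == "__main__":
    sample = "You clearly have no idea what you're talking about."
    scores = classify_toxic_comment(sample)
    print(f"Toxicity: {scores['toxicity']:.2f}, reward: {scores['reward']:.2f}")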