# metrics.py

import time

import numpy as np
import torch

from model_loader import classifier_model, metrics_models


def softmax(logits):
    """Numerically stable softmax over a 1-D array of logits."""
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / exp_logits.sum()


def compute_semantic_similarity(original, paraphrase):
    """
    Compute semantic similarity between original and paraphrased text using Sentence-BERT.
    Returns the cosine similarity between the two embeddings (typically between 0 and 1).
    """
    try:
        if not isinstance(original, str) or not isinstance(paraphrase, str):
            print(f"Invalid input for semantic similarity: original={original}, paraphrase={paraphrase}")
            return 0.0
        if "Error: Unable to generate paraphrase" in paraphrase:
            print(f"Invalid paraphrase: {paraphrase}. Returning similarity 0.0.")
            return 0.0
        sentence_bert = metrics_models.sentence_bert
        embeddings = sentence_bert.encode([original, paraphrase], convert_to_tensor=True)
        similarity = torch.cosine_similarity(embeddings[0], embeddings[1], dim=0).item()
        print(f"Semantic similarity computed: {similarity}")
        return similarity
    except Exception as e:
        print(f"Error computing semantic similarity: {str(e)}")
        return 0.0


def compute_empathy_score(toxicity):
    """
    Placeholder for empathy score computation.
    For now, inversely proportional to toxicity.
    """
    return 1.0 - toxicity


def compute_bias_score(toxicity):
    """
    Placeholder for bias score computation.
    For now, equal to toxicity.
    """
    return toxicity


def compute_hallucination_score(similarity):
    """
    Compute hallucination score based on semantic similarity.
    A large difference indicates potential hallucination.
    """
    return 1.0 - similarity


def compute_reward_scores(original, paraphrase):
    """
    Compute reward scores for a paraphrased comment.
    Returns a dictionary with empathy, toxicity, bias, hallucination, and reward scores.
    """
    try:
        start_time = time.time()
        print("Starting reward computation...")

        # Check whether the paraphrase is valid before scoring it
        if not isinstance(paraphrase, str) or "Error: Unable to generate paraphrase" in paraphrase:
            print(f"Invalid paraphrase: {paraphrase}. Returning default scores.")
            return {
                "empathy": 0.0,
                "toxicity": 1.0,
                "bias": 1.0,
                "hallucination": 1.0,
                "reward": 0.0
            }

        # Classify the paraphrased comment with the toxicity classifier
        print("Starting classification...")
        inputs = classifier_model.tokenizer(
            paraphrase,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        ).to(classifier_model.device)
        with torch.no_grad():
            outputs = classifier_model.model(**inputs)
        logits = outputs.logits.cpu().numpy()[0]
        probs = softmax(logits)
        toxicity = float(probs[1])  # Assuming label index 1 corresponds to the toxic class
        empathy = compute_empathy_score(toxicity)
        bias = compute_bias_score(toxicity)
        print(f"Classification took {time.time() - start_time:.2f} seconds")

        # Compute semantic similarity using Sentence-BERT
        print("Computing semantic similarity...")
        similarity = compute_semantic_similarity(original, paraphrase)
        hallucination = compute_hallucination_score(similarity)

        # Compute the reward score (weighted combination), clamped to [0, 1]
        reward = 0.4 * empathy - 0.2 * toxicity - 0.2 * bias - 0.2 * hallucination
        reward = max(0.0, min(1.0, reward))

        print(f"Total processing time: {time.time() - start_time:.2f} seconds")
        return {
            "empathy": empathy,
            "toxicity": toxicity,
            "bias": bias,
            "hallucination": hallucination,
            "reward": reward
        }
    except Exception as e:
        print(f"Error in reward computation: {str(e)}")
        return {
            "empathy": 0.0,
            "toxicity": 1.0,
            "bias": 1.0,
            "hallucination": 1.0,
            "reward": 0.0
        }
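

if __name__ == "__main__":
    # Minimal usage sketch, assuming model_loader has already loaded the
    # classifier and Sentence-BERT models at import time. The two strings
    # below are illustrative examples only, not taken from any dataset.
    original_comment = "Your argument makes no sense at all."
    paraphrased_comment = "I see it differently; could you explain your reasoning a bit more?"

    scores = compute_reward_scores(original_comment, paraphrased_comment)
    for name, value in scores.items():
        print(f"{name}: {value:.3f}")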