# classifier.py
"""Toxicity classification and reward scoring for comment paraphrases."""
import time

import numpy as np
import torch

from model_loader import classifier_model
from paraphraser import paraphrase_comment
from metrics import compute_semantic_similarity, compute_empathy_score, compute_bias_score, compute_hallucination_score

def softmax(logits):
    """Numerically stable softmax over a 1-D array of logits."""
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / exp_logits.sum()


def score_toxicity(text):
    """
    Run the toxicity classifier on a single text and return the probability
    of the toxic class. Factored out of classify_toxic_comment so that
    compute_reward_scores can score toxicity without the two functions
    calling each other.
    """
    inputs = classifier_model.tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    ).to(classifier_model.device)

    with torch.no_grad():
        outputs = classifier_model.model(**inputs)
        logits = outputs.logits.cpu().numpy()[0]
        probs = softmax(logits)

    return float(probs[1])  # Assuming label index 1 is the toxic class


def compute_reward_scores(original, paraphrased):
    """
    Compute all reward scores for a paraphrase.
    Returns a dictionary with empathy, toxicity, bias, hallucination, and overall reward.
    """
    try:
        # Get toxicity of the paraphrase from the classifier
        toxicity = score_toxicity(paraphrased)

        # Compute the remaining metrics; "or 0.5" falls back to a neutral score if a metric returns None
        empathy = compute_empathy_score(paraphrased) or 0.5
        bias = compute_bias_score(paraphrased) or 0.5
        hallucination = compute_hallucination_score(original, paraphrased) or 0.5

        # Overall reward: Weighted combination (adjust weights as needed)
        reward = (0.4 * empathy) - (0.2 * toxicity) - (0.2 * bias) - (0.2 * hallucination)
        reward = max(0.0, min(1.0, round(reward, 2)))

        return {
            "empathy": empathy,
            "toxicity": toxicity,
            "bias": bias,
            "hallucination": hallucination,
            "reward": reward
        }
    except Exception as e:
        print(f"Error computing reward scores: {str(e)}")
        return {
            "empathy": 0.5,
            "toxicity": 0.5,
            "bias": 0.5,
            "hallucination": 0.5,
            "reward": 0.5
        }

def classify_toxic_comment(comment):
    """
    Classify a comment for toxicity and compute additional metrics.
    Returns a dictionary with classification results and scores.
    """
    try:
        start_time = time.time()
        print("Starting classification...")

        # Run the toxicity classifier on the comment (see score_toxicity above)
        toxicity = score_toxicity(comment)
        print(f"Classification took {time.time() - start_time:.2f} seconds")

        # Compute additional metrics (empathy, bias, hallucination, reward)
        scores = compute_reward_scores(comment, comment)  # Use comment as both original and paraphrase for classification
        scores["toxicity"] = toxicity  # Override toxicity with classifier result

        print(f"Total processing time: {time.time() - start_time:.2f} seconds")
        return scores

    except Exception as e:
        print(f"Error during classification: {str(e)}")
        return {
            "empathy": 0.0,
            "toxicity": 1.0,
            "bias": 1.0,
            "hallucination": 1.0,
            "reward": 0.0
        }
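

if __name__ == "__main__":
    # Example usage: a minimal sketch, assuming the model_loader and metrics
    # modules (and the underlying classifier weights) are available locally.
    # The sample strings below are illustrative only.
    original = "This is the dumbest idea I have ever heard."
    paraphrased = "I don't think this idea will work, and here's why."

    print("Classifier scores:", classify_toxic_comment(original))
    print("Paraphrase reward:", compute_reward_scores(original, paraphrased))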