# metrics.py
from model_loader import classifier_model, metrics_models
import torch
import numpy as np
import time

def softmax(logits):
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / exp_logits.sum()

def compute_semantic_similarity(original, paraphrase):
    """
    Compute semantic similarity between original and paraphrased text using Sentence-BERT.
    Returns a cosine similarity score, clamped to the range [0, 1].
    """
    try:
        if not isinstance(original, str) or not isinstance(paraphrase, str):
            print(f"Invalid input for semantic similarity: original={original}, paraphrase={paraphrase}")
            return 0.0
        if "Error: Unable to generate paraphrase" in paraphrase:
            print(f"Invalid paraphrase: {paraphrase}. Returning similarity 0.0.")
            return 0.0

        sentence_bert = metrics_models.sentence_bert
        embeddings = sentence_bert.encode([original, paraphrase], convert_to_tensor=True)
        similarity = torch.cosine_similarity(embeddings[0], embeddings[1], dim=0).item()
        # Cosine similarity lies in [-1, 1]; clamp it so downstream scores
        # (e.g. hallucination = 1 - similarity) stay within [0, 1].
        similarity = max(0.0, min(1.0, similarity))
        print(f"Semantic similarity computed: {similarity}")
        return similarity

    except Exception as e:
        print(f"Error computing semantic similarity: {str(e)}")
        return 0.0

def compute_empathy_score(toxicity):
    """
    Placeholder for empathy score computation.
    For now, inversely proportional to toxicity.
    """
    return 1.0 - toxicity

def compute_bias_score(toxicity):
    """
    Placeholder for bias score computation.
    For now, equal to toxicity.
    """
    return toxicity

def compute_hallucination_score(similarity):
    """
    Compute hallucination score based on semantic similarity.
    High difference means potential hallucination.
    """
    return 1.0 - similarity

def compute_reward_scores(original, paraphrase):
    """
    Compute reward scores for a paraphrased comment.
    Returns a dictionary with empathy, toxicity, bias, hallucination, and reward scores.
    """
    try:
        start_time = time.time()
        print("Starting reward computation...")

        # Check if paraphrase is valid
        if not isinstance(paraphrase, str) or "Error: Unable to generate paraphrase" in paraphrase:
            print(f"Invalid paraphrase: {paraphrase}. Returning default scores.")
            return {
                "empathy": 0.0,
                "toxicity": 1.0,
                "bias": 1.0,
                "hallucination": 1.0,
                "reward": 0.0
            }

        # Classify the paraphrased comment
        print("Starting classification...")
        inputs = classifier_model.tokenizer(
            paraphrase,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        ).to(classifier_model.device)

        with torch.no_grad():
            outputs = classifier_model.model(**inputs)
            logits = outputs.logits.cpu().numpy()[0]
            probs = softmax(logits)

        toxicity = float(probs[1])  # Assuming label 1 is the "toxic" class; cast the numpy scalar to a plain float
        empathy = compute_empathy_score(toxicity)
        bias = compute_bias_score(toxicity)
        print(f"Classification took {time.time() - start_time:.2f} seconds")

        # Compute semantic similarity using Sentence-BERT
        print("Computing semantic similarity...")
        similarity = compute_semantic_similarity(original, paraphrase)
        hallucination = compute_hallucination_score(similarity)

        # Combine the components into a single reward: empathy contributes
        # positively; toxicity, bias, and hallucination act as penalties.
        # Clamp to [0, 1], since the raw weighted sum can fall below zero.
        reward = 0.4 * empathy - 0.2 * toxicity - 0.2 * bias - 0.2 * hallucination
        reward = max(0.0, min(1.0, reward))

        print(f"Total processing time: {time.time() - start_time:.2f} seconds")
        return {
            "empathy": empathy,
            "toxicity": toxicity,
            "bias": bias,
            "hallucination": hallucination,
            "reward": reward
        }

    except Exception as e:
        print(f"Error in reward computation: {str(e)}")
        return {
            "empathy": 0.0,
            "toxicity": 1.0,
            "bias": 1.0,
            "hallucination": 1.0,
            "reward": 0.0
        }
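

# Illustrative usage sketch (not part of the original module). It assumes
# model_loader loads classifier_model and metrics_models at import time,
# as the imports above imply, and that the inputs are plain strings.
if __name__ == "__main__":
    original = "Your argument makes no sense at all."
    paraphrase = "I see your point differently; could you explain your reasoning?"
    scores = compute_reward_scores(original, paraphrase)
    for name, value in scores.items():
        print(f"{name}: {value:.3f}")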