# classifier.py
import torch
from model_loader import classifier_model  # wrapper exposing the fine-tuned model and its tokenizer
from paraphraser import paraphrase_comment
from metrics import (
    compute_semantic_similarity,
    compute_emotion_shift,
    compute_empathy_score,
    compute_bleu_score,
    compute_rouge_score,
    compute_entailment_score,
)

def classify_toxic_comment(comment):
    """
    Classify a comment as toxic or non-toxic using the fine-tuned XLM-RoBERTa model.
    If toxic, paraphrase the comment, re-evaluate, and compute additional Stage 3 metrics.
    Returns the prediction label, confidence, color, toxicity score, bias score, paraphrased comment (if applicable), and its metrics.
    """
    if not comment.strip():
        # Return the error message plus None placeholders for the remaining 16 outputs,
        # mirroring the 17-element tuple of the success path.
        return ("Error: Please enter a comment.",) + (None,) * 16

    # Access the model and tokenizer
    model = classifier_model.model
    tokenizer = classifier_model.tokenizer

    # Tokenize the input comment
    inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Run inference
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # Get the predicted class (0 = non-toxic, 1 = toxic)
    predicted_class = torch.argmax(logits, dim=1).item()
    label = "Toxic" if predicted_class == 1 else "Non-Toxic"
    confidence = torch.softmax(logits, dim=1)[0][predicted_class].item()
    label_color = "red" if label == "Toxic" else "green"

    # Compute Toxicity Score (approximated as the probability of the toxic class)
    toxicity_score = torch.softmax(logits, dim=1)[0][1].item()
    toxicity_score = round(toxicity_score, 2)

    # Simulate Bias Score (placeholder)
    bias_score = 0.01 if label == "Non-Toxic" else 0.15
    bias_score = round(bias_score, 2)

    # If the comment is toxic, paraphrase it and compute additional metrics
    paraphrased_comment = None
    paraphrased_label = None  # initialised so the return expression never references an undefined name
    paraphrased_confidence = None
    paraphrased_color = None
    paraphrased_toxicity_score = None
    paraphrased_bias_score = None
    semantic_similarity = None
    original_emotion = None
    paraphrased_emotion = None
    emotion_shift_positive = None
    empathy_score = None
    bleu_score = None
    rouge_scores = None
    entailment_score = None

    if label == "Toxic":
        # Paraphrase the comment
        paraphrased_comment = paraphrase_comment(comment)

        # Re-evaluate the paraphrased comment
        paraphrased_inputs = tokenizer(paraphrased_comment, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            paraphrased_outputs = model(**paraphrased_inputs)
            paraphrased_logits = paraphrased_outputs.logits

        paraphrased_predicted_class = torch.argmax(paraphrased_logits, dim=1).item()
        paraphrased_label = "Toxic" if paraphrased_predicted_class == 1 else "Non-Toxic"
        paraphrased_confidence = torch.softmax(paraphrased_logits, dim=1)[0][paraphrased_predicted_class].item()
        paraphrased_color = "red" if paraphrased_label == "Toxic" else "green"
        paraphrased_toxicity_score = torch.softmax(paraphrased_logits, dim=1)[0][1].item()
        paraphrased_toxicity_score = round(paraphrased_toxicity_score, 2)
        paraphrased_bias_score = 0.01 if paraphrased_label == "Non-Toxic" else 0.15  # Placeholder
        paraphrased_bias_score = round(paraphrased_bias_score, 2)

        # Compute additional Stage 3 metrics
        semantic_similarity = compute_semantic_similarity(comment, paraphrased_comment)
        original_emotion, paraphrased_emotion, emotion_shift_positive = compute_emotion_shift(comment, paraphrased_comment)
        empathy_score = compute_empathy_score(paraphrased_comment)
        bleu_score = compute_bleu_score(comment, paraphrased_comment)
        rouge_scores = compute_rouge_score(comment, paraphrased_comment)
        entailment_score = compute_entailment_score(comment, paraphrased_comment)

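    # The outputs are returned as a flat 17-element tuple: the original-comment fields first,
    # then the paraphrase fields (all None when the comment is non-toxic), then the Stage 3 metrics.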
    return (
        f"Prediction: {label}", confidence, label_color, toxicity_score, bias_score,
        paraphrased_comment, f"Prediction: {paraphrased_label}" if paraphrased_comment else None,
        paraphrased_confidence, paraphrased_color, paraphrased_toxicity_score, paraphrased_bias_score,
        semantic_similarity, f"Original: {original_emotion}, Paraphrased: {paraphrased_emotion}, Positive Shift: {emotion_shift_positive}" if original_emotion else None,
        empathy_score, bleu_score, rouge_scores, entailment_score
    )
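

# Usage sketch (illustrative): a minimal example of calling classify_toxic_comment and
# unpacking its 17-element result tuple. It assumes the model_loader, paraphraser, and
# metrics modules are importable exactly as above and that the fine-tuned checkpoint is
# available locally; the sample comment below is a hypothetical placeholder.
if __name__ == "__main__":
    sample = "You are completely useless and everyone knows it."
    (prediction, confidence, color, toxicity, bias,
     paraphrase, paraphrase_prediction, paraphrase_confidence, paraphrase_color,
     paraphrase_toxicity, paraphrase_bias, similarity, emotion_summary,
     empathy, bleu, rouge, entailment) = classify_toxic_comment(sample)

    print(prediction, f"(confidence={confidence:.2f}, toxicity={toxicity}, bias={bias})")
    if paraphrase is not None:
        print("Paraphrase:", paraphrase)
        print(paraphrase_prediction, f"(toxicity={paraphrase_toxicity}, similarity={similarity})")
        print("Emotion:", emotion_summary, "| empathy:", empathy)
        print("BLEU:", bleu, "| ROUGE:", rouge, "| entailment:", entailment)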