import pandas as pd
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import warnings
warnings.filterwarnings("ignore")

# Load the human evaluation dataset
df = pd.read_excel("final_comments_evaluations_latest.xlsx")
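
# Sanity check: the refinement loop below reads these columns from the sheet
# (names taken from the row accesses further down; adjust if the sheet differs).
expected_cols = {"Comment", "Paraphrase_Comment", "reward_score", "toxicity",
                 "empathy", "bias", "hallucination", "Human_Evaluation_Reasoning"}
missing = expected_cols - set(df.columns)
if missing:
    raise ValueError(f"Missing expected columns in the evaluation sheet: {missing}")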

# Initialize the Granite 3.2-2B-Instruct model and tokenizer (same setup as the existing paraphrasing pipeline)
model_name = "ibm-granite/granite-3.2-2b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()  # inference only; no weights are updated in this script

# Define a simple reward model (mockup based on dataset metrics)
# In practice, this would be the trained reward model from Stage 3
def reward_model(paraphrase, original_scores):
    # Mock reward calculation: adjust scores based on trends in the dataset
    base_toxicity = original_scores["toxicity"]
    base_empathy = original_scores["empathy"]
    # Simulate improved paraphrasing: reduce toxicity, increase empathy
    new_toxicity = max(0.1, base_toxicity - 0.2)  # Reduce toxicity
    new_empathy = min(0.9, base_empathy + 0.1)   # Increase empathy
    new_bias = original_scores["bias"]
    new_hallucination = max(0.1, original_scores["hallucination"] - 0.1)
    # Composite reward score (weights based on dataset analysis)
    reward = 0.4 * new_empathy - 0.3 * new_toxicity - 0.2 * new_bias - 0.1 * new_hallucination
    return reward, {"toxicity": new_toxicity, "empathy": new_empathy, "bias": new_bias, "hallucination": new_hallucination}
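
# The mock above only perturbs the stored scores and never looks at the
# paraphrase text. A hedged sketch of how the trained Stage 3 reward model
# could score the text directly instead: a sequence-classification head that
# outputs a single scalar. The checkpoint name "reward_model_stage3" and the
# single-logit head are assumptions, not artifacts of this script, so the
# loading lines are shown only as comments.
#   from transformers import AutoModelForSequenceClassification
#   rm_tokenizer = AutoTokenizer.from_pretrained("reward_model_stage3")
#   rm_model = AutoModelForSequenceClassification.from_pretrained(
#       "reward_model_stage3", num_labels=1).to(device)
def learned_reward(paraphrase, rm_model, rm_tokenizer):
    rm_inputs = rm_tokenizer(paraphrase, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        # A num_labels=1 head yields one logit, used directly as the reward
        score = rm_model(**rm_inputs).logits.squeeze().item()
    return score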

# Function to generate a paraphrase using the existing paraphrasing prompt.
# max_input_length bounds the tokenized prompt; the few-shot examples alone
# exceed 128 tokens, so 512 keeps the target comment from being truncated.
# Set do_sample=True to get varied candidates in the refinement loop below.
def generate_paraphrase(comment, max_input_length=512, do_sample=False):
    prompt = (
        "You are a content moderator tasked with rewriting toxic comments into neutral and constructive ones while maintaining the original meaning. "
        "Follow these guidelines:\n"
        "- Remove explicit hate speech, personal attacks, or offensive language.\n"
        "- Keep the response neutral and professional.\n"
        "- Ensure the rewritten comment retains the original intent but in a constructive tone.\n"
        "- Match the length and brevity of the original toxic comment whenever possible. Keep the response short and to the point.\n\n"
        "Examples:\n"
        "Toxic: \"You're so dumb! You never understand anything!\"\n"
        "Neutral: \"You might be misunderstanding this.\"\n"
        "Toxic: \"This is the worst idea ever. Only an idiot would suggest this.\"\n"
        "Neutral: \"I don’t think this idea works well.\"\n"
        "Toxic: \"You’re useless.\"\n"
        "Neutral: \"This isn’t helping much.\"\n"
        "Toxic: \"Shut up.\"\n"
        "Neutral: \"Let’s take a break from this.\"\n\n"
        f"Now, rewrite this comment: \"{comment}\""
    )
    inputs = tokenizer(prompt, return_tensors="pt", max_length=max_input_length, truncation=True).to(device)
    if do_sample:
        # Nucleus sampling (illustrative settings) so repeated calls can propose different candidates
        gen_kwargs = {"max_new_tokens": 50, "do_sample": True, "top_p": 0.9, "temperature": 0.8}
    else:
        # Deterministic beam search, matching the original single-pass behavior
        gen_kwargs = {"max_new_tokens": 50, "num_beams": 4, "early_stopping": True, "do_sample": False}
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)
    # Decode only the newly generated tokens so the prompt text is not echoed back
    generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    paraphrase = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    if paraphrase.startswith("Neutral: "):
        paraphrase = paraphrase[len("Neutral: "):].strip()
    return paraphrase
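
# Quick sanity check of the paraphraser on a single illustrative comment
# (not taken from the dataset) before running the full refinement loop.
sample_comment = "This plan is garbage and whoever wrote it is clueless."
print("Sample paraphrase:", generate_paraphrase(sample_comment))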

# RLHF-style refinement loop: propose new paraphrases and keep the highest-reward candidate
max_iterations = 5
reward_threshold = 0.2  # Target for acceptable paraphrases (based on dataset range -0.25 to 0.24)
results = []

for idx, row in df.iterrows():
    original_comment = row["Comment"]
    current_paraphrase = row["Paraphrase_Comment"]
    current_reward = row["reward_score"]
    current_scores = {
        "toxicity": row["toxicity"],
        "empathy": row["empathy"],
        "bias": row["bias"],
        "hallucination": row["hallucination"]
    }
    
    best_paraphrase = current_paraphrase
    best_reward = current_reward
    best_scores = current_scores.copy()
    
    # Iteratively refine the paraphrase
    for iteration in range(max_iterations):
        # Generate a new candidate (sampled so repeated iterations can differ)
        new_paraphrase = generate_paraphrase(original_comment, do_sample=True)
        # Evaluate the new paraphrase with the reward model
        new_reward, new_scores = reward_model(new_paraphrase, current_scores)
        
        # If the new reward is better, update the best paraphrase
        if new_reward > best_reward:
            best_paraphrase = new_paraphrase
            best_reward = new_reward
            best_scores = new_scores
        
        # Stop if the reward exceeds the threshold
        if best_reward >= reward_threshold:
            break
    
    # Store the result
    results.append({
        "Comment": original_comment,
        "Original_Paraphrase": current_paraphrase,
        "Refined_Paraphrase": best_paraphrase,
        "Original_Reward_Score": current_reward,
        "Refined_Reward_Score": best_reward,
        "Refined_Empathy": best_scores["empathy"],
        "Refined_Toxicity": best_scores["toxicity"],
        "Refined_Bias": best_scores["bias"],
        "Refined_Hallucination": best_scores["hallucination"],
        "Human_Evaluation_Reasoning": row["Human_Evaluation_Reasoning"]
    })

# Save the results to a CSV file
results_df = pd.DataFrame(results)
results_df.to_csv("refined_paraphrases.csv", index=False)
print("Refinement complete. Results saved to refined_paraphrases.csv")
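
# Optional follow-up: a rough summary of how much the refinement loop moved the
# reward scores (meaningful once the mock reward model above is replaced by the
# trained Stage 3 model).
mean_before = results_df["Original_Reward_Score"].mean()
mean_after = results_df["Refined_Reward_Score"].mean()
share_improved = (results_df["Refined_Reward_Score"] > results_df["Original_Reward_Score"]).mean()
print(f"Mean reward: {mean_before:.3f} -> {mean_after:.3f}; share improved: {share_improved:.1%}")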