# refine_paraphrases.py
from datasets import load_dataset
import pandas as pd
from paraphraser import paraphrase_comment
from metrics import compute_reward_scores
import os
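
# Local helper modules (assumed interfaces, inferred from their call sites
# below; adjust if the actual implementations differ):
#   paraphrase_comment(prompt: str) -> str
#   compute_reward_scores(original: str, paraphrase: str) -> dict with keys
#     "empathy", "toxicity", "bias", "hallucination", and "reward"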

# Configuration
DATA_PATH = "JanviMl/toxi_refined_paraphrases"
OUTPUT_PATH = "iterated_paraphrases.csv"
MAX_ITERATIONS = 1  # single refinement pass per comment; raise to refine further
TARGET_SCORES = {
    "empathy": 0.9,
    "toxicity": 0.1,
    "bias": 0.1,
    "hallucination": 0.1,
    "reward": 0.25
}
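# Empathy and reward are floors (scores must be >= target); toxicity, bias,
# and hallucination are ceilings (scores must be <= target).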

def meets_targets(scores):
    """Return True if all scores satisfy their configured targets."""
    return (scores["empathy"] >= TARGET_SCORES["empathy"] and
            scores["toxicity"] <= TARGET_SCORES["toxicity"] and
            scores["bias"] <= TARGET_SCORES["bias"] and
            scores["hallucination"] <= TARGET_SCORES["hallucination"] and
            scores["reward"] >= TARGET_SCORES["reward"])

def generate_new_paraphrase(original, current_paraphrase, current_scores, issues):
    """Prompt the paraphraser with the current attempt, its scores, and the
    human feedback, asking for a new paraphrase that meets the target scores."""
    prompt = (
        f"Original comment: '{original}'. "
        f"Current paraphrase: '{current_paraphrase}'. "
        f"Current scores: {current_scores}. "
        f"Human feedback: {issues}. "
        f"Generate a new paraphrase that improves empathy (>= {TARGET_SCORES['empathy']}), "
        f"reduces toxicity (<= {TARGET_SCORES['toxicity']}), bias (<= {TARGET_SCORES['bias']}), "
        f"and hallucination (<= {TARGET_SCORES['hallucination']}), and increases reward (>= {TARGET_SCORES['reward']})."
    )
    return paraphrase_comment(prompt)

def refine_paraphrase(row: pd.Series) -> tuple:
    """
    Iteratively refine a single paraphrase until it meets the targets or
    MAX_ITERATIONS is reached.
    Returns a tuple of (paraphrase, scores dict, reasoning string).
    """
    original = row["Comment"]
    current_paraphrase = row["Refined_Paraphrase"]
    current_scores = {
        "empathy": row["Refined_Empathy"],
        "toxicity": row["Refined_Toxicity"],
        "bias": row["Refined_Bias"],
        "hallucination": row["Refined_Hallucination"],
        "reward": row["Refined_Reward_Score"]
    }
    issues = row["Human_Evaluation_Reasoning"]
    iteration = 0
    reasoning = []
    print(f"Processing comment: {original}")

    while iteration < MAX_ITERATIONS and not meets_targets(current_scores):
        print(f"Starting iteration {iteration + 1} for comment: {original}")
        # Generate new paraphrase
        new_paraphrase = generate_new_paraphrase(original, current_paraphrase, current_scores, issues)
        print(f"Generated paraphrase: {new_paraphrase}")
        
        # Stop if paraphrasing failed (paraphrase_comment is assumed to
        # return this sentinel string on error)
        if "Error: Unable to generate paraphrase" in new_paraphrase:
            reasoning.append(f"Iteration {iteration + 1}: Paraphrasing failed - {new_paraphrase}")
            break

        # Evaluate new paraphrase
        new_scores = compute_reward_scores(original, new_paraphrase)
        print(f"New scores: {new_scores}")
        # Log reasoning
        reasoning.append(
            f"Iteration {iteration + 1}: Generated '{new_paraphrase}' with scores {new_scores}. "
            f"Previous scores {current_scores}."
        )
        # Update if improved
        if new_scores["reward"] > current_scores["reward"]:
            current_paraphrase = new_paraphrase
            current_scores = new_scores
            reasoning.append("Accepted new paraphrase due to improved reward score.")
        else:
            reasoning.append("Rejected new paraphrase; no improvement in reward score.")
        iteration += 1

    print(f"Finished processing comment: {original}")
    return current_paraphrase, current_scores, "; ".join(reasoning)

def main():
    # Load dataset from Hugging Face Hub
    try:
        df = load_dataset(DATA_PATH, split="train").to_pandas()[:1]  # Process only 1 row
    except Exception as e:
        print(f"Error loading dataset: {str(e)}")
        return

    results = []
    for _, row in df.iterrows():
        new_paraphrase, new_scores, reasoning = refine_paraphrase(row)
        result = {
            "Comment": row["Comment"],
            "Original_Paraphrase": row["Original_Paraphrase"],
            "Refined_Paraphrase": row["Refined_Paraphrase"],
            "Iterated_Paraphrase": new_paraphrase,
            "Original_Reward_Score": row["Original_Reward_Score"],
            "Refined_Reward_Score": row["Refined_Reward_Score"],
            "Iterated_Reward_Score": new_scores["reward"],
            "Iterated_Empathy": new_scores["empathy"],
            "Iterated_Toxicity": new_scores["toxicity"],
            "Iterated_Bias": new_scores["bias"],
            "Iterated_Hallucination": new_scores["hallucination"],
            "Iteration_Reasoning": reasoning
        }
        results.append(result)

    # Save results to CSV
    result_df = pd.DataFrame(results)
    result_df.to_csv(OUTPUT_PATH, index=False)
    print(f"Refinement complete. Results saved to {OUTPUT_PATH}")

    # Push to Hugging Face Hub
    try:
        dataset = load_dataset("csv", data_files=OUTPUT_PATH)
        dataset.push_to_hub("JanviMl/toxi_iterated_paraphrases", token=os.getenv("HF_TOKEN"))
        print("Pushed to Hugging Face Hub: JanviMl/toxi_iterated_paraphrases")
    except Exception as e:
        print(f"Error pushing to Hugging Face Hub: {str(e)}")

if __name__ == "__main__":
    main()
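
# Example invocation (assumes HF_TOKEN holds a Hugging Face token with write
# access to JanviMl/toxi_iterated_paraphrases):
#   HF_TOKEN=<your-token> python refine_paraphrases.py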