# refine_paraphrases.py
from datasets import load_dataset
import pandas as pd
from paraphraser import paraphrase_comment
from metrics import compute_reward_scores
import os
# Configuration
DATA_PATH = "JanviMl/toxi_refined_paraphrases"  # HF Hub dataset holding the refined paraphrases to iterate on
OUTPUT_PATH = "iterated_paraphrases.csv"  # local CSV destination for the iterated results
MAX_ITERATIONS = 1  # maximum refinement passes per comment
# Thresholds a paraphrase must satisfy for refinement to stop early.
# empathy/reward are minimums (>=); toxicity/bias/hallucination are maximums (<=).
TARGET_SCORES = {
    "empathy": 0.9,
    "toxicity": 0.1,
    "bias": 0.1,
    "hallucination": 0.1,
    "reward": 0.25
}
def meets_targets(scores):
    """Return True when every metric in *scores* satisfies its TARGET_SCORES threshold.

    empathy and reward must be at least their targets; toxicity, bias and
    hallucination must be at most theirs.
    """
    floors = ("empathy", "reward")
    ceilings = ("toxicity", "bias", "hallucination")
    hits_floors = all(scores[key] >= TARGET_SCORES[key] for key in floors)
    hits_ceilings = all(scores[key] <= TARGET_SCORES[key] for key in ceilings)
    return hits_floors and hits_ceilings
def generate_new_paraphrase(original, current_paraphrase, current_scores, issues):
    """Ask the paraphraser for an improved rewrite.

    Builds a prompt embedding the original comment, the current paraphrase,
    its scores, the human feedback, and the numeric targets from
    TARGET_SCORES, then delegates generation to paraphrase_comment.
    """
    pieces = [
        f"Original comment: '{original}'. ",
        f"Current paraphrase: '{current_paraphrase}'. ",
        f"Current scores: {current_scores}. ",
        f"Human feedback: {issues}. ",
        f"Generate a new paraphrase that improves empathy (>= {TARGET_SCORES['empathy']}), ",
        f"reduces toxicity (<= {TARGET_SCORES['toxicity']}), bias (<= {TARGET_SCORES['bias']}), ",
        f"and hallucination (<= {TARGET_SCORES['hallucination']}), and increases reward (>= {TARGET_SCORES['reward']}).",
    ]
    return paraphrase_comment("".join(pieces))
def refine_paraphrase(row: pd.Series) -> tuple:
    """
    Iteratively refine a single paraphrase.
    Returns new paraphrase, scores, and reasoning.
    """
    original = row["Comment"]
    best_paraphrase = row["Refined_Paraphrase"]
    best_scores = {
        "empathy": row["Refined_Empathy"],
        "toxicity": row["Refined_Toxicity"],
        "bias": row["Refined_Bias"],
        "hallucination": row["Refined_Hallucination"],
        "reward": row["Refined_Reward_Score"]
    }
    feedback = row["Human_Evaluation_Reasoning"]
    log = []
    print(f"Processing comment: {original}")
    # Up to MAX_ITERATIONS passes; stop early once all score targets are met.
    for attempt in range(1, MAX_ITERATIONS + 1):
        if meets_targets(best_scores):
            break
        print(f"Starting iteration {attempt} for comment: {original}")
        # Generate a candidate paraphrase from the current best + feedback.
        candidate = generate_new_paraphrase(original, best_paraphrase, best_scores, feedback)
        print(f"Generated paraphrase: {candidate}")
        # Bail out if the paraphraser reported a failure.
        if "Error: Unable to generate paraphrase" in candidate:
            log.append(f"Iteration {attempt}: Paraphrasing failed - {candidate}")
            break
        # Score the candidate against the original comment.
        candidate_scores = compute_reward_scores(original, candidate)
        print(f"New scores: {candidate_scores}")
        log.append(
            f"Iteration {attempt}: Generated '{candidate}' with scores {candidate_scores}. "
            f"Previous scores {best_scores}."
        )
        # Keep the candidate only if its reward strictly improves.
        if candidate_scores["reward"] > best_scores["reward"]:
            best_paraphrase = candidate
            best_scores = candidate_scores
            log.append("Accepted new paraphrase due to improved reward score.")
        else:
            log.append("Rejected new paraphrase; no improvement in reward score.")
    print(f"Finished processing comment: {original}")
    return best_paraphrase, best_scores, "; ".join(log)
def main():
    """Load the refined-paraphrase dataset, iterate each row once through
    refine_paraphrase, save the results to CSV, and push them to the Hub.

    Requires the HF_TOKEN environment variable for the push step.
    """
    # Load dataset from Hugging Face Hub
    try:
        df = load_dataset(DATA_PATH, split="train").to_pandas()[:1]  # Process only 1 row
    except Exception as e:
        print(f"Error loading dataset: {str(e)}")
        return
    results = []
    for _, row in df.iterrows():  # index unused
        new_paraphrase, new_scores, reasoning = refine_paraphrase(row)
        result = {
            "Comment": row["Comment"],
            "Original_Paraphrase": row["Original_Paraphrase"],
            "Refined_Paraphrase": row["Refined_Paraphrase"],
            "Iterated_Paraphrase": new_paraphrase,
            "Original_Reward_Score": row["Original_Reward_Score"],
            "Refined_Reward_Score": row["Refined_Reward_Score"],
            "Iterated_Reward_Score": new_scores["reward"],
            "Iterated_Empathy": new_scores["empathy"],
            "Iterated_Toxicity": new_scores["toxicity"],
            "Iterated_Bias": new_scores["bias"],
            "Iterated_Hallucination": new_scores["hallucination"],
            "Iteration_Reasoning": reasoning
        }
        results.append(result)
    # Save results to CSV
    result_df = pd.DataFrame(results)
    result_df.to_csv(OUTPUT_PATH, index=False)
    print(f"Refinement complete. Results saved to {OUTPUT_PATH}")
    # Push to Hugging Face Hub.
    # BUG FIX: the original used the "pandas" packaged builder, which expects
    # pickled DataFrames (pd.read_pickle) and cannot read the CSV we just
    # wrote; the "csv" builder is the correct loader for OUTPUT_PATH.
    try:
        dataset = load_dataset("csv", data_files=OUTPUT_PATH)
        dataset.push_to_hub("JanviMl/toxi_iterated_paraphrases", token=os.getenv("HF_TOKEN"))
        print("Pushed to Hugging Face Hub: JanviMl/toxi_iterated_paraphrases")
    except Exception as e:
        print(f"Error pushing to Hugging Face Hub: {str(e)}")
# Script entry point. BUG FIX: removed a stray trailing "|" after main()
# that made the file a syntax error.
if __name__ == "__main__":
    main()