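"""Iteratively refine toxic-comment paraphrases until they meet target scores.

Loads refined paraphrases from the Hugging Face Hub, regenerates any that
miss the targets (guided by human evaluation feedback), saves the results
to CSV, and pushes them back to the Hub.
"""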
import os

import pandas as pd
from datasets import load_dataset

from metrics import compute_reward_scores
from paraphraser import paraphrase_comment

DATA_PATH = "JanviMl/toxi_refined_paraphrases"
OUTPUT_PATH = "iterated_paraphrases.csv"
MAX_ITERATIONS = 1

# Empathy and reward are floors; toxicity, bias, and hallucination are ceilings.
TARGET_SCORES = {
    "empathy": 0.9,
    "toxicity": 0.1,
    "bias": 0.1,
    "hallucination": 0.1,
    "reward": 0.25,
}


def meets_targets(scores):
    """Check whether a score dict satisfies every target threshold."""
    return (scores["empathy"] >= TARGET_SCORES["empathy"] and
            scores["toxicity"] <= TARGET_SCORES["toxicity"] and
            scores["bias"] <= TARGET_SCORES["bias"] and
            scores["hallucination"] <= TARGET_SCORES["hallucination"] and
            scores["reward"] >= TARGET_SCORES["reward"])


def generate_new_paraphrase(original, current_paraphrase, current_scores, issues):
    """Ask the paraphraser for a new attempt, embedding current scores and feedback in the prompt."""
    prompt = (
        f"Original comment: '{original}'. "
        f"Current paraphrase: '{current_paraphrase}'. "
        f"Current scores: {current_scores}. "
        f"Human feedback: {issues}. "
        f"Generate a new paraphrase that improves empathy (>= {TARGET_SCORES['empathy']}), "
        f"reduces toxicity (<= {TARGET_SCORES['toxicity']}), bias (<= {TARGET_SCORES['bias']}), "
        f"and hallucination (<= {TARGET_SCORES['hallucination']}), and increases reward (>= {TARGET_SCORES['reward']})."
    )
    return paraphrase_comment(prompt)


def refine_paraphrase(row: pd.Series) -> tuple:
    """
    Iteratively refine a single paraphrase.
    Returns the new paraphrase, its scores, and the iteration reasoning.
    """
    original = row["Comment"]
    current_paraphrase = row["Refined_Paraphrase"]
    current_scores = {
        "empathy": row["Refined_Empathy"],
        "toxicity": row["Refined_Toxicity"],
        "bias": row["Refined_Bias"],
        "hallucination": row["Refined_Hallucination"],
        "reward": row["Refined_Reward_Score"],
    }
    issues = row["Human_Evaluation_Reasoning"]
    iteration = 0
    reasoning = []
    print(f"Processing comment: {original}")

    while iteration < MAX_ITERATIONS and not meets_targets(current_scores):
        print(f"Starting iteration {iteration + 1} for comment: {original}")

        new_paraphrase = generate_new_paraphrase(original, current_paraphrase, current_scores, issues)
        print(f"Generated paraphrase: {new_paraphrase}")

        if "Error: Unable to generate paraphrase" in new_paraphrase:
            reasoning.append(f"Iteration {iteration + 1}: Paraphrasing failed - {new_paraphrase}")
            break

        new_scores = compute_reward_scores(original, new_paraphrase)
        print(f"New scores: {new_scores}")

        reasoning.append(
            f"Iteration {iteration + 1}: Generated '{new_paraphrase}' with scores {new_scores}. "
            f"Previous scores {current_scores}."
        )

        # Keep the new paraphrase only if it improves the overall reward score.
        if new_scores["reward"] > current_scores["reward"]:
            current_paraphrase = new_paraphrase
            current_scores = new_scores
            reasoning.append("Accepted new paraphrase due to improved reward score.")
        else:
            reasoning.append("Rejected new paraphrase; no improvement in reward score.")

        # Count every attempt, accepted or not, so the loop always terminates.
        iteration += 1

    print(f"Finished processing comment: {original}")
    return current_paraphrase, current_scores, "; ".join(reasoning)


def main():
    """Load the dataset, refine each paraphrase, and publish the results."""
    try:
        # Note: [:1] restricts processing to the first row only.
        df = load_dataset(DATA_PATH, split="train").to_pandas()[:1]
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return
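
    # Refine each row and collect the iterated paraphrase with its scores.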
    results = []
    for _, row in df.iterrows():
        new_paraphrase, new_scores, reasoning = refine_paraphrase(row)
        result = {
            "Comment": row["Comment"],
            "Original_Paraphrase": row["Original_Paraphrase"],
            "Refined_Paraphrase": row["Refined_Paraphrase"],
            "Iterated_Paraphrase": new_paraphrase,
            "Original_Reward_Score": row["Original_Reward_Score"],
            "Refined_Reward_Score": row["Refined_Reward_Score"],
            "Iterated_Reward_Score": new_scores["reward"],
            "Iterated_Empathy": new_scores["empathy"],
            "Iterated_Toxicity": new_scores["toxicity"],
            "Iterated_Bias": new_scores["bias"],
            "Iterated_Hallucination": new_scores["hallucination"],
            "Iteration_Reasoning": reasoning,
        }
        results.append(result)
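
    # Persist the results locally before attempting the Hub upload.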
    result_df = pd.DataFrame(results)
    result_df.to_csv(OUTPUT_PATH, index=False)
    print(f"Refinement complete. Results saved to {OUTPUT_PATH}")
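
    # Push the CSV back to the Hub; requires an HF_TOKEN environment variable.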
    try:
        # OUTPUT_PATH is a CSV file, so load it with the "csv" builder.
        dataset = load_dataset("csv", data_files=OUTPUT_PATH)
        dataset.push_to_hub("JanviMl/toxi_iterated_paraphrases", token=os.getenv("HF_TOKEN"))
        print("Pushed to Hugging Face Hub: JanviMl/toxi_iterated_paraphrases")
    except Exception as e:
        print(f"Error pushing to Hugging Face Hub: {e}")


if __name__ == "__main__":
    main()