toxic-comment-classifier_rlhf

Sleeping

App Files Community

JanviMl committed on Apr 19

Commit

77f7351

verified ·

1 Parent(s): 5094855

Update refine_paraphrases.py

Browse files

Files changed (1) hide show

refine_paraphrases.py +36 -36

refine_paraphrases.py CHANGED Viewed

@@ -1,15 +1,14 @@
 # refine_paraphrases.py
 import pandas as pd
 from paraphraser import paraphrase_comment
-from classifier import compute_reward_scores
-from model_loader import paraphraser_model
-from datasets import load_dataset
 import os
 # Configuration
 DATA_PATH = "JanviMl/toxi_refined_paraphrases"
 OUTPUT_PATH = "iterated_paraphrases.csv"
-MAX_ITERATIONS = 3
 TARGET_SCORES = {
     "empathy": 0.9,
     "toxicity": 0.1,
@@ -18,33 +17,25 @@ TARGET_SCORES = {
     "reward": 0.25
 }
-def generate_new_paraphrase(original: str, current: str, scores: dict, issues: str) -> str:
-    """
-    Generate a new paraphrase using the Granite 3.2-2B-Instruct model.
-    Incorporates feedback from human evaluation and reward scores.
-    """
     prompt = (
-        f"You are a content moderator refining a comment paraphrase to be emotionally intelligent, non-toxic, and professional while retaining the original intent. "
-        f"The original comment is: \"{original}\". The current paraphrase is: \"{current}\". "
-        f"Reward model scores: Empathy={scores['empathy']}, Toxicity={scores['toxicity']}, Bias={scores['bias']}, Hallucination={scores['hallucination']}, Overall={scores['reward']}. "
-        f"Issues: {issues}. "
-        f"Guidelines: Remove hate speech, keep neutral/professional tone, retain intent, match brevity. "
-        f"Generate a new paraphrase that improves empathy (≥0.9), reduces toxicity/bias/hallucination (≤0.1), and increases reward score (≥0.25)."
     )
     return paraphrase_comment(prompt)
-def meets_targets(scores: dict) -> bool:
-    """
-    Check if paraphrase scores meet target thresholds.
-    """
-    return (
-        scores["empathy"] >= TARGET_SCORES["empathy"] and
-        scores["toxicity"] <= TARGET_SCORES["toxicity"] and
-        scores["bias"] <= TARGET_SCORES["bias"] and
-        scores["hallucination"] <= TARGET_SCORES["hallucination"] and
-        scores["reward"] >= TARGET_SCORES["reward"]
-    )
 def refine_paraphrase(row: pd.Series) -> tuple:
     """
     Iteratively refine a single paraphrase.
@@ -62,12 +53,22 @@ def refine_paraphrase(row: pd.Series) -> tuple:
     issues = row["Human_Evaluation_Reasoning"]
     iteration = 0
     reasoning = []
     while iteration < MAX_ITERATIONS and not meets_targets(current_scores):
         # Generate new paraphrase
         new_paraphrase = generate_new_paraphrase(original, current_paraphrase, current_scores, issues)
         # Evaluate new paraphrase
         new_scores = compute_reward_scores(original, new_paraphrase)
         # Log reasoning
         reasoning.append(
             f"Iteration {iteration + 1}: Generated '{new_paraphrase}' with scores {new_scores}. "
@@ -82,17 +83,17 @@ def refine_paraphrase(row: pd.Series) -> tuple:
             reasoning.append("Rejected new paraphrase; no improvement in reward score.")
         iteration += 1
     return current_paraphrase, current_scores, "; ".join(reasoning)
 def main():
     # Load dataset from Hugging Face Hub
     try:
-        df = load_dataset(DATA_PATH, split="train").to_pandas()
     except Exception as e:
         print(f"Error loading dataset: {str(e)}")
         return
-    # Process each row
     results = []
     for idx, row in df.iterrows():
         new_paraphrase, new_scores, reasoning = refine_paraphrase(row)
@@ -111,20 +112,19 @@ def main():
             "Iteration_Reasoning": reasoning
         }
         results.append(result)
-    # Save results locally
     result_df = pd.DataFrame(results)
     result_df.to_csv(OUTPUT_PATH, index=False)
     print(f"Refinement complete. Results saved to {OUTPUT_PATH}")
     # Push to Hugging Face Hub
     try:
-        from datasets import Dataset
-        dataset = Dataset.from_pandas(result_df)
         dataset.push_to_hub("JanviMl/toxi_iterated_paraphrases", token=os.getenv("HF_TOKEN"))
         print("Pushed to Hugging Face Hub: JanviMl/toxi_iterated_paraphrases")
     except Exception as e:
-        print(f"Error pushing to Hub: {str(e)}")
 if __name__ == "__main__":
     main()

 # refine_paraphrases.py
+from datasets import load_dataset
 import pandas as pd
 from paraphraser import paraphrase_comment
+from metrics import compute_reward_scores
 import os
 # Configuration
 DATA_PATH = "JanviMl/toxi_refined_paraphrases"
 OUTPUT_PATH = "iterated_paraphrases.csv"
+MAX_ITERATIONS = 1
 TARGET_SCORES = {
     "empathy": 0.9,
     "toxicity": 0.1,
     "reward": 0.25
 }
+def meets_targets(scores):
+    return (scores["empathy"] >= TARGET_SCORES["empathy"] and
+            scores["toxicity"] <= TARGET_SCORES["toxicity"] and
+            scores["bias"] <= TARGET_SCORES["bias"] and
+            scores["hallucination"] <= TARGET_SCORES["hallucination"] and
+            scores["reward"] >= TARGET_SCORES["reward"])
+def generate_new_paraphrase(original, current_paraphrase, current_scores, issues):
     prompt = (
+        f"Original comment: '{original}'. "
+        f"Current paraphrase: '{current_paraphrase}'. "
+        f"Current scores: {current_scores}. "
+        f"Human feedback: {issues}. "
+        f"Generate a new paraphrase that improves empathy (>= {TARGET_SCORES['empathy']}), "
+        f"reduces toxicity (<= {TARGET_SCORES['toxicity']}), bias (<= {TARGET_SCORES['bias']}), "
+        f"and hallucination (<= {TARGET_SCORES['hallucination']}), and increases reward (>= {TARGET_SCORES['reward']})."
     )
     return paraphrase_comment(prompt)
 def refine_paraphrase(row: pd.Series) -> tuple:
     """
     Iteratively refine a single paraphrase.
     issues = row["Human_Evaluation_Reasoning"]
     iteration = 0
     reasoning = []
+    print(f"Processing comment: {original}")
     while iteration < MAX_ITERATIONS and not meets_targets(current_scores):
+        print(f"Starting iteration {iteration + 1} for comment: {original}")
         # Generate new paraphrase
         new_paraphrase = generate_new_paraphrase(original, current_paraphrase, current_scores, issues)
+        print(f"Generated paraphrase: {new_paraphrase}")
+        # Check if paraphrasing failed
+        if "Error: Unable to generate paraphrase" in new_paraphrase:
+            reasoning.append(f"Iteration {iteration + 1}: Paraphrasing failed - {new_paraphrase}")
+            break
         # Evaluate new paraphrase
         new_scores = compute_reward_scores(original, new_paraphrase)
+        print(f"New scores: {new_scores}")
         # Log reasoning
         reasoning.append(
             f"Iteration {iteration + 1}: Generated '{new_paraphrase}' with scores {new_scores}. "
             reasoning.append("Rejected new paraphrase; no improvement in reward score.")
         iteration += 1
+    print(f"Finished processing comment: {original}")
     return current_paraphrase, current_scores, "; ".join(reasoning)
 def main():
     # Load dataset from Hugging Face Hub
     try:
+        df = load_dataset(DATA_PATH, split="train").to_pandas()[:1]  # Process only 1 row
     except Exception as e:
         print(f"Error loading dataset: {str(e)}")
         return
     results = []
     for idx, row in df.iterrows():
         new_paraphrase, new_scores, reasoning = refine_paraphrase(row)
             "Iteration_Reasoning": reasoning
         }
         results.append(result)
+    # Save results to CSV
     result_df = pd.DataFrame(results)
     result_df.to_csv(OUTPUT_PATH, index=False)
     print(f"Refinement complete. Results saved to {OUTPUT_PATH}")
     # Push to Hugging Face Hub
     try:
+        dataset = load_dataset("pandas", data_files=OUTPUT_PATH)
         dataset.push_to_hub("JanviMl/toxi_iterated_paraphrases", token=os.getenv("HF_TOKEN"))
         print("Pushed to Hugging Face Hub: JanviMl/toxi_iterated_paraphrases")
     except Exception as e:
+        print(f"Error pushing to Hugging Face Hub: {str(e)}")
 if __name__ == "__main__":
     main()