Spaces:
Running
on
Zero
Running
on
Zero
Jen Ben Arye
committed on
Commit
·
755d824
1
Parent(s):
85e41fb
process feel dataset for kto trainer
Browse files- ml/kto_dataset_processor.py +115 -44
ml/kto_dataset_processor.py
CHANGED
@@ -1,65 +1,136 @@
|
|
1 |
-
from datasets import
|
2 |
import pandas as pd
|
3 |
-
from
|
|
|
|
|
4 |
|
5 |
|
6 |
-
|
|
|
7 |
"""
|
8 |
-
Processes the
|
9 |
-
|
|
|
|
|
10 |
|
11 |
Returns:
|
12 |
-
dict: A dictionary containing the
|
13 |
-
Each split is a Hugging Face Dataset object.
|
14 |
"""
|
15 |
-
# Load the relevant splits of the dataset
|
16 |
-
dataset_name = "HuggingFaceH4/ultrafeedback_binarized"
|
17 |
-
train_prefs = load_dataset(dataset_name, split="train_prefs")
|
18 |
-
test_prefs = load_dataset(dataset_name, split="test_prefs")
|
19 |
|
20 |
-
#
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
data_points = []
|
23 |
-
|
24 |
-
chosen_completion = example["chosen"][1]["content"]
|
25 |
-
if chosen_completion.strip(): # Check for non-empty completions
|
26 |
-
data_points.append({
|
27 |
-
"prompt": example["prompt"],
|
28 |
-
"completion": chosen_completion.strip(),
|
29 |
-
"label": True
|
30 |
-
})
|
31 |
-
# Rejected completion
|
32 |
-
rejected_completion = example["rejected"][1]["content"]
|
33 |
-
if rejected_completion.strip(): # Check for non-empty completions
|
34 |
-
data_points.append({
|
35 |
-
"prompt": example["prompt"],
|
36 |
-
"completion": rejected_completion.strip(),
|
37 |
-
"label": False
|
38 |
-
})
|
39 |
-
return data_points
|
40 |
|
41 |
-
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
-
|
46 |
-
|
|
|
47 |
|
48 |
-
|
49 |
-
|
50 |
|
51 |
-
#
|
52 |
-
train_df =
|
53 |
-
test_df = pd.DataFrame(test_data)
|
54 |
|
|
|
|
|
|
|
55 |
|
56 |
# Convert to Hugging Face Dataset
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
-
|
61 |
|
62 |
|
63 |
if __name__ == "__main__":
|
64 |
-
kto_dataset =
|
65 |
st()
|
|
|
1 |
+
from datasets import Dataset, load_dataset
|
2 |
import pandas as pd
|
3 |
+
from sklearn.model_selection import train_test_split
|
4 |
+
import json
|
5 |
+
from ipdb import set_trace as st
|
6 |
|
7 |
|
8 |
+
|
9 |
+
def process_feel_dataset(file_path="../data/example_data.json"):
    """
    Process the feel dataset into a format suitable for KTO training with TRL.

    Args:
        file_path (str): Path to the JSON file containing the conversation
            entries. Defaults to the bundled example data, so existing callers
            that pass no argument are unaffected.

    Returns:
        dict: {'train': Dataset, 'test': Dataset} — the 70/30 split of the
        dataset in KTO format as Hugging Face Dataset objects. Each row has
        'prompt', 'completion', 'label', 'timestamp', 'session_id' and
        'conversation_id' columns.
    """
    # Load the feel dataset from JSON
    with open(file_path, "r") as file:
        feel_dataset = json.load(file)

    kto_data = []

    # Transform a single conversation entry into zero or more KTO rows.
    def transform_conversation(entry):
        conversation = entry["conversation"]
        data_points = []
        user_timestamp = None

        for i, message in enumerate(conversation):
            if message["role"] == "user":
                # NOTE(review): the timestamp is entry-level, not per-message,
                # so every row produced from this entry carries the same value.
                user_timestamp = entry["timestamp"]
            if (
                message["role"] == "assistant"
                # Only keep assistant turns with explicit feedback; .get()
                # tolerates assistant messages that carry no 'rating' key
                # instead of raising KeyError.
                and message.get("rating") in (1, -1)
            ):
                # Use the immediately preceding user message as the prompt;
                # fall back to an empty prompt when there is none.
                user_content = (
                    conversation[i - 1]["content"]
                    if i > 0 and conversation[i - 1]["role"] == "user"
                    else ""
                )
                data_points.append({
                    "prompt": user_content.strip(),
                    "completion": message["content"].strip(),
                    # True for positive feedback, False for negative
                    # (KTO Trainer format)
                    "label": message["rating"] == 1,
                    "timestamp": user_timestamp,
                    "session_id": entry["session_id"],
                    "conversation_id": entry["conversation_id"],
                })
        return data_points

    # Process all conversations in the dataset
    for entry in feel_dataset:
        kto_data.extend(transform_conversation(entry))

    # Convert to DataFrame and split into train/test sets
    # (70% train, 30% test, fixed seed for reproducibility)
    kto_df = pd.DataFrame(kto_data)
    train_df, test_df = train_test_split(kto_df, test_size=0.3, random_state=42)

    # Reset index so Dataset.from_pandas doesn't add '__index_level_0__'
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # Convert to Hugging Face Dataset
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    return {"train": train_dataset, "test": test_dataset}
|
73 |
+
|
74 |
+
|
75 |
+
|
76 |
+
|
77 |
+
# def process_dataset_ultrafeedback():
|
78 |
+
# """
|
79 |
+
# Processes the 'train_prefs' and 'test_prefs' splits of the 'HuggingFaceH4/ultrafeedback_binarized' dataset
|
80 |
+
# into a unified format for preference modeling.
|
81 |
+
|
82 |
+
# Returns:
|
83 |
+
# dict: A dictionary containing the unified 'train' and 'test' splits of the dataset in the KTO format.
|
84 |
+
# Each split is a Hugging Face Dataset object.
|
85 |
+
# """
|
86 |
+
# # Load the relevant splits of the dataset
|
87 |
+
# dataset_name = "HuggingFaceH4/ultrafeedback_binarized"
|
88 |
+
# train_prefs = load_dataset(dataset_name, split="train_prefs")
|
89 |
+
# test_prefs = load_dataset(dataset_name, split="test_prefs")
|
90 |
+
|
91 |
+
# # Function to transform a single example into the desired schema
|
92 |
+
# def transform_data(example):
|
93 |
+
# data_points = []
|
94 |
+
# # Chosen completion
|
95 |
+
# chosen_completion = example["chosen"][1]["content"]
|
96 |
+
# if chosen_completion.strip(): # Check for non-empty completions
|
97 |
+
# data_points.append({
|
98 |
+
# "prompt": example["prompt"],
|
99 |
+
# "completion": chosen_completion.strip(),
|
100 |
+
# "label": True
|
101 |
+
# })
|
102 |
+
# # Rejected completion
|
103 |
+
# rejected_completion = example["rejected"][1]["content"]
|
104 |
+
# if rejected_completion.strip(): # Check for non-empty completions
|
105 |
+
# data_points.append({
|
106 |
+
# "prompt": example["prompt"],
|
107 |
+
# "completion": rejected_completion.strip(),
|
108 |
+
# "label": False
|
109 |
+
# })
|
110 |
+
# return data_points
|
111 |
+
|
112 |
+
# # Process train and test splits
|
113 |
+
# train_data = []
|
114 |
+
# test_data = []
|
115 |
+
|
116 |
+
# for example in train_prefs:
|
117 |
+
# train_data.extend(transform_data(example))
|
118 |
+
|
119 |
+
# for example in test_prefs:
|
120 |
+
# test_data.extend(transform_data(example))
|
121 |
+
|
122 |
+
# # Convert unified data to DataFrames
|
123 |
+
# train_df = pd.DataFrame(train_data)
|
124 |
+
# test_df = pd.DataFrame(test_data)
|
125 |
+
|
126 |
+
|
127 |
+
# # Convert to Hugging Face Dataset
|
128 |
+
# unified_train = Dataset.from_pandas(train_df)
|
129 |
+
# unified_test = Dataset.from_pandas(test_df)
|
130 |
|
131 |
+
# return {"train": unified_train, "test": unified_test}
|
132 |
|
133 |
|
134 |
if __name__ == "__main__":
    # Smoke run: build the KTO dataset and report split sizes.
    # (Replaces a committed ipdb breakpoint `st()`, which would halt any
    # non-interactive invocation of this script.)
    kto_dataset = process_feel_dataset()
    print(f"train: {len(kto_dataset['train'])} rows, "
          f"test: {len(kto_dataset['test'])} rows")
|