Jen Ben Arye committed on
Commit
755d824
·
1 Parent(s): 85e41fb

process feel dataset for kto trainer

Browse files
Files changed (1) hide show
  1. ml/kto_dataset_processor.py +115 -44
ml/kto_dataset_processor.py CHANGED
@@ -1,65 +1,136 @@
1
- from datasets import load_dataset, Dataset
2
  import pandas as pd
3
- from pdb import set_trace as st
 
 
4
 
5
 
6
- def process_dataset_ultrafeedback():
 
7
  """
8
- Processes the 'train_prefs' and 'test_prefs' splits of the 'HuggingFaceH4/ultrafeedback_binarized' dataset
9
- into a unified format for preference modeling.
 
 
10
 
11
  Returns:
12
- dict: A dictionary containing the unified 'train' and 'test' splits of the dataset in the KTO format.
13
- Each split is a Hugging Face Dataset object.
14
  """
15
- # Load the relevant splits of the dataset
16
- dataset_name = "HuggingFaceH4/ultrafeedback_binarized"
17
- train_prefs = load_dataset(dataset_name, split="train_prefs")
18
- test_prefs = load_dataset(dataset_name, split="test_prefs")
19
 
20
- # Function to transform a single example into the desired schema
21
- def transform_data(example):
 
 
 
 
 
 
 
 
 
 
22
  data_points = []
23
- # Chosen completion
24
- chosen_completion = example["chosen"][1]["content"]
25
- if chosen_completion.strip(): # Check for non-empty completions
26
- data_points.append({
27
- "prompt": example["prompt"],
28
- "completion": chosen_completion.strip(),
29
- "label": True
30
- })
31
- # Rejected completion
32
- rejected_completion = example["rejected"][1]["content"]
33
- if rejected_completion.strip(): # Check for non-empty completions
34
- data_points.append({
35
- "prompt": example["prompt"],
36
- "completion": rejected_completion.strip(),
37
- "label": False
38
- })
39
- return data_points
40
 
41
- # Process train and test splits
42
- train_data = []
43
- test_data = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
- for example in train_prefs:
46
- train_data.extend(transform_data(example))
 
47
 
48
- for example in test_prefs:
49
- test_data.extend(transform_data(example))
50
 
51
- # Convert unified data to DataFrames
52
- train_df = pd.DataFrame(train_data)
53
- test_df = pd.DataFrame(test_data)
54
 
 
 
 
55
 
56
  # Convert to Hugging Face Dataset
57
- unified_train = Dataset.from_pandas(train_df)
58
- unified_test = Dataset.from_pandas(test_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
- return {"train": unified_train, "test": unified_test}
61
 
62
 
63
  if __name__ == "__main__":
64
- kto_dataset = process_dataset_ultrafeedback()
65
  st()
 
1
+ from datasets import Dataset, load_dataset
2
  import pandas as pd
3
+ from sklearn.model_selection import train_test_split
4
+ import json
5
+ from ipdb import set_trace as st
6
 
7
 
8
+
9
def process_feel_dataset(file_path="../data/example_data.json"):
    """
    Process the feel dataset into a format suitable for KTO training with TRL.

    Args:
        file_path (str): Path to a JSON file holding a list of conversation
            entries. Each entry is expected to provide "conversation",
            "timestamp", "session_id" and "conversation_id" keys — TODO
            confirm against the producer of example_data.json.

    Returns:
        dict: {'train': Dataset, 'test': Dataset} — 70/30 train/test splits
        in KTO format (prompt/completion/label plus metadata columns), as
        Hugging Face Dataset objects.
    """
    # Load the raw feel dataset from disk.
    with open(file_path, "r") as file:
        feel_dataset = json.load(file)

    def transform_conversation(entry):
        """Turn one conversation entry into a list of KTO data points."""
        conversation = entry["conversation"]
        data_points = []
        # NOTE(review): the entry-level timestamp is reused for every rated
        # assistant turn — there is no per-message timestamp in this data.
        user_timestamp = None

        for i, message in enumerate(conversation):
            if message["role"] == "user":
                user_timestamp = entry["timestamp"]
            # Only assistant turns with explicit positive/negative feedback
            # become training examples; .get() tolerates unrated messages
            # instead of raising KeyError.
            if message["role"] == "assistant" and message.get("rating") in (1, -1):
                # The prompt is the immediately preceding user turn, if any.
                if i > 0 and conversation[i - 1]["role"] == "user":
                    user_content = conversation[i - 1]["content"]
                else:
                    user_content = ""
                data_points.append({
                    "prompt": user_content.strip(),
                    "completion": message["content"].strip(),
                    # KTO Trainer format: True for positive feedback, False for negative.
                    "label": message["rating"] == 1,
                    "timestamp": user_timestamp,
                    "session_id": entry["session_id"],
                    "conversation_id": entry["conversation_id"],
                })
        return data_points

    # Flatten all conversations into one list of KTO examples.
    kto_data = []
    for entry in feel_dataset:
        kto_data.extend(transform_conversation(entry))

    kto_df = pd.DataFrame(kto_data)

    # 70/30 split; fixed seed keeps the split reproducible across runs.
    train_df, test_df = train_test_split(kto_df, test_size=0.3, random_state=42)

    # Reset index so Dataset.from_pandas does not add an '__index_level_0__' column.
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    return {
        "train": Dataset.from_pandas(train_df),
        "test": Dataset.from_pandas(test_df),
    }
73
+
74
+
75
+
76
+
77
+ # def process_dataset_ultrafeedback():
78
+ # """
79
+ # Processes the 'train_prefs' and 'test_prefs' splits of the 'HuggingFaceH4/ultrafeedback_binarized' dataset
80
+ # into a unified format for preference modeling.
81
+
82
+ # Returns:
83
+ # dict: A dictionary containing the unified 'train' and 'test' splits of the dataset in the KTO format.
84
+ # Each split is a Hugging Face Dataset object.
85
+ # """
86
+ # # Load the relevant splits of the dataset
87
+ # dataset_name = "HuggingFaceH4/ultrafeedback_binarized"
88
+ # train_prefs = load_dataset(dataset_name, split="train_prefs")
89
+ # test_prefs = load_dataset(dataset_name, split="test_prefs")
90
+
91
+ # # Function to transform a single example into the desired schema
92
+ # def transform_data(example):
93
+ # data_points = []
94
+ # # Chosen completion
95
+ # chosen_completion = example["chosen"][1]["content"]
96
+ # if chosen_completion.strip(): # Check for non-empty completions
97
+ # data_points.append({
98
+ # "prompt": example["prompt"],
99
+ # "completion": chosen_completion.strip(),
100
+ # "label": True
101
+ # })
102
+ # # Rejected completion
103
+ # rejected_completion = example["rejected"][1]["content"]
104
+ # if rejected_completion.strip(): # Check for non-empty completions
105
+ # data_points.append({
106
+ # "prompt": example["prompt"],
107
+ # "completion": rejected_completion.strip(),
108
+ # "label": False
109
+ # })
110
+ # return data_points
111
+
112
+ # # Process train and test splits
113
+ # train_data = []
114
+ # test_data = []
115
+
116
+ # for example in train_prefs:
117
+ # train_data.extend(transform_data(example))
118
+
119
+ # for example in test_prefs:
120
+ # test_data.extend(transform_data(example))
121
+
122
+ # # Convert unified data to DataFrames
123
+ # train_df = pd.DataFrame(train_data)
124
+ # test_df = pd.DataFrame(test_data)
125
+
126
+
127
+ # # Convert to Hugging Face Dataset
128
+ # unified_train = Dataset.from_pandas(train_df)
129
+ # unified_test = Dataset.from_pandas(test_df)
130
 
131
+ # return {"train": unified_train, "test": unified_test}
132
 
133
 
134
if __name__ == "__main__":
    # Build the KTO-formatted train/test splits, then drop into the
    # debugger so the result can be inspected interactively.
    kto_dataset = process_feel_dataset()
    st()