Spaces:
Running
on
Zero
Running
on
Zero
Jen Ben Arye
committed on
Commit
·
09e9f82
1
Parent(s):
36b0fc6
debugged kto dataset processor
Browse files — ml/kto_dataset_processor.py +53 -58
ml/kto_dataset_processor.py
CHANGED
@@ -3,18 +3,8 @@ import pandas as pd
|
|
3 |
from sklearn.model_selection import train_test_split
|
4 |
import json
|
5 |
from ipdb import set_trace as st
|
6 |
-
import tiktoken
|
7 |
from transformers import AutoTokenizer
|
8 |
|
9 |
-
def count_tokens(text: str, model_name: str) -> int:
|
10 |
-
"""Count tokens in text using model's tokenizer"""
|
11 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
12 |
-
return len(tokenizer.encode(text))
|
13 |
-
|
14 |
-
def format_conversation(messages: list, model_name: str) -> str:
|
15 |
-
"""Format messages using model's chat template"""
|
16 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
17 |
-
return tokenizer.apply_chat_template(messages, tokenize=False)
|
18 |
|
19 |
def transform_conversation(
|
20 |
entry: dict,
|
@@ -25,37 +15,59 @@ def transform_conversation(
|
|
25 |
"""Transform conversation into KTO format with history"""
|
26 |
data_points = []
|
27 |
conversation = entry["conversation"]
|
|
|
28 |
|
29 |
for i, message in enumerate(conversation):
|
30 |
-
# Only
|
31 |
if message["role"] != "assistant" or message["rating"] not in [1, -1]:
|
32 |
continue
|
33 |
|
34 |
# Get previous messages up to limits
|
35 |
-
|
|
|
36 |
tokens = 0
|
37 |
-
|
38 |
-
|
39 |
-
# Start from
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
data_points.append({
|
58 |
-
"prompt":
|
59 |
"completion": message["content"].strip(),
|
60 |
"label": message["rating"] == 1,
|
61 |
"timestamp": entry["timestamp"],
|
@@ -66,7 +78,7 @@ def transform_conversation(
|
|
66 |
return data_points
|
67 |
|
68 |
def process_feel_dataset(
|
69 |
-
model_name: str = "
|
70 |
max_history_turns: int = 10,
|
71 |
max_history_tokens: int = 4000
|
72 |
):
|
@@ -145,28 +157,11 @@ if __name__ == "__main__":
|
|
145 |
print("\nSample entries from processed KTO dataset:")
|
146 |
print("\n" + "="*80 + "\nTRAIN SET SAMPLES\n" + "="*80)
|
147 |
|
148 |
-
#
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
# else:
|
157 |
-
# print(f"{value}")
|
158 |
-
# print("\n" + "-"*80)
|
159 |
-
|
160 |
-
# print("\n" + "="*80 + "\nTEST SET SAMPLES\n" + "="*80)
|
161 |
-
|
162 |
-
# for i, example in enumerate(datasets['test'].select(range(min(3, len(datasets['test']))))):
|
163 |
-
# print(f"\nEntry #{i+1}:")
|
164 |
-
# print("-" * 40)
|
165 |
-
# for field, value in example.items():
|
166 |
-
# print(f"\n{field}:")
|
167 |
-
# if isinstance(value, str):
|
168 |
-
# # Print strings with line breaks for better readability
|
169 |
-
# print(f"{value}")
|
170 |
-
# else:
|
171 |
-
# print(f"{value}")
|
172 |
-
# print("\n" + "-"*80)
|
|
|
3 |
from sklearn.model_selection import train_test_split
|
4 |
import json
|
5 |
from ipdb import set_trace as st
|
|
|
6 |
from transformers import AutoTokenizer
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
def transform_conversation(
|
10 |
entry: dict,
|
|
|
15 |
"""Transform conversation into KTO format with history"""
|
16 |
data_points = []
|
17 |
conversation = entry["conversation"]
|
18 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
19 |
|
20 |
for i, message in enumerate(conversation):
|
21 |
+
# Only create data points for assistant messages that have ratings
|
22 |
if message["role"] != "assistant" or message["rating"] not in [1, -1]:
|
23 |
continue
|
24 |
|
25 |
# Get previous messages up to limits
|
26 |
+
formatted_history = []
|
27 |
+
formatted_prompt = ""
|
28 |
tokens = 0
|
29 |
+
pairs = 0 # Count complete user/assistant pairs
|
30 |
+
|
31 |
+
# Start from the current message and work backwards
|
32 |
+
current_idx = i - 1
|
33 |
+
while current_idx >= 0 and pairs < max_history_turns:
|
34 |
+
# We need both user and assistant messages to form a pair
|
35 |
+
if current_idx > 0 and conversation[current_idx]["role"] == "user" and conversation[current_idx-1]["role"] == "assistant":
|
36 |
+
# Add the pair to history
|
37 |
+
formatted_history.insert(0, conversation[current_idx-1]) # assistant
|
38 |
+
formatted_history.insert(1, conversation[current_idx]) # user
|
39 |
+
|
40 |
+
# Check token limit
|
41 |
+
try:
|
42 |
+
current_formatted = tokenizer.apply_chat_template(formatted_history, tokenize=False)
|
43 |
+
current_tokens = len(tokenizer.encode(current_formatted))
|
44 |
+
|
45 |
+
if current_tokens > max_history_tokens:
|
46 |
+
formatted_history = formatted_history[2:] # Remove the oldest pair
|
47 |
+
break
|
48 |
+
|
49 |
+
formatted_prompt = current_formatted
|
50 |
+
tokens = current_tokens
|
51 |
+
pairs += 1
|
52 |
+
current_idx -= 2
|
53 |
+
except Exception:
|
54 |
+
# If template application fails, remove the last added pair
|
55 |
+
formatted_history = formatted_history[2:]
|
56 |
+
break
|
57 |
+
else:
|
58 |
+
current_idx -= 1
|
59 |
+
|
60 |
+
# Add the final user message that prompted the rated response
|
61 |
+
if i > 0 and conversation[i-1]["role"] == "user":
|
62 |
+
last_history = formatted_history + [conversation[i-1]]
|
63 |
+
try:
|
64 |
+
formatted_prompt = tokenizer.apply_chat_template(last_history, tokenize=False)
|
65 |
+
except Exception:
|
66 |
+
# If template application fails, use the previous valid prompt
|
67 |
+
pass
|
68 |
|
69 |
data_points.append({
|
70 |
+
"prompt": formatted_prompt.strip(),
|
71 |
"completion": message["content"].strip(),
|
72 |
"label": message["rating"] == 1,
|
73 |
"timestamp": entry["timestamp"],
|
|
|
78 |
return data_points
|
79 |
|
80 |
def process_feel_dataset(
|
81 |
+
model_name: str = "CohereForAI/aya-expanse-8b",
|
82 |
max_history_turns: int = 10,
|
83 |
max_history_tokens: int = 4000
|
84 |
):
|
|
|
157 |
print("\nSample entries from processed KTO dataset:")
|
158 |
print("\n" + "="*80 + "\nTRAIN SET SAMPLES\n" + "="*80)
|
159 |
|
160 |
+
# Export datasets to CSV
|
161 |
+
train_df = datasets['train'].to_pandas()
|
162 |
+
test_df = datasets['test'].to_pandas()
|
163 |
+
|
164 |
+
train_df.to_csv('kto_train_dataset.csv', index=False)
|
165 |
+
test_df.to_csv('kto_test_dataset.csv', index=False)
|
166 |
+
|
167 |
+
print("\nDatasets exported to 'kto_train_dataset.csv' and 'kto_test_dataset.csv'")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|