Spaces:
Running
on
Zero
Running
on
Zero
Jen Ben Arye
committed on
Commit
·
d151abe
1
Parent(s):
78757b7
added filtering by language
Browse files- ml/kto_dataset_processor.py +52 -9
ml/kto_dataset_processor.py
CHANGED
@@ -4,7 +4,21 @@ from sklearn.model_selection import train_test_split
|
|
4 |
import json
|
5 |
from ipdb import set_trace as st
|
6 |
from transformers import AutoTokenizer
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
def transform_conversation(
|
10 |
entry: dict,
|
@@ -72,12 +86,14 @@ def transform_conversation(
|
|
72 |
"label": message["rating"] == 1,
|
73 |
"timestamp": entry["timestamp"],
|
74 |
"session_id": entry["session_id"],
|
75 |
-
"conversation_id": entry["conversation_id"]
|
|
|
76 |
})
|
77 |
|
78 |
return data_points
|
79 |
|
80 |
def process_feel_dataset(
|
|
|
81 |
model_name: str = "CohereForAI/aya-expanse-8b",
|
82 |
max_history_turns: int = 10,
|
83 |
max_history_tokens: int = 4000
|
@@ -86,18 +102,43 @@ def process_feel_dataset(
|
|
86 |
Processes the feel dataset into a format suitable for KTO training using TRL.
|
87 |
|
88 |
Args:
|
|
|
89 |
model_name: Name of the model to format for
|
90 |
max_history_turns: Maximum number of previous turns to include in history
|
91 |
max_history_tokens: Maximum number of tokens allowed in history
|
92 |
|
93 |
Returns:
|
94 |
dict: A dictionary containing the 'train' and 'test' splits of the dataset in KTO format
|
|
|
|
|
|
|
95 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
# Load feel dataset from HuggingFace
|
97 |
feel_dataset = load_dataset("feel-fl/feel-feedback")["train"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
kto_data = []
|
99 |
|
100 |
-
# Process all conversations in the dataset
|
101 |
for entry in feel_dataset:
|
102 |
kto_data.extend(transform_conversation(
|
103 |
entry,
|
@@ -106,6 +147,9 @@ def process_feel_dataset(
|
|
106 |
max_history_tokens
|
107 |
))
|
108 |
|
|
|
|
|
|
|
109 |
# Convert to DataFrame
|
110 |
kto_df = pd.DataFrame(kto_data)
|
111 |
|
@@ -120,16 +164,15 @@ def process_feel_dataset(
|
|
120 |
train_dataset = Dataset.from_pandas(train_df)
|
121 |
test_dataset = Dataset.from_pandas(test_df)
|
122 |
|
|
|
|
|
|
|
|
|
123 |
return {"train": train_dataset, "test": test_dataset}
|
124 |
|
125 |
if __name__ == "__main__":
|
126 |
# Process the dataset
|
127 |
-
datasets = process_feel_dataset()
|
128 |
-
|
129 |
-
# Print basic statistics
|
130 |
-
print("\nDataset Statistics:")
|
131 |
-
print(f"Train set size: {len(datasets['train'])}")
|
132 |
-
print(f"Test set size: {len(datasets['test'])}")
|
133 |
|
134 |
# Print distribution of positive/negative labels
|
135 |
train_labels = datasets['train']['label']
|
|
|
4 |
import json
|
5 |
from ipdb import set_trace as st
|
6 |
from transformers import AutoTokenizer
|
7 |
+
from enum import Enum
|
8 |
+
|
9 |
+
class SupportedLanguages(str, Enum):
|
10 |
+
"""Enumeration of supported languages"""
|
11 |
+
ENGLISH = "English"
|
12 |
+
DUTCH = "Dutch"
|
13 |
+
ITALIAN = "Italian"
|
14 |
+
SPANISH = "Spanish"
|
15 |
+
FRENCH = "French"
|
16 |
+
GERMAN = "German"
|
17 |
+
PORTUGUESE = "Portuguese"
|
18 |
+
RUSSIAN = "Russian"
|
19 |
+
CHINESE = "Chinese"
|
20 |
+
JAPANESE = "Japanese"
|
21 |
+
KOREAN = "Korean"
|
22 |
|
23 |
def transform_conversation(
|
24 |
entry: dict,
|
|
|
86 |
"label": message["rating"] == 1,
|
87 |
"timestamp": entry["timestamp"],
|
88 |
"session_id": entry["session_id"],
|
89 |
+
"conversation_id": entry["conversation_id"],
|
90 |
+
"language": entry["language"]
|
91 |
})
|
92 |
|
93 |
return data_points
|
94 |
|
95 |
def process_feel_dataset(
|
96 |
+
language: str,
|
97 |
model_name: str = "CohereForAI/aya-expanse-8b",
|
98 |
max_history_turns: int = 10,
|
99 |
max_history_tokens: int = 4000
|
|
|
102 |
Processes the feel dataset into a format suitable for KTO training using TRL.
|
103 |
|
104 |
Args:
|
105 |
+
language: Language to filter the dataset for (must be one of SupportedLanguages)
|
106 |
model_name: Name of the model to format for
|
107 |
max_history_turns: Maximum number of previous turns to include in history
|
108 |
max_history_tokens: Maximum number of tokens allowed in history
|
109 |
|
110 |
Returns:
|
111 |
dict: A dictionary containing the 'train' and 'test' splits of the dataset in KTO format
|
112 |
+
|
113 |
+
Raises:
|
114 |
+
ValueError: If language is not provided or not in SupportedLanguages
|
115 |
"""
|
116 |
+
# Validate language
|
117 |
+
if not language:
|
118 |
+
raise ValueError("Language parameter is required")
|
119 |
+
|
120 |
+
try:
|
121 |
+
# Validate that it's a supported language
|
122 |
+
SupportedLanguages(language)
|
123 |
+
except ValueError:
|
124 |
+
supported_langs = "\n- ".join([lang.value for lang in SupportedLanguages])
|
125 |
+
raise ValueError(
|
126 |
+
f"Invalid language: '{language}'\n"
|
127 |
+
f"Supported languages are:\n- {supported_langs}"
|
128 |
+
)
|
129 |
+
|
130 |
# Load feel dataset from HuggingFace
|
131 |
feel_dataset = load_dataset("feel-fl/feel-feedback")["train"]
|
132 |
+
|
133 |
+
# Filter dataset by language
|
134 |
+
feel_dataset = feel_dataset.filter(lambda x: x["language"] == language)
|
135 |
+
|
136 |
+
if len(feel_dataset) == 0:
|
137 |
+
raise ValueError(f"No data found for language: {language}")
|
138 |
+
|
139 |
kto_data = []
|
140 |
|
141 |
+
# Process all conversations in the filtered dataset
|
142 |
for entry in feel_dataset:
|
143 |
kto_data.extend(transform_conversation(
|
144 |
entry,
|
|
|
147 |
max_history_tokens
|
148 |
))
|
149 |
|
150 |
+
if len(kto_data) == 0:
|
151 |
+
raise ValueError(f"No valid training examples found for language: {language}")
|
152 |
+
|
153 |
# Convert to DataFrame
|
154 |
kto_df = pd.DataFrame(kto_data)
|
155 |
|
|
|
164 |
train_dataset = Dataset.from_pandas(train_df)
|
165 |
test_dataset = Dataset.from_pandas(test_df)
|
166 |
|
167 |
+
print(f"Processed {len(kto_data)} examples for language: {language}")
|
168 |
+
print(f"Train set size: {len(train_dataset)}")
|
169 |
+
print(f"Test set size: {len(test_dataset)}")
|
170 |
+
|
171 |
return {"train": train_dataset, "test": test_dataset}
|
172 |
|
173 |
if __name__ == "__main__":
|
174 |
# Process the dataset
|
175 |
+
datasets = process_feel_dataset("English")
|
|
|
|
|
|
|
|
|
|
|
176 |
|
177 |
# Print distribution of positive/negative labels
|
178 |
train_labels = datasets['train']['label']
|