Jen Ben Arye committed
Commit d151abe · 1 Parent(s): 78757b7

added filtering by language

Files changed (1)
  1. ml/kto_dataset_processor.py +52 -9
ml/kto_dataset_processor.py CHANGED
@@ -4,7 +4,21 @@ from sklearn.model_selection import train_test_split
 import json
 from ipdb import set_trace as st
 from transformers import AutoTokenizer
-
+from enum import Enum
+
+class SupportedLanguages(str, Enum):
+    """Enumeration of supported languages"""
+    ENGLISH = "English"
+    DUTCH = "Dutch"
+    ITALIAN = "Italian"
+    SPANISH = "Spanish"
+    FRENCH = "French"
+    GERMAN = "German"
+    PORTUGUESE = "Portuguese"
+    RUSSIAN = "Russian"
+    CHINESE = "Chinese"
+    JAPANESE = "Japanese"
+    KOREAN = "Korean"
 
 def transform_conversation(
     entry: dict,
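
Since SupportedLanguages subclasses both str and Enum, constructing it from a raw string returns the matching member or raises ValueError, and members compare equal to plain strings. A quick sketch of that behavior, which the validation further down relies on:

    SupportedLanguages("French")            # -> SupportedLanguages.FRENCH
    SupportedLanguages("Klingon")           # -> ValueError: 'Klingon' is not a valid SupportedLanguages
    SupportedLanguages.FRENCH == "French"   # -> True: members are also str instances
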
@@ -72,12 +86,14 @@ def transform_conversation(
             "label": message["rating"] == 1,
             "timestamp": entry["timestamp"],
             "session_id": entry["session_id"],
-            "conversation_id": entry["conversation_id"]
+            "conversation_id": entry["conversation_id"],
+            "language": entry["language"]
         })
 
     return data_points
 
 def process_feel_dataset(
+    language: str,
     model_name: str = "CohereForAI/aya-expanse-8b",
     max_history_turns: int = 10,
     max_history_tokens: int = 4000
@@ -86,18 +102,43 @@ def process_feel_dataset(
     Processes the feel dataset into a format suitable for KTO training using TRL.
 
     Args:
+        language: Language to filter the dataset for (must be one of SupportedLanguages)
         model_name: Name of the model to format for
        max_history_turns: Maximum number of previous turns to include in history
         max_history_tokens: Maximum number of tokens allowed in history
 
     Returns:
         dict: A dictionary containing the 'train' and 'test' splits of the dataset in KTO format
+
+    Raises:
+        ValueError: If language is not provided or not in SupportedLanguages
     """
+    # Validate language
+    if not language:
+        raise ValueError("Language parameter is required")
+
+    try:
+        # Validate that it's a supported language
+        SupportedLanguages(language)
+    except ValueError:
+        supported_langs = "\n- ".join([lang.value for lang in SupportedLanguages])
+        raise ValueError(
+            f"Invalid language: '{language}'\n"
+            f"Supported languages are:\n- {supported_langs}"
+        )
+
     # Load feel dataset from HuggingFace
     feel_dataset = load_dataset("feel-fl/feel-feedback")["train"]
+
+    # Filter dataset by language
+    feel_dataset = feel_dataset.filter(lambda x: x["language"] == language)
+
+    if len(feel_dataset) == 0:
+        raise ValueError(f"No data found for language: {language}")
+
     kto_data = []
 
-    # Process all conversations in the dataset
+    # Process all conversations in the filtered dataset
     for entry in feel_dataset:
         kto_data.extend(transform_conversation(
             entry,
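
For reference, Dataset.filter from the datasets library keeps only the rows where the predicate returns True, so the lambda above drops every entry whose "language" field differs from the requested one. A self-contained sketch on a toy dataset (these columns are illustrative stand-ins, not the actual feel-feedback schema):

    from datasets import Dataset

    # Hypothetical rows standing in for feel-fl/feel-feedback
    toy = Dataset.from_dict({
        "language": ["English", "French", "English"],
        "session_id": ["a", "b", "c"],
    })

    # Same pattern as the commit: keep only rows in the requested language
    filtered = toy.filter(lambda x: x["language"] == "English")
    print(len(filtered))  # 2
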
@@ -106,6 +147,9 @@ def process_feel_dataset(
             max_history_tokens
         ))
 
+    if len(kto_data) == 0:
+        raise ValueError(f"No valid training examples found for language: {language}")
+
     # Convert to DataFrame
     kto_df = pd.DataFrame(kto_data)
 
@@ -120,16 +164,15 @@
     train_dataset = Dataset.from_pandas(train_df)
     test_dataset = Dataset.from_pandas(test_df)
 
+    print(f"Processed {len(kto_data)} examples for language: {language}")
+    print(f"Train set size: {len(train_dataset)}")
+    print(f"Test set size: {len(test_dataset)}")
+
     return {"train": train_dataset, "test": test_dataset}
 
 if __name__ == "__main__":
     # Process the dataset
-    datasets = process_feel_dataset()
-
-    # Print basic statistics
-    print("\nDataset Statistics:")
-    print(f"Train set size: {len(datasets['train'])}")
-    print(f"Test set size: {len(datasets['test'])}")
+    datasets = process_feel_dataset("English")
 
     # Print distribution of positive/negative labels
     train_labels = datasets['train']['label']
 
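A short usage sketch of the updated entry point (the import path here is assumed from the file's location in the repo, and loading feel-fl/feel-feedback requires network access to the HuggingFace Hub):

    from ml.kto_dataset_processor import process_feel_dataset

    # Language is now a required positional argument
    splits = process_feel_dataset("Spanish")
    print(len(splits["train"]), len(splits["test"]))

    # An unsupported language raises ValueError listing the supported options
    try:
        process_feel_dataset("Klingon")
    except ValueError as err:
        print(err)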