David Pomerenke committed on
Commit
a683732
·
1 Parent(s): 47170a5

Implement MMLU task

Browse files
datasets.json CHANGED
@@ -285,7 +285,7 @@
285
  "parallel": true,
286
  "translation": "machine",
287
  "base": "MMLU",
288
- "implemented": false,
289
  "group": "Multitask Language Understanding"
290
  },
291
  {
 
285
  "parallel": true,
286
  "translation": "machine",
287
  "base": "MMLU",
288
+ "implemented": true,
289
  "group": "Multitask Language Understanding"
290
  },
291
  {
evals/datasets_/mmlu.py CHANGED
@@ -1,5 +1,6 @@
1
- from collections import Counter, defaultdict
2
  import random
 
 
3
  from datasets import get_dataset_config_names, load_dataset
4
  from joblib.memory import Memory
5
  from langcodes import Language, standardize_tag
@@ -119,12 +120,30 @@ def print_datasets_analysis():
119
 
120
  # print_datasets_analysis()
121
 
122
- def load_mmlu(language_bcp_47, i):
123
- categories = sorted(list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"])))
124
- category = categories[i % len(categories)]
125
- random.seed(i)
126
- j = random.randint(0, 100)
127
- print(j)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  tags_afrimmlu = {
129
  standardize_tag(a, macro=True): a
130
  for a in _get_dataset_config_names("masakhane/afrimmlu")
@@ -140,21 +159,25 @@ def load_mmlu(language_bcp_47, i):
140
  )
141
  if language_bcp_47 in tags_afrimmlu:
142
  ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
143
- return ds["test"].filter(lambda x: x["subject"] == category)[j]
 
 
 
144
  elif language_bcp_47 in tags_global_mmlu:
145
  ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
146
- def add_choices(split):
147
- split["choices"] = list(zip([split["option_a"], split["option_b"], split["option_c"], split["option_d"]]))
148
- return split
149
  ds = ds.map(add_choices)
150
- return ds["test"].filter(lambda x: x["subject"] == category)[j]
 
 
151
  elif language_bcp_47 in tags_okapi:
152
  ds = _load_dataset(
153
  "lighteval/okapi_mmlu", language_bcp_47, trust_remote_code=True
154
  )
155
- return ds["test"].filter(lambda x: x["id"] == f"{category}/test/{j}")[0]
 
 
156
  elif language_bcp_47 in tags_mmlux:
157
  # loading this is more complicated, todo
158
- return None
159
  else:
160
- return None
 
 
1
  import random
2
+ from collections import Counter, defaultdict
3
+
4
  from datasets import get_dataset_config_names, load_dataset
5
  from joblib.memory import Memory
6
  from langcodes import Language, standardize_tag
 
120
 
121
  # print_datasets_analysis()
122
 
123
+
124
def parse_choices(row):
    """Make sure ``row["choices"]`` is a list, parsing it from text if needed.

    Used as a ``Dataset.map`` callback for the AfriMMLU dataset. If the
    ``choices`` field is already a list the row is returned untouched;
    otherwise the value is treated as the textual form of a Python literal
    (presumably a stringified list of answer options -- confirm against the
    dataset) and parsed.

    Args:
        row: Mapping with a ``"choices"`` entry. It is modified in place.

    Returns:
        The same ``row`` object, with ``row["choices"]`` as parsed.

    Raises:
        ValueError, SyntaxError: If the stored text is not a valid Python
            literal.
    """
    # Local import so this block does not depend on the module header.
    import ast

    choices = row["choices"]
    if not isinstance(choices, list):
        # literal_eval accepts only literals (lists, strings, numbers, ...),
        # so a corrupt or malicious dataset cell cannot run arbitrary code
        # the way a bare eval() would.
        row["choices"] = ast.literal_eval(choices)
    return row
128
+
129
+
130
def add_choices(row):
    """Collect the four ``option_*`` fields of ``row`` into ``row["choices"]``.

    Used as a ``Dataset.map`` callback for the Global-MMLU dataset so that
    its rows expose a ``choices`` list, in A-D order, alongside the separate
    option columns.

    Args:
        row: Mapping with ``option_a`` .. ``option_d`` entries. It is
            modified in place.

    Returns:
        The same ``row`` object with the added ``"choices"`` list.
    """
    option_keys = ("option_a", "option_b", "option_c", "option_d")
    row["choices"] = [row[key] for key in option_keys]
    return row
138
+
139
+
140
+ def load_mmlu(language_bcp_47, nr):
141
+ categories = sorted(
142
+ list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
143
+ )
144
+ category = categories[nr % len(categories)]
145
+ random.seed(nr)
146
+ i = random.randint(0, 100)
147
  tags_afrimmlu = {
148
  standardize_tag(a, macro=True): a
149
  for a in _get_dataset_config_names("masakhane/afrimmlu")
 
159
  )
160
  if language_bcp_47 in tags_afrimmlu:
161
  ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
162
+ ds = ds.map(parse_choices)
163
+ examples = ds["dev"].filter(lambda x: x["subject"] == category)
164
+ task = ds["test"].filter(lambda x: x["subject"] == category)[i]
165
+ return "masakhane/afrimmlu", examples, task
166
  elif language_bcp_47 in tags_global_mmlu:
167
  ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
 
 
 
168
  ds = ds.map(add_choices)
169
+ examples = ds["dev"].filter(lambda x: x["subject"] == category)
170
+ task = ds["test"].filter(lambda x: x["subject"] == category)[i]
171
+ return "CohereForAI/Global-MMLU", examples, task
172
  elif language_bcp_47 in tags_okapi:
173
  ds = _load_dataset(
174
  "lighteval/okapi_mmlu", language_bcp_47, trust_remote_code=True
175
  )
176
+ examples = ds["dev"].filter(lambda x: x["subject"] == category)
177
+ task = ds["test"].filter(lambda x: x["id"] == f"{category}/test/{i}")[0]
178
+ return "lighteval/okapi_mmlu", examples, task
179
  elif language_bcp_47 in tags_mmlux:
180
  # loading this is more complicated, todo
181
+ return None, None, None
182
  else:
183
+ return None, None, None
evals/main.py CHANGED
@@ -12,7 +12,7 @@ from tasks import tasks
12
  # ===== config =====
13
 
14
  n_sentences = 10
15
- n_languages = 3
16
  n_models = 3
17
 
18
  # ===== run evaluation and aggregate results =====
 
12
  # ===== config =====
13
 
14
  n_sentences = 10
15
+ n_languages = 10
16
  n_models = 3
17
 
18
  # ===== run evaluation and aggregate results =====
evals/tasks.py CHANGED
@@ -5,10 +5,10 @@ import evaluate
5
  import pandas as pd
6
  import sentencepiece as spm
7
  from datasets_.flores import flores_sentences
 
8
  from joblib.memory import Memory
9
  from languages import languages, script_name
10
  from models import complete, transcribe
11
- from datasets import load_dataset, get_dataset_config_names
12
 
13
  cache = Memory(location=".cache", verbose=0).cache
14
  bleu = evaluate.load("bleu")
@@ -187,47 +187,47 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
187
  ]
188
 
189
 
190
-
191
  @cache
192
  async def mmlu_and_evaluate(model, language_bcp_47, nr):
193
- item = data["test"][nr]
 
 
 
194
  def format_item(item):
195
- return f"""{item['question']}
196
 
197
- A: {item['option_a']}
198
- B: {item['option_b']}
199
- C: {item['option_c']}
200
- D: {item['option_d']}
201
 
202
  A|B|C|D?"""
 
203
  messages = []
204
- for example in data["dev"].select(range(5)):
205
- messages += [{"role": "user", "content": format_item(example)}, {"role": "assistant", "content": example["answer"]}]
206
- messages += [{"role": "user", "content": format_item(item)}]
 
 
 
207
  reply = await complete(
208
  model=model,
209
  messages=messages,
210
  temperature=0,
211
  max_tokens=1,
212
  )
213
- print(reply.choices[0].message.content.strip())
214
- acc = int(reply.choices[0].message.content.strip() == item["answer"])
215
  return [
216
  {
217
  "model": model,
218
  "bcp_47": language_bcp_47,
219
  "task": "mmlu",
220
- "dataset": ds,
221
  "metric": "accuracy",
222
  "score": acc,
223
  "sentence_nr": nr,
224
  }
225
  ]
226
 
227
- from asyncio import run
228
- results = run(mmlu_and_evaluate("gpt-4o-mini", "fr", 0))
229
- print(results)
230
- exit()
231
 
232
  @cache
233
  async def transcribe_and_evaluate(model, language_bcp_47, nr):
@@ -260,6 +260,7 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
260
  }
261
  ]
262
 
 
263
  tasks = [
264
  partial(translate_and_evaluate, mode="from"),
265
  partial(translate_and_evaluate, mode="to"),
 
5
  import pandas as pd
6
  import sentencepiece as spm
7
  from datasets_.flores import flores_sentences
8
+ from datasets_.mmlu import load_mmlu
9
  from joblib.memory import Memory
10
  from languages import languages, script_name
11
  from models import complete, transcribe
 
12
 
13
  cache = Memory(location=".cache", verbose=0).cache
14
  bleu = evaluate.load("bleu")
 
187
  ]
188
 
189
 
 
190
@cache
async def mmlu_and_evaluate(model, language_bcp_47, nr):
    """Ask ``model`` one MMLU multiple-choice question and score the reply.

    The question and its few-shot examples come from ``load_mmlu``. Every
    example is sent as a user/assistant message pair, followed by the actual
    question; the model is asked for a single token at temperature 0.

    Args:
        model: Model identifier passed through to ``complete``.
        language_bcp_47: Language tag used to select the dataset.
        nr: Item number; forwarded to ``load_mmlu`` and recorded as
            ``sentence_nr`` in the result.

    Returns:
        A list with one result dict (``score`` is 1 if the first
        non-whitespace character of the reply equals the task's ``answer``,
        else 0), or an empty list when ``load_mmlu`` yields no task for the
        language.
    """
    # NOTE(review): load_mmlu also returns the dataset name, but the result
    # row has no "dataset" field -- confirm whether it should be recorded.
    _ds_name, examples, task = load_mmlu(language_bcp_47, nr)
    if not task:
        return []

    def format_item(item):
        # The prompt text is part of the evaluation; keep it byte-stable.
        return f"""{item["question"]}

A: {item["choices"][0]}
B: {item["choices"][1]}
C: {item["choices"][2]}
D: {item["choices"][3]}

A|B|C|D?"""

    messages = []
    for example in examples:
        messages += [
            {"role": "user", "content": format_item(example)},
            {"role": "assistant", "content": example["answer"]},
        ]
    messages += [{"role": "user", "content": format_item(task)}]
    reply = await complete(
        model=model,
        messages=messages,
        temperature=0,
        max_tokens=1,
    )
    # The content may be None (e.g. an empty completion); treat that as a
    # wrong answer instead of raising TypeError.
    content = reply.choices[0].message.content or ""
    # Strip BEFORE slicing: a one-token reply can carry leading whitespace
    # (" A"), and slicing first would reduce it to "" and score it as wrong.
    answer = content.strip()[:1]
    acc = int(answer == task["answer"])
    return [
        {
            "model": model,
            "bcp_47": language_bcp_47,
            "task": "mmlu",
            "metric": "accuracy",
            "score": acc,
            "sentence_nr": nr,
        }
    ]
230
 
 
 
 
 
231
 
232
  @cache
233
  async def transcribe_and_evaluate(model, language_bcp_47, nr):
 
260
  }
261
  ]
262
 
263
+
264
  tasks = [
265
  partial(translate_and_evaluate, mode="from"),
266
  partial(translate_and_evaluate, mode="to"),
frontend/src/components/DatasetTable.js CHANGED
@@ -145,7 +145,7 @@ const DatasetTable = ({ data }) => {
145
  filter
146
  filterElement={tasksRowFilterTemplate}
147
  showFilterMatchModes={false}
148
- style={{ minWidth: '10rem', maxWidth: '15rem' }}
149
  body={tasksBodyTemplate}
150
  />
151
  <Column
 
145
  filter
146
  filterElement={tasksRowFilterTemplate}
147
  showFilterMatchModes={false}
148
+ style={{ minWidth: '10rem', maxWidth: '10rem' }}
149
  body={tasksBodyTemplate}
150
  />
151
  <Column
results.json CHANGED
The diff for this file is too large to render. See raw diff