David Pomerenke committed
Commit ce2acb0 · 1 Parent(s): 9e3bc4f

Add Global MMLU benchmark
Files changed (6)
  1. README.md +1 -0
  2. evals/backend.py +1 -1
  3. evals/main.py +4 -3
  4. evals/models.py +1 -1
  5. evals/tasks.py +41 -1
  6. results.json +0 -0
README.md CHANGED
@@ -11,6 +11,7 @@ datasets:
 - openlanguagedata/flores_plus
 - google/fleurs
 - mozilla-foundation/common_voice_1_0
+- CohereForAI/Global-MMLU
 models:
 - meta-llama/Llama-3.3-70B-Instruct
 - mistralai/Mistral-Small-24B-Instruct-2501
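
Note: the new dataset tag points at the benchmark consumed by the task added below. A minimal sketch of pulling one subset directly; the "en" config name is an assumption about the subsets Global-MMLU publishes, while the "dev"/"test" split names are the ones used in evals/tasks.py.

from datasets import load_dataset

# Sketch only: inspect one language subset of Global-MMLU.
data = load_dataset("CohereForAI/Global-MMLU", "en")
print(data)                          # splits include "dev" and "test"
print(data["test"][0]["question"])   # question text of the first test item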
evals/backend.py CHANGED
@@ -22,7 +22,7 @@ def mean(lst):
     return sum(lst) / len(lst) if lst else None
 
 
-task_metrics = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy"]
+task_metrics = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy", "mmlu_accuracy"]
 
 
 def make_model_table(df, models):
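
Note: the result rows produced by the new task carry separate "task" and "metric" fields, so "mmlu_accuracy" is the combined key for task="mmlu", metric="accuracy". A rough sketch of how such a key could be built and averaged per model; the helper name and the pivot are assumptions about make_model_table, whose body is not part of this diff.

import pandas as pd

def aggregate_task_metrics(df):
    # Hypothetical helper: combine task and metric into keys like "mmlu_accuracy"
    # and average scores per model, mirroring what task_metrics is used for.
    df = df.copy()
    df["task_metric"] = df["task"] + "_" + df["metric"]
    return df.pivot_table(index="model", columns="task_metric", values="score", aggfunc="mean")

# Example with the row shape returned by mmlu_and_evaluate:
rows = [
    {"model": "meta-llama/Llama-3.3-70B-Instruct", "bcp_47": "en",
     "task": "mmlu", "metric": "accuracy", "score": 1, "sentence_nr": 0},
    {"model": "meta-llama/Llama-3.3-70B-Instruct", "bcp_47": "en",
     "task": "mmlu", "metric": "accuracy", "score": 0, "sentence_nr": 1},
]
print(aggregate_task_metrics(pd.DataFrame(rows)))  # mmlu_accuracy -> 0.5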
evals/main.py CHANGED
@@ -12,6 +12,8 @@ from tasks import tasks
 # ===== config =====
 
 n_sentences = 10
+n_languages = 3
+n_models = 3
 
 # ===== run evaluation and aggregate results =====
 
@@ -22,9 +24,8 @@ async def evaluate():
         task(model, lang.bcp_47, i)
         for task in tasks
         for i in range(n_sentences)
-        for lang in languages.iloc[:100].itertuples()
-        for model in models["id"]
-        if lang.in_benchmark
+        for lang in languages.iloc[:n_languages].itertuples()
+        for model in models["id"].iloc[:n_models]
     ]
     return await tqdm_asyncio.gather(*results, miniters=1)
 
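
Note: together with the models.py change below, the hard-coded model cap moves into this config as n_models. A quick sanity check of how many coroutines evaluate() gathers for such a debug run; the numbers come from this diff, while the asyncio.run entry point is an assumption about how main.py is launched.

import asyncio

n_sentences, n_languages, n_models = 10, 3, 3
n_tasks = 4  # translate from/to, classification, and the new MMLU task
print(n_tasks * n_sentences * n_languages * n_models)  # 360 task coroutines per run

# results = asyncio.run(evaluate())  # assumed entry point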
 
evals/models.py CHANGED
@@ -116,7 +116,7 @@ async def transcribe(path, model="elevenlabs/scribe_v1"):
     raise ValueError(f"Model {model} not supported")
 
 
-models = pd.DataFrame(models, columns=["id"]).iloc[:3]
+models = pd.DataFrame(models, columns=["id"])
 
 
 @cache
evals/tasks.py CHANGED
@@ -8,6 +8,7 @@ from datasets_.flores import flores_sentences
 from joblib.memory import Memory
 from languages import languages, script_name
 from models import complete, transcribe
+from datasets import load_dataset
 
 cache = Memory(location=".cache", verbose=0).cache
 bleu = evaluate.load("bleu")
@@ -185,6 +186,45 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
         }
     ]
 
+@cache
+def _load_dataset(dataset, subset):
+    return load_dataset(dataset, subset)
+
+@cache
+async def mmlu_and_evaluate(model, language_bcp_47, nr):
+    data = _load_dataset("CohereForAI/Global-MMLU", language_bcp_47)
+    item = data["test"][nr]
+    def format_item(item):
+        return f"""{item['question']}
+
+A: {item['option_a']}
+B: {item['option_b']}
+C: {item['option_c']}
+D: {item['option_d']}
+
+A|B|C|D?"""
+    messages = []
+    for example in data["dev"].select(range(5)):
+        messages += [{"role": "user", "content": format_item(example)}, {"role": "assistant", "content": example["answer"]}]
+    messages += [{"role": "user", "content": format_item(item)}]
+    reply = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1,
+    )
+    print(reply.choices[0].message.content.strip())
+    acc = int(reply.choices[0].message.content.strip() == item["answer"])
+    return [
+        {
+            "model": model,
+            "bcp_47": language_bcp_47,
+            "task": "mmlu",
+            "metric": "accuracy",
+            "score": acc,
+            "sentence_nr": nr,
+        }
+    ]
 
 @cache
 async def transcribe_and_evaluate(model, language_bcp_47, nr):
@@ -217,11 +257,11 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
         }
     ]
 
-
 tasks = [
     partial(translate_and_evaluate, mode="from"),
     partial(translate_and_evaluate, mode="to"),
     classify_and_evaluate,
     # mlm_and_evaluate,
+    mmlu_and_evaluate,
     # transcribe_and_evaluate,
 ]
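
Note: to make the few-shot format concrete, here is what one turn built by format_item renders to for a made-up item; the field values are invented for illustration, only the column names match the Global-MMLU fields used above.

# Hypothetical Global-MMLU item; values are invented for illustration.
example = {
    "question": "Which planet is closest to the sun?",
    "option_a": "Venus", "option_b": "Mercury", "option_c": "Mars", "option_d": "Earth",
    "answer": "B",
}
prompt = f"""{example['question']}

A: {example['option_a']}
B: {example['option_b']}
C: {example['option_c']}
D: {example['option_d']}

A|B|C|D?"""
print(prompt)  # the model answers with a single token, which is compared to example["answer"]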
results.json CHANGED
The diff for this file is too large to render. See raw diff