David Pomerenke committed
Commit ce2acb0 · Parent(s): 9e3bc4f
Add Global MMLU benchmark
Browse files
- README.md +1 -0
- evals/backend.py +1 -1
- evals/main.py +4 -3
- evals/models.py +1 -1
- evals/tasks.py +41 -1
- results.json +0 -0
README.md
CHANGED
@@ -11,6 +11,7 @@ datasets:
 - openlanguagedata/flores_plus
 - google/fleurs
 - mozilla-foundation/common_voice_1_0
+- CohereForAI/Global-MMLU
 models:
 - meta-llama/Llama-3.3-70B-Instruct
 - mistralai/Mistral-Small-24B-Instruct-2501
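For context, the new dataset entry points at Global-MMLU, which is distributed in per-language subsets with dev and test splits and question/option_a…option_d/answer fields (the same fields evals/tasks.py uses below). A minimal sketch for inspecting one item, assuming the subset can be addressed by a plain language code such as "en":

```python
from datasets import load_dataset

# Quick schema check of Global-MMLU; "en" as the subset name is an assumption.
data = load_dataset("CohereForAI/Global-MMLU", "en")
item = data["test"][0]
print(item["question"])
print(item["option_a"], item["option_b"], item["option_c"], item["option_d"])
print("answer:", item["answer"])
```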
evals/backend.py
CHANGED
@@ -22,7 +22,7 @@ def mean(lst):
     return sum(lst) / len(lst) if lst else None
 
 
-task_metrics = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy"]
+task_metrics = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy", "mmlu_accuracy"]
 
 
 def make_model_table(df, models):
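The new "mmlu_accuracy" entry lines up with the result rows that evals/tasks.py emits ("task": "mmlu", "metric": "accuracy"). A minimal sketch of the presumed aggregation, assuming the column name is composed as f"{task}_{metric}" (the actual backend code is not shown in this diff):

```python
# Sketch only: how a task_metrics column like "mmlu_accuracy" could be averaged
# from the per-item result rows produced by evals/tasks.py.
def mean(lst):
    return sum(lst) / len(lst) if lst else None

def aggregate(rows, task="mmlu", metric="accuracy"):
    # rows are dicts with "task", "metric", and a 0/1 "score"
    scores = [r["score"] for r in rows if r["task"] == task and r["metric"] == metric]
    return mean(scores)
```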
evals/main.py
CHANGED
@@ -12,6 +12,8 @@ from tasks import tasks
 # ===== config =====
 
 n_sentences = 10
+n_languages = 3
+n_models = 3
 
 # ===== run evaluation and aggregate results =====
 
@@ -22,9 +24,8 @@ async def evaluate():
         task(model, lang.bcp_47, i)
         for task in tasks
         for i in range(n_sentences)
-        for lang in languages.iloc[:
-        for model in models["id"]
-        if lang.in_benchmark
+        for lang in languages.iloc[:n_languages].itertuples()
+        for model in models["id"].iloc[:n_models]
     ]
     return await tqdm_asyncio.gather(*results, miniters=1)
 
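With the new caps, the size of one run is easy to estimate: the comprehension crosses tasks × sentences × languages × models, and after this commit evals/tasks.py has four active tasks (translation from/to, classification, MMLU). A back-of-the-envelope sketch, not project code:

```python
# Rough count of coroutines scheduled per run under the new limits.
n_tasks = 4                         # translate from, translate to, classify, mmlu
n_sentences, n_languages, n_models = 10, 3, 3
print(n_tasks * n_sentences * n_languages * n_models)  # 4 * 10 * 3 * 3 = 360
```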
evals/models.py
CHANGED
@@ -116,7 +116,7 @@ async def transcribe(path, model="elevenlabs/scribe_v1"):
     raise ValueError(f"Model {model} not supported")
 
 
-models = pd.DataFrame(models, columns=["id"])
+models = pd.DataFrame(models, columns=["id"])
 
 
 @cache
evals/tasks.py
CHANGED
@@ -8,6 +8,7 @@ from datasets_.flores import flores_sentences
 from joblib.memory import Memory
 from languages import languages, script_name
 from models import complete, transcribe
+from datasets import load_dataset
 
 cache = Memory(location=".cache", verbose=0).cache
 bleu = evaluate.load("bleu")
@@ -185,6 +186,45 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
         }
     ]
 
+@cache
+def _load_dataset(dataset, subset):
+    return load_dataset(dataset, subset)
+
+@cache
+async def mmlu_and_evaluate(model, language_bcp_47, nr):
+    data = _load_dataset("CohereForAI/Global-MMLU", language_bcp_47)
+    item = data["test"][nr]
+    def format_item(item):
+        return f"""{item['question']}
+
+        A: {item['option_a']}
+        B: {item['option_b']}
+        C: {item['option_c']}
+        D: {item['option_d']}
+
+        A|B|C|D?"""
+    messages = []
+    for example in data["dev"].select(range(5)):
+        messages += [{"role": "user", "content": format_item(example)}, {"role": "assistant", "content": example["answer"]}]
+    messages += [{"role": "user", "content": format_item(item)}]
+    reply = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1,
+    )
+    print(reply.choices[0].message.content.strip())
+    acc = int(reply.choices[0].message.content.strip() == item["answer"])
+    return [
+        {
+            "model": model,
+            "bcp_47": language_bcp_47,
+            "task": "mmlu",
+            "metric": "accuracy",
+            "score": acc,
+            "sentence_nr": nr,
+        }
+    ]
@@ -217,11 +257,11 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
         }
     ]
 
-
 tasks = [
     partial(translate_and_evaluate, mode="from"),
     partial(translate_and_evaluate, mode="to"),
    classify_and_evaluate,
     # mlm_and_evaluate,
+    mmlu_and_evaluate,
     # transcribe_and_evaluate,
 ]
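The new mmlu_and_evaluate coroutine builds a 5-shot prompt from the dev split and scores a single test item with a one-token completion. A one-off smoke test might look like the sketch below; the model id and "en" language code are illustrative, and whether Global-MMLU subset names match the project's bcp_47 codes exactly is an assumption.

```python
import asyncio
from tasks import mmlu_and_evaluate  # assumes running from the evals/ directory

# Illustrative smoke test of a single MMLU item for one model and language.
result = asyncio.run(mmlu_and_evaluate("meta-llama/Llama-3.3-70B-Instruct", "en", 0))
print(result)  # [{"model": ..., "task": "mmlu", "metric": "accuracy", "score": 0 or 1, ...}]
```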
results.json
CHANGED
The diff for this file is too large to render. See raw diff.