David Pomerenke
committed on
Commit
·
da6e1bc
1
Parent(s):
4718a02
Refactor eval code into files
Browse files- evals/__init__.py +1 -0
- app.py → evals/app.py +0 -0
- evals/datasets_/commonvoice.py +32 -0
- evals/datasets_/fleurs.py +37 -0
- evals/datasets_/flores.py +40 -0
- evals/languages.py +66 -0
- evals/main.py +110 -0
- evals/models.py +82 -0
- evals.py → evals/tasks.py +9 -346
- frontend/public/results.json +28 -18
- results.json +0 -0
evals/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
app.py → evals/app.py
RENAMED
File without changes
|
evals/datasets_/commonvoice.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from datetime import date
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
from joblib.memory import Memory
|
6 |
+
from langcodes import standardize_tag
|
7 |
+
from requests import get
|
8 |
+
|
9 |
+
cache = Memory(location=".cache", verbose=0).cache
|
10 |
+
|
11 |
+
|
12 |
+
# load CommonVoice stats
|
13 |
+
@cache  # cache for 1 day: the date argument only serves as the cache key
def get_commonvoice_stats(date: date):
    """Fetch the per-language statistics from the Common Voice API.

    Args:
        date: Not used in the body; because the function is memoised on its
            arguments, passing today's date makes the on-disk cache entry
            roll over once per day.

    Returns:
        The decoded JSON payload of the stats endpoint.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    response = get(
        "https://commonvoice.mozilla.org/api/v1/stats/languages", timeout=30
    )
    # Fail loudly here: otherwise an error payload would be memoised on disk
    # and served for the rest of the day.
    response.raise_for_status()
    return response.json()
|
16 |
+
|
17 |
+
|
18 |
+
# Build the CommonVoice table from today's (cached) API snapshot, keeping only
# the locale and the validated hours under dataset-prefixed column names.
commonvoice = pd.DataFrame(get_commonvoice_stats(date.today())).rename(
    columns={"locale": "commonvoice_locale", "validatedHours": "commonvoice_hours"}
)[["commonvoice_locale", "commonvoice_hours"]]
# ignore country (language is language) (in practice this is only relevant to zh-CN/zh-TW/zh-HK)
commonvoice["bcp_47"] = commonvoice["commonvoice_locale"].apply(
    lambda x: re.sub(r"-[A-Z]{2}$", "", x)
)
commonvoice["bcp_47"] = commonvoice["bcp_47"].apply(
    lambda x: standardize_tag(x, macro=True)
)  # this does not really seem to get macrolanguages though, e.g. not for Quechua
# Locales that collapse onto the same tag are merged into one row: their hours
# are summed and the first locale of the group is kept as representative.
commonvoice = (
    commonvoice.groupby("bcp_47")
    .agg({"commonvoice_hours": "sum", "commonvoice_locale": "first"})
    .reset_index()
)
|
evals/datasets_/fleurs.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from langcodes import standardize_tag
|
3 |
+
from pathlib import Path
|
4 |
+
import tarfile
|
5 |
+
import requests
|
6 |
+
|
7 |
+
fleurs_tags = "af_za,am_et,ar_eg,as_in,ast_es,az_az,be_by,bg_bg,bn_in,bs_ba,ca_es,ceb_ph,ckb_iq,cmn_hans_cn,cs_cz,cy_gb,da_dk,de_de,el_gr,en_us,es_419,et_ee,fa_ir,ff_sn,fi_fi,fil_ph,fr_fr,ga_ie,gl_es,gu_in,ha_ng,he_il,hi_in,hr_hr,hu_hu,hy_am,id_id,ig_ng,is_is,it_it,ja_jp,jv_id,ka_ge,kam_ke,kea_cv,kk_kz,km_kh,kn_in,ko_kr,ky_kg,lb_lu,lg_ug,ln_cd,lo_la,lt_lt,luo_ke,lv_lv,mi_nz,mk_mk,ml_in,mn_mn,mr_in,ms_my,mt_mt,my_mm,nb_no,ne_np,nl_nl,nso_za,ny_mw,oc_fr,om_et,or_in,pa_in,pl_pl,ps_af,pt_br,ro_ro,ru_ru,sd_in,sk_sk,sl_si,sn_zw,so_so,sr_rs,sv_se,sw_ke,ta_in,te_in,tg_tj,th_th,tr_tr,uk_ua,umb_ao,ur_pk,uz_uz,vi_vn,wo_sn,xh_za,yo_ng,yue_hant_hk,zu_za"
|
8 |
+
|
9 |
+
def _fleurs_tag_to_bcp_47(tag):
    """Map a FLEURS tag such as ``cmn_hans_cn`` to a standardized language tag."""
    # Only the first underscore-separated part is passed on to langcodes.
    language_part = tag.split("_")[0]
    return standardize_tag(language_part, macro=True)


# One row per FLEURS tag, plus the tag used to join with the other datasets.
fleurs = pd.DataFrame(fleurs_tags.split(","), columns=["fleurs_tag"])
fleurs["bcp_47"] = fleurs["fleurs_tag"].apply(_fleurs_tag_to_bcp_47)
|
13 |
+
|
14 |
+
def download_file(url, path):
    """Download ``url`` and store the response body at ``path``.

    The body is streamed into a temporary sibling file that is renamed onto
    ``path`` only after the transfer finished, so an interrupted download never
    leaves a truncated file under the final name.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            original wrote the error page to ``path`` as if it were the file.
    """
    target = Path(path)
    partial = target.with_name(target.name + ".part")
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(partial, "wb") as f:
            # Chunked writes: the payload does not have to fit in memory.
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    partial.replace(target)
|
18 |
+
|
19 |
+
|
20 |
+
def _fleurs_audio_extracted(audio_path, tar_path):
    """Return True if ``audio_path`` holds anything besides the archive itself."""
    if not audio_path.is_dir():
        return False
    # The archive is stored inside the audio directory, so the directory merely
    # existing says nothing about whether extraction ever happened.
    return any(
        not entry.name.startswith(tar_path.name) for entry in audio_path.iterdir()
    )


def download_fleurs(transcription_langs_eval):
    """Download and unpack the FLEURS dev split for every language in the table.

    Args:
        transcription_langs_eval: DataFrame with a ``fleurs_tag`` column. Rows
            whose tag is missing are skipped instead of requesting a
            non-existent ``.../data/nan/...`` URL.

    Files are stored below ``data/fleurs/<fleurs_tag>/``; anything already
    present is not downloaded again.
    """
    # the huggingface loader does not allow loading only the dev set, so do it manually
    base_url = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
    # Use the "data" extraction filter where this Python version offers it, so
    # archive members cannot be written outside of the target directory.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    for language in transcription_langs_eval.itertuples():
        tag = language.fleurs_tag
        if pd.isna(tag):
            continue
        tar_url = f"{base_url}/{tag}/audio/dev.tar.gz"
        tar_path = Path(f"data/fleurs/{tag}/audio/dev.tar.gz")
        audio_path = Path(f"data/fleurs/{tag}/audio")
        # The original tested audio_path.exists(), but the mkdir below creates
        # that directory before the download, so one failed attempt made every
        # later run skip this language.
        if not _fleurs_audio_extracted(audio_path, tar_path):
            print(f"Downloading {tar_url} to {tar_path}")
            tar_path.parent.mkdir(parents=True, exist_ok=True)
            download_file(tar_url, tar_path)
            with tarfile.open(tar_path, "r:gz") as tar:
                tar.extractall(path=audio_path, **extract_kwargs)
        tsv_url = f"{base_url}/{tag}/dev.tsv"
        tsv_path = Path(f"data/fleurs/{tag}/dev.tsv")
        if not tsv_path.exists():
            print(f"Downloading {tsv_url} to {tsv_path}")
            tsv_path.parent.mkdir(parents=True, exist_ok=True)
            download_file(tsv_url, tsv_path)
|
evals/datasets_/flores.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langcodes import Language, standardize_tag
|
2 |
+
import pandas as pd
|
3 |
+
import os
|
4 |
+
import re
|
5 |
+
|
6 |
+
flores_dir = "data/floresp-v2.0-rc.3/dev"


def flores_sentences(language):
    """Return the FLORES+ dev sentences of ``language`` as a list of lines.

    Args:
        language: Any object with a ``flores_path`` attribute; the file
            ``dev.<flores_path>`` inside ``flores_dir`` is read.

    Returns:
        The lines of the file, each still carrying its trailing newline.
    """
    # The context manager closes the handle (the original leaked it). The
    # explicit encoding keeps the result independent of the platform locale;
    # assumes the corpus files are UTF-8 - TODO confirm.
    with open(f"{flores_dir}/dev.{language.flores_path}", encoding="utf-8") as f:
        return f.readlines()
|
10 |
+
|
11 |
+
def aggregate_flores_paths(flores_paths):
    """Choose one FLORES path for a language that comes in several scripts.

    ``flores_paths`` holds the paths of a single language group. A lone path is
    returned as is; otherwise the path whose standardized tag has the largest
    writing population wins, the first one on ties.
    """
    candidates = flores_paths.values
    if len(flores_paths) == 1:
        return candidates[0]
    populations = []
    for path in candidates:
        tag = standardize_tag(path, macro=True)
        populations.append(Language.get(tag).writing_population())
    winner = max(range(len(populations)), key=populations.__getitem__)
    return candidates[winner]
|
21 |
+
|
22 |
+
|
23 |
+
|
24 |
+
# One row per FLORES+ dev file. The files are named "dev.<flores_path>" (that is
# how flores_sentences opens them), so any other directory entry - e.g. a
# hidden OS file - is ignored instead of producing a bogus row or an IndexError.
flores = pd.DataFrame(
    [f.split(".")[1] for f in os.listdir(flores_dir) if f.startswith("dev.")],
    columns=["flores_path"],
)
flores["bcp_47"] = flores["flores_path"].apply(
    lambda x: standardize_tag(x, macro=True),
)
# ignore script (language is language)
flores["bcp_47"] = flores["bcp_47"].apply(
    lambda x: re.sub(r"-[A-Z][a-z]+$", "", x)
)
# Collapse to one row per language; aggregate_flores_paths picks the path to
# keep when a language is available in more than one script.
flores = (
    flores.groupby("bcp_47")
    .agg({"flores_path": aggregate_flores_paths})
    .reset_index()
)
|
40 |
+
|
evals/languages.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
from datasets_.commonvoice import commonvoice
|
5 |
+
from datasets_.fleurs import fleurs
|
6 |
+
from datasets_.flores import flores
|
7 |
+
from joblib.memory import Memory
|
8 |
+
from langcodes import Language, standardize_tag
|
9 |
+
from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
|
10 |
+
|
11 |
+
cache = Memory(location=".cache", verbose=0).cache
|
12 |
+
|
13 |
+
# load general language data
# Keep only tags that do not end in "-XX" (two uppercase letters, presumably a
# region subtag); those per-suffix entries are what population() reports.
languages = {
    lang: pop
    for lang, pop in LANGUAGE_SPEAKING_POPULATION.items()
    if not re.match(r".*-[A-Z]{2}$", lang)
}
# From here on `languages` is a DataFrame with one row per tag.
languages = pd.DataFrame(list(languages.items()), columns=["bcp_47", "speakers"])
languages["language_name"] = languages["bcp_47"].apply(
    lambda x: Language.get(x).display_name()
)

# load script codes and names
# The two CSV columns are renamed to the names that script_name() queries.
scripts = pd.read_csv("data/ScriptCodes.csv").rename(
    columns={"Code": "iso15924", "English Name": "script_name"}
)
|
28 |
+
|
29 |
+
|
30 |
+
def population(bcp_47):
    """Return the speaker counts recorded for ``bcp_47`` per two-letter suffix.

    Only entries of the form ``<bcp_47>-XX`` (two uppercase letters) are
    considered; the result maps the tag with its leading lowercase prefix
    removed to the recorded population.
    """
    wanted = rf"^{bcp_47}-[A-Z]{{2}}$"
    by_suffix = {}
    for tag, count in LANGUAGE_SPEAKING_POPULATION.items():
        if re.match(wanted, tag):
            by_suffix[re.sub(r"^[a-z]+-", "", tag)] = count
    return by_suffix
|
37 |
+
|
38 |
+
|
39 |
+
# Glottolog languoid table. Only the empty string is parsed as missing, because
# pandas' default NA markers would also swallow legitimate values.
glottolog = pd.read_csv(
    "data/glottolog_languoid.csv/languoid.csv", na_values=[""], keep_default_na=False
)  # Min _Nan_ Chinese is not N/A!
# Standardized tag derived from the ISO 639-3 code; rows without a code get None.
glottolog["bcp_47"] = glottolog["iso639P3code"].apply(
    lambda x: standardize_tag(x, macro=True) if not pd.isna(x) else None
)
|
45 |
+
|
46 |
+
|
47 |
+
@cache
def language_family(bcp_47):
    """Return the name of the Glottolog family that ``bcp_47`` belongs to.

    Returns:
        The family name, or None when the tag has no Glottolog row, when that
        row has no ``family_id``, or when the referenced family row is absent.
    """
    matches = glottolog[glottolog["bcp_47"] == bcp_47]
    if matches.empty:
        # The original raised IndexError here. Tags come from a different data
        # source than Glottolog, so a missing row is treated like "no family".
        return None
    family_id = matches.iloc[0]["family_id"]
    if pd.isna(family_id):
        return None
    families = glottolog[glottolog["id"] == family_id]
    if families.empty:
        return None
    return families.iloc[0]["name"]
|
54 |
+
|
55 |
+
|
56 |
+
def script_name(iso15924):
    """Return the English name of the script with code ``iso15924``.

    Raises IndexError when the code is not present in the scripts table.
    """
    is_match = scripts["iso15924"] == iso15924
    names = scripts.loc[is_match, "script_name"]
    return names.values[0]
|
58 |
+
|
59 |
+
|
60 |
+
# merge data
# Always a "left" join, to keep it simple for now: every language of the base
# table is kept, and dataset columns are empty where a dataset lacks it.
languages = pd.merge(languages, flores, on="bcp_47", how="left")
languages = pd.merge(languages, fleurs, on="bcp_47", how="left")
languages = pd.merge(languages, commonvoice, on="bcp_47", how="left")
# A language counts as "in benchmark" when FLORES has a file for it.
languages["in_benchmark"] = languages["bcp_47"].isin(flores["bcp_47"])
# Largest speaker population first; positional slices of this table therefore
# select the most widely spoken languages.
languages = languages.sort_values(by="speakers", ascending=False)
|
evals/main.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import json
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
import pandas as pd
|
6 |
+
from rich import print
|
7 |
+
from tqdm.asyncio import tqdm_asyncio
|
8 |
+
from languages import languages
|
9 |
+
from tasks import tasks
|
10 |
+
from models import models, model_fast
|
11 |
+
|
12 |
+
# ===== config =====

# Sentence indices 0..n_sentences-1 are evaluated for every scheduled
# (task, language, model) combination.
n_sentences = 30
# Leading rows of the `languages` table: the wider slice is evaluated on the
# fast model only, the "detailed" slice on every model (see evaluate()).
langs_eval = languages.iloc[:10]
langs_eval_detailed = languages.iloc[:2]
# Same split for the transcription task.
transcription_langs_eval = languages.iloc[:10]
transcription_langs_eval_detailed = languages.iloc[:5]
|
19 |
+
|
20 |
+
# ===== run evaluation and aggregate results =====
|
21 |
+
|
22 |
+
async def evaluate():
    """Schedule every task/sentence/language/model evaluation and await them all.

    Languages outside the benchmark are skipped. The fast model is run on every
    remaining language, the other models only on the detailed language subset.
    Returns the gathered results in scheduling order.
    """
    print("running evaluations")
    pending = []
    for task in tasks:
        for i in range(n_sentences):
            for original_language in langs_eval.itertuples():
                for model in models:
                    if not original_language.in_benchmark:
                        continue
                    if (
                        model == model_fast
                        or original_language.bcp_47 in langs_eval_detailed.bcp_47.values
                    ):
                        pending.append(task(model, original_language.bcp_47, i))
    return await tqdm_asyncio.gather(*pending, miniters=1)
|
37 |
+
|
38 |
+
def aggregate(results):
    """Average the raw task outputs and summarise them per language, model and task.

    Args:
        results: Iterable of iterables of score records, one inner iterable per
            evaluated task call.

    Returns:
        A tuple ``(scores, lang_results, model_results, task_results)``:
        the scores averaged per model/language/task/metric, followed by the
        per-language (outer-joined with the language table), per-model and
        per-task summaries.
    """
    records = []
    for batch in results:
        records.extend(batch)
    scores = pd.DataFrame(records)
    scores = scores.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()

    def summarise(keys, counted):
        # Mean score per group plus the number of distinct values of each
        # column listed in `counted`.
        spec = {"score": "mean"}
        for column in counted:
            spec[column] = "nunique"
        return scores.groupby(keys).agg(spec).reset_index()

    lang_results = summarise(["bcp_47", "task", "metric"], ["model"])
    lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
    model_results = summarise(["model", "task", "metric"], ["bcp_47"])
    task_results = summarise(["task", "metric"], ["bcp_47", "model"])
    return scores, lang_results, model_results, task_results
|
60 |
+
|
61 |
+
def mean(lst):
    """Return the arithmetic mean of ``lst``, or None when it is empty."""
    if not lst:
        return None
    return sum(lst) / len(lst)
|
63 |
+
|
64 |
+
|
65 |
+
def fmt_name(s):
    """Turn a hyphenated identifier into a display name.

    Each hyphen-separated word is capitalized and the words are joined with
    spaces. A word ending in "ai" gets that suffix upper-cased ("openai" ->
    "OpenAI", "mistralai" -> "MistralAI"), and "Gpt" becomes "GPT".
    """
    words = []
    for word in s.split("-"):
        word = word.capitalize()
        # Only a trailing "ai" is treated as the acronym. The original replaced
        # every "ai" in the joined string, turning "haiku" into "HAIku".
        if word.endswith("ai"):
            word = word[:-2] + "AI"
        words.append(word)
    return " ".join(words).replace("Gpt", "GPT")
|
67 |
+
|
68 |
+
def serialize(df):
    """Convert ``df`` into a list of row dicts with NaN replaced by None."""
    without_nan = df.replace({np.nan: None})
    records = without_nan.to_dict(orient="records")
    return records
|
70 |
+
|
71 |
+
def make_model_table(model_results):
    """Build the ranked model leaderboard from the per-model results.

    Args:
        model_results: DataFrame with at least the columns ``model``, ``task``,
            ``metric`` and ``score``. It is left unmodified.

    Returns:
        DataFrame with the columns ``rank``, ``provider``, ``model``,
        ``average`` and one column per ``<task>_<metric>``, sorted by average
        score in descending order. Missing scores count as 0.
    """
    # assign() returns a new frame; the original wrote the helper column into
    # the caller's DataFrame.
    scores = model_results.assign(
        task_metric=model_results["task"] + "_" + model_results["metric"]
    ).drop(columns=["task", "metric"])
    model_table = scores.pivot(
        index="model", columns="task_metric", values="score"
    ).fillna(0)
    # Remember the metric columns by name instead of slicing them out by
    # position after further columns have been appended.
    metric_columns = list(model_table.columns)
    model_table["average"] = model_table[metric_columns].mean(axis=1)
    model_table = model_table.sort_values(by="average", ascending=False)
    model_table = model_table.round(2).reset_index()
    # Model identifiers have the form "<provider>/<model>".
    identifier_parts = model_table["model"].str.split("/")
    model_table["provider"] = identifier_parts.str[0].apply(fmt_name)
    model_table["model"] = identifier_parts.str[1].apply(fmt_name)
    model_table["rank"] = model_table.index + 1
    return model_table[["rank", "provider", "model", "average", *metric_columns]]
|
87 |
+
|
88 |
+
|
89 |
+
async def main():
    """Run all evaluations and write the results as JSON.

    ``results.json`` receives the task, model, language and score tables;
    ``frontend/public/results.json`` receives the ranked model table.
    """
    results = await evaluate()
    results, lang_results, model_results, task_results = aggregate(results)
    all_results = {
        "tasks": serialize(task_results),
        "models": serialize(model_results),
        "languages": serialize(lang_results),
        "scores": serialize(results),
    }
    # ensure_ascii=False writes non-ASCII characters verbatim, so the encoding
    # is fixed to UTF-8 instead of depending on the platform locale.
    with open("results.json", "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)

    model_table = make_model_table(model_results)
    all_tables = {
        "model_table": serialize(model_table),
    }
    with open("frontend/public/results.json", "w", encoding="utf-8") as f:
        json.dump(all_tables, f, indent=2, ensure_ascii=False)
|
107 |
+
|
108 |
+
|
109 |
+
if __name__ == "__main__":
|
110 |
+
asyncio.run(main())
|
evals/models.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from os import getenv
|
2 |
+
|
3 |
+
from aiolimiter import AsyncLimiter
|
4 |
+
from dotenv import load_dotenv
|
5 |
+
from elevenlabs import AsyncElevenLabs
|
6 |
+
from huggingface_hub import AsyncInferenceClient
|
7 |
+
from joblib.memory import Memory
|
8 |
+
from openai import AsyncOpenAI
|
9 |
+
|
10 |
+
# for development purposes, all languages will be evaluated on the fast models
# and only a sample of languages will be evaluated on all models
# Identifiers have the form "<provider>/<model>"; the trailing comments give
# the approximate price per million tokens.
models = [
    "openai/gpt-4o-mini",  # 0.6$/M tokens
    # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
    "meta-llama/llama-3.3-70b-instruct",  # 0.3$/M tokens
    "mistralai/mistral-small-24b-instruct-2501",  # 0.14$/M tokens
    "google/gemini-2.0-flash-001",  # 0.4$/M tokens
    # "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
    # "deepseek/deepseek-chat", # 0.9$/M tokens
    # "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
    "google/gemma-3-27b-it",  # 0.2$/M tokens
]
# The one entry of `models` that is run on every evaluated language.
model_fast = "meta-llama/llama-3.3-70b-instruct"

# Speech-to-text models; transcribe() dispatches on the provider prefix.
transcription_models = [
    "elevenlabs/scribe_v1",
    "openai/whisper-large-v3",
    # "openai/whisper-small",
    # "facebook/seamless-m4t-v2-large",
]
transcription_model_fast = "elevenlabs/scribe_v1"
|
32 |
+
|
33 |
+
# Presumably loads the API keys read via getenv() below from a .env file.
load_dotenv()
# OpenAI-compatible client that talks to the OpenRouter endpoint.
client = AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=getenv("OPENROUTER_API_KEY"),
)

# Disk-backed memoisation decorator (stored under .cache).
cache = Memory(location=".cache", verbose=0).cache
# Per-provider request budgets: at most max_rate acquisitions per time_period
# seconds.
openrouter_rate_limit = AsyncLimiter(max_rate=20, time_period=1)
elevenlabs_rate_limit = AsyncLimiter(max_rate=2, time_period=1)
huggingface_rate_limit = AsyncLimiter(max_rate=5, time_period=1)
|
43 |
+
|
44 |
+
|
45 |
+
@cache
async def complete(**kwargs):
    """Send a chat-completion request through the rate-limited OpenRouter client.

    All keyword arguments are forwarded unchanged. Raises ``Exception`` carrying
    the response when it contains no choices; otherwise returns the response.
    """
    async with openrouter_rate_limit:
        reply = await client.chat.completions.create(**kwargs)
    if reply.choices:
        return reply
    raise Exception(reply)
|
52 |
+
|
53 |
+
|
54 |
+
@cache
async def transcribe_elevenlabs(path, model):
    """Transcribe the audio file at ``path`` with an ElevenLabs model.

    ``model`` may carry a provider prefix; only the part after the last "/" is
    sent as the model id. Returns the transcript text.
    """
    model_id = model.split("/")[-1]
    elevenlabs = AsyncElevenLabs(api_key=getenv("ELEVENLABS_API_KEY"))
    async with elevenlabs_rate_limit:
        with open(path, "rb") as audio:
            result = await elevenlabs.speech_to_text.convert(
                model_id=model_id, file=audio
            )
    return result.text
|
64 |
+
|
65 |
+
|
66 |
+
@cache
async def transcribe_huggingface(path, model):
    """Transcribe the audio file at ``path`` through the Hugging Face inference API.

    ``model`` is passed on unchanged. Returns the transcript text.
    """
    hf_client = AsyncInferenceClient(api_key=getenv("HUGGINGFACE_ACCESS_TOKEN"))
    async with huggingface_rate_limit:
        result = await hf_client.automatic_speech_recognition(model=model, audio=path)
    return result.text
|
72 |
+
|
73 |
+
|
74 |
+
async def transcribe(path, model="elevenlabs/scribe_v1"):
    """Transcribe the audio file at ``path`` with the backend that serves ``model``.

    ``model`` must have the form "<provider>/<name>". "elevenlabs" models go to
    ElevenLabs, "openai" and "facebook" models to the Hugging Face inference
    API. Raises ValueError for any other provider.
    """
    provider, modelname = model.split("/")
    if provider == "elevenlabs":
        return await transcribe_elevenlabs(path, modelname)
    if provider in ("openai", "facebook"):
        return await transcribe_huggingface(path, model)
    raise ValueError(f"Model {model} not supported")
|
evals.py → evals/tasks.py
RENAMED
@@ -1,249 +1,23 @@
|
|
1 |
-
import asyncio
|
2 |
-
import json
|
3 |
-
import os
|
4 |
import random
|
5 |
-
import re
|
6 |
-
import tarfile
|
7 |
-
from datetime import date
|
8 |
-
from os import getenv
|
9 |
-
from pathlib import Path
|
10 |
|
11 |
import evaluate
|
12 |
-
import numpy as np
|
13 |
import pandas as pd
|
14 |
-
import requests
|
15 |
-
from aiolimiter import AsyncLimiter
|
16 |
-
from dotenv import load_dotenv
|
17 |
-
from elevenlabs import AsyncElevenLabs
|
18 |
-
from huggingface_hub import AsyncInferenceClient
|
19 |
from joblib.memory import Memory
|
20 |
-
from langcodes import Language, standardize_tag
|
21 |
-
from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
|
22 |
-
from openai import AsyncOpenAI
|
23 |
-
from requests import get
|
24 |
-
from rich import print
|
25 |
-
from tqdm.asyncio import tqdm_asyncio
|
26 |
from transformers import NllbTokenizer
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
# for development purposes, all languages will be evaluated on the fast models
|
31 |
-
# and only a sample of languages will be evaluated on all models
|
32 |
-
models = [
|
33 |
-
"openai/gpt-4o-mini", # 0.6$/M tokens
|
34 |
-
# "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
|
35 |
-
"meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
|
36 |
-
"mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
|
37 |
-
"google/gemini-2.0-flash-001", # 0.4$/M tokens
|
38 |
-
# "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
|
39 |
-
# "deepseek/deepseek-chat", # 0.9$/M tokens
|
40 |
-
# "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
|
41 |
-
]
|
42 |
-
model_fast = "meta-llama/llama-3.3-70b-instruct"
|
43 |
-
n_languages = 50
|
44 |
-
n_detailed_languages = 10
|
45 |
-
n_sentences = 30
|
46 |
-
|
47 |
-
transcription_models = [
|
48 |
-
"elevenlabs/scribe_v1",
|
49 |
-
"openai/whisper-large-v3",
|
50 |
-
# "openai/whisper-small",
|
51 |
-
# "facebook/seamless-m4t-v2-large",
|
52 |
-
]
|
53 |
-
transcription_model_fast = "elevenlabs/scribe_v1"
|
54 |
-
transcription_n_languages = 10
|
55 |
-
transcription_n_detailed_languages = 5
|
56 |
-
|
57 |
-
# ===== setup =====
|
58 |
-
|
59 |
-
load_dotenv()
|
60 |
-
client = AsyncOpenAI(
|
61 |
-
base_url="https://openrouter.ai/api/v1",
|
62 |
-
api_key=getenv("OPENROUTER_API_KEY"),
|
63 |
-
)
|
64 |
cache = Memory(location=".cache", verbose=0).cache
|
65 |
bleu = evaluate.load("bleu")
|
66 |
chrf = evaluate.load("chrf")
|
67 |
wer = evaluate.load("wer")
|
68 |
tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
|
69 |
-
openrouter_rate_limit = AsyncLimiter(max_rate=20, time_period=1)
|
70 |
-
elevenlabs_rate_limit = AsyncLimiter(max_rate=2, time_period=1)
|
71 |
-
huggingface_rate_limit = AsyncLimiter(max_rate=5, time_period=1)
|
72 |
-
|
73 |
-
# ===== load metadata =====
|
74 |
-
|
75 |
-
# load general language data
|
76 |
-
languages = {
|
77 |
-
lang: pop
|
78 |
-
for lang, pop in LANGUAGE_SPEAKING_POPULATION.items()
|
79 |
-
if not re.match(r".*-[A-Z]{2}$", lang)
|
80 |
-
}
|
81 |
-
languages = pd.DataFrame(list(languages.items()), columns=["bcp_47", "speakers"])
|
82 |
-
languages["language_name"] = languages["bcp_47"].apply(
|
83 |
-
lambda x: Language.get(x).display_name()
|
84 |
-
)
|
85 |
-
|
86 |
-
# load script codes and names
|
87 |
-
scripts = pd.read_csv("data/ScriptCodes.csv").rename(
|
88 |
-
columns={"Code": "iso15924", "English Name": "script_name"}
|
89 |
-
)
|
90 |
-
|
91 |
-
|
92 |
-
def population(bcp_47):
|
93 |
-
items = {
|
94 |
-
re.sub(r"^[a-z]+-", "", lang): pop
|
95 |
-
for lang, pop in LANGUAGE_SPEAKING_POPULATION.items()
|
96 |
-
if re.match(rf"^{bcp_47}-[A-Z]{{2}}$", lang)
|
97 |
-
}
|
98 |
-
return items
|
99 |
-
|
100 |
-
|
101 |
-
glottolog = pd.read_csv(
|
102 |
-
"data/glottolog_languoid.csv/languoid.csv", na_values=[""], keep_default_na=False
|
103 |
-
) # Min _Nan_ Chinese is not N/A!
|
104 |
-
glottolog["bcp_47"] = glottolog["iso639P3code"].apply(
|
105 |
-
lambda x: standardize_tag(x, macro=True) if not pd.isna(x) else None
|
106 |
-
)
|
107 |
-
|
108 |
-
|
109 |
-
@cache
|
110 |
-
def language_family(bcp_47):
|
111 |
-
languoid = glottolog[glottolog["bcp_47"] == bcp_47].iloc[0]
|
112 |
-
if pd.isna(languoid["family_id"]):
|
113 |
-
return None
|
114 |
-
family = glottolog[glottolog["id"] == languoid["family_id"]].iloc[0]
|
115 |
-
return family["name"]
|
116 |
-
|
117 |
-
|
118 |
-
def script_name(iso15924):
|
119 |
-
return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
|
120 |
-
|
121 |
-
|
122 |
-
def aggregate_flores_paths(flores_paths):
|
123 |
-
# takes a list of paths from the same language but different scripts
|
124 |
-
# returns the one with the largest writing population
|
125 |
-
if len(flores_paths) == 1:
|
126 |
-
return flores_paths.values[0]
|
127 |
-
populations = [
|
128 |
-
Language.get(standardize_tag(x, macro=True)).writing_population()
|
129 |
-
for x in flores_paths.values
|
130 |
-
]
|
131 |
-
return flores_paths.values[populations.index(max(populations))]
|
132 |
-
|
133 |
-
|
134 |
-
# load benchmark languages and scripts
|
135 |
-
benchmark_dir = "data/floresp-v2.0-rc.3/dev"
|
136 |
-
benchmark_languages = pd.DataFrame(
|
137 |
-
[f.split(".")[1] for f in os.listdir(benchmark_dir)],
|
138 |
-
columns=["flores_path"],
|
139 |
-
)
|
140 |
-
benchmark_languages["bcp_47"] = benchmark_languages["flores_path"].apply(
|
141 |
-
lambda x: standardize_tag(x, macro=True),
|
142 |
-
)
|
143 |
-
# ignore script (language is language)
|
144 |
-
benchmark_languages["bcp_47"] = benchmark_languages["bcp_47"].apply(
|
145 |
-
lambda x: re.sub(r"-[A-Z][a-z]+$", "", x)
|
146 |
-
)
|
147 |
-
benchmark_languages = (
|
148 |
-
benchmark_languages.groupby("bcp_47")
|
149 |
-
.agg({"flores_path": aggregate_flores_paths})
|
150 |
-
.reset_index()
|
151 |
-
)
|
152 |
-
|
153 |
-
fleurs_tags = "af_za,am_et,ar_eg,as_in,ast_es,az_az,be_by,bg_bg,bn_in,bs_ba,ca_es,ceb_ph,ckb_iq,cmn_hans_cn,cs_cz,cy_gb,da_dk,de_de,el_gr,en_us,es_419,et_ee,fa_ir,ff_sn,fi_fi,fil_ph,fr_fr,ga_ie,gl_es,gu_in,ha_ng,he_il,hi_in,hr_hr,hu_hu,hy_am,id_id,ig_ng,is_is,it_it,ja_jp,jv_id,ka_ge,kam_ke,kea_cv,kk_kz,km_kh,kn_in,ko_kr,ky_kg,lb_lu,lg_ug,ln_cd,lo_la,lt_lt,luo_ke,lv_lv,mi_nz,mk_mk,ml_in,mn_mn,mr_in,ms_my,mt_mt,my_mm,nb_no,ne_np,nl_nl,nso_za,ny_mw,oc_fr,om_et,or_in,pa_in,pl_pl,ps_af,pt_br,ro_ro,ru_ru,sd_in,sk_sk,sl_si,sn_zw,so_so,sr_rs,sv_se,sw_ke,ta_in,te_in,tg_tj,th_th,tr_tr,uk_ua,umb_ao,ur_pk,uz_uz,vi_vn,wo_sn,xh_za,yo_ng,yue_hant_hk,zu_za"
|
154 |
-
fleurs = pd.DataFrame(fleurs_tags.split(","), columns=["fleurs_tag"])
|
155 |
-
fleurs["bcp_47"] = fleurs["fleurs_tag"].apply(
|
156 |
-
lambda x: standardize_tag(x.rsplit("_")[0], macro=True)
|
157 |
-
)
|
158 |
-
|
159 |
-
|
160 |
-
# load CommonVoice stats
|
161 |
-
@cache # cache for 1 day
|
162 |
-
def get_commonvoice_stats(date: date):
|
163 |
-
return get("https://commonvoice.mozilla.org/api/v1/stats/languages").json()
|
164 |
-
|
165 |
-
|
166 |
-
commonvoice_stats = pd.DataFrame(get_commonvoice_stats(date.today())).rename(
|
167 |
-
columns={"locale": "commonvoice_locale", "validatedHours": "commonvoice_hours"}
|
168 |
-
)[["commonvoice_locale", "commonvoice_hours"]]
|
169 |
-
# ignore country (language is language) (in practive this is only relevant to zh-CN/zh-TW/zh-HK)
|
170 |
-
commonvoice_stats["bcp_47"] = commonvoice_stats["commonvoice_locale"].apply(
|
171 |
-
lambda x: re.sub(r"-[A-Z]{2}$", "", x)
|
172 |
-
)
|
173 |
-
commonvoice_stats["bcp_47"] = commonvoice_stats["bcp_47"].apply(
|
174 |
-
lambda x: standardize_tag(x, macro=True)
|
175 |
-
) # this does not really seem to get macrolanguages though, e.g. not for Quechua
|
176 |
-
commonvoice_stats = (
|
177 |
-
commonvoice_stats.groupby("bcp_47")
|
178 |
-
.agg({"commonvoice_hours": "sum", "commonvoice_locale": "first"})
|
179 |
-
.reset_index()
|
180 |
-
)
|
181 |
|
182 |
-
# merge data
|
183 |
-
languages = pd.merge(
|
184 |
-
languages, benchmark_languages, on="bcp_47", how="left"
|
185 |
-
) # "left" because keep it simple for now
|
186 |
-
languages = pd.merge(
|
187 |
-
languages, fleurs, on="bcp_47", how="left"
|
188 |
-
) # "left" because keep it simple for now
|
189 |
-
languages = pd.merge(
|
190 |
-
languages, commonvoice_stats, on="bcp_47", how="left"
|
191 |
-
) # "left" because keep it simple for now
|
192 |
-
languages["in_benchmark"] = languages["bcp_47"].isin(benchmark_languages["bcp_47"])
|
193 |
-
|
194 |
-
languages = languages.sort_values(by="speakers", ascending=False)
|
195 |
|
196 |
# sample languages to translate to
|
197 |
target_languages = languages[languages["in_benchmark"]].sample(
|
198 |
-
|
199 |
)
|
200 |
-
langs_eval = languages.iloc[:n_languages]
|
201 |
-
langs_eval_detailed = languages.iloc[:n_detailed_languages]
|
202 |
-
transcription_langs_eval = languages.iloc[:transcription_n_languages]
|
203 |
-
transcription_langs_eval_detailed = languages.iloc[:transcription_n_detailed_languages]
|
204 |
-
|
205 |
-
|
206 |
-
def download_file(url, path):
|
207 |
-
response = requests.get(url)
|
208 |
-
with open(path, "wb") as f:
|
209 |
-
f.write(response.content)
|
210 |
-
|
211 |
-
|
212 |
-
def download_fleurs():
|
213 |
-
# the huggingface loader does not allow loading only the dev set, so do it manually
|
214 |
-
for language in transcription_langs_eval.itertuples():
|
215 |
-
tar_url = f"https://huggingface.co/datasets/google/fleurs/resolve/main/data/{language.fleurs_tag}/audio/dev.tar.gz"
|
216 |
-
tar_path = Path(f"data/fleurs/{language.fleurs_tag}/audio/dev.tar.gz")
|
217 |
-
audio_path = Path(f"data/fleurs/{language.fleurs_tag}/audio")
|
218 |
-
if not audio_path.exists():
|
219 |
-
print(f"Downloading {tar_url} to {tar_path}")
|
220 |
-
tar_path.parent.mkdir(parents=True, exist_ok=True)
|
221 |
-
download_file(tar_url, tar_path)
|
222 |
-
with tarfile.open(tar_path, "r:gz") as tar:
|
223 |
-
tar.extractall(path=audio_path)
|
224 |
-
tsv_url = f"https://huggingface.co/datasets/google/fleurs/resolve/main/data/{language.fleurs_tag}/dev.tsv"
|
225 |
-
tsv_path = Path(f"data/fleurs/{language.fleurs_tag}/dev.tsv")
|
226 |
-
if not tsv_path.exists():
|
227 |
-
print(f"Downloading {tsv_url} to {tsv_path}")
|
228 |
-
tsv_path.parent.mkdir(parents=True, exist_ok=True)
|
229 |
-
download_file(tsv_url, tsv_path)
|
230 |
-
|
231 |
-
|
232 |
-
# ===== define tasks and metrics =====
|
233 |
-
|
234 |
-
|
235 |
-
@cache
|
236 |
-
async def complete(**kwargs):
|
237 |
-
async with openrouter_rate_limit:
|
238 |
-
response = await client.chat.completions.create(**kwargs)
|
239 |
-
if not response.choices:
|
240 |
-
raise Exception(response)
|
241 |
-
return response
|
242 |
-
|
243 |
-
|
244 |
-
def load_sentences(language):
|
245 |
-
return open(f"{benchmark_dir}/dev.{language.flores_path}").readlines()
|
246 |
-
|
247 |
|
248 |
@cache
|
249 |
async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
|
@@ -251,8 +25,8 @@ async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
|
|
251 |
0
|
252 |
]
|
253 |
target_language = target_languages.iloc[sentence_nr]
|
254 |
-
original_sentence =
|
255 |
-
target_sentence =
|
256 |
script = script_name(target_language.flores_path.split("_")[1])
|
257 |
reply = await complete(
|
258 |
model=model,
|
@@ -296,7 +70,7 @@ metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
|
|
296 |
@cache
|
297 |
async def classify_and_evaluate(model, language_bcp_47, nr):
|
298 |
language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
|
299 |
-
sentences = pd.DataFrame(
|
300 |
sentences = pd.concat([metadata, sentences], axis=1)
|
301 |
sentences = sentences.dropna(subset=["topic"])
|
302 |
sentences["topic"] = sentences["topic"].str.lower()
|
@@ -365,7 +139,7 @@ def corrupt_sentence(sentence):
|
|
365 |
@cache
|
366 |
async def mlm_and_evaluate(model, language_bcp_47, nr):
|
367 |
language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
|
368 |
-
sentences = pd.DataFrame(
|
369 |
sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
|
370 |
examples = sentences.sample(n=10, random_state=42)
|
371 |
test_sentences = sentences[~sentences["text"].isin(examples["text"])].sample(
|
@@ -403,38 +177,6 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
|
|
403 |
}
|
404 |
]
|
405 |
|
406 |
-
|
407 |
-
@cache
|
408 |
-
async def transcribe_elevenlabs(path, model):
|
409 |
-
modelname = model.split("/")[-1]
|
410 |
-
client = AsyncElevenLabs(api_key=getenv("ELEVENLABS_API_KEY"))
|
411 |
-
async with elevenlabs_rate_limit:
|
412 |
-
with open(path, "rb") as file:
|
413 |
-
response = await client.speech_to_text.convert(
|
414 |
-
model_id=modelname, file=file
|
415 |
-
)
|
416 |
-
return response.text
|
417 |
-
|
418 |
-
|
419 |
-
@cache
|
420 |
-
async def transcribe_huggingface(path, model):
|
421 |
-
client = AsyncInferenceClient(api_key=getenv("HUGGINGFACE_ACCESS_TOKEN"))
|
422 |
-
async with huggingface_rate_limit:
|
423 |
-
output = await client.automatic_speech_recognition(model=model, audio=path)
|
424 |
-
return output.text
|
425 |
-
|
426 |
-
|
427 |
-
async def transcribe(path, model="elevenlabs/scribe_v1"):
|
428 |
-
provider, modelname = model.split("/")
|
429 |
-
match provider:
|
430 |
-
case "elevenlabs":
|
431 |
-
return await transcribe_elevenlabs(path, modelname)
|
432 |
-
case "openai" | "facebook":
|
433 |
-
return await transcribe_huggingface(path, model)
|
434 |
-
case _:
|
435 |
-
raise ValueError(f"Model {model} not supported")
|
436 |
-
|
437 |
-
|
438 |
@cache
|
439 |
async def transcribe_and_evaluate(model, language_bcp_47, nr):
|
440 |
language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
|
@@ -472,83 +214,4 @@ tasks = [
|
|
472 |
classify_and_evaluate,
|
473 |
mlm_and_evaluate,
|
474 |
# transcribe_and_evaluate,
|
475 |
-
]
|
476 |
-
|
477 |
-
# ===== run evaluation and aggregate results =====
|
478 |
-
|
479 |
-
|
480 |
-
def mean(lst):
|
481 |
-
return sum(lst) / len(lst) if lst else None
|
482 |
-
|
483 |
-
|
484 |
-
def fmt_name(s):
|
485 |
-
return " ".join(w.capitalize() for w in s.split("-")).replace("Gpt", "GPT").replace("ai", "AI")
|
486 |
-
|
487 |
-
|
488 |
-
async def main():
|
489 |
-
print("running evaluations")
|
490 |
-
results = [
|
491 |
-
task(model, original_language.bcp_47, i)
|
492 |
-
for task in tasks
|
493 |
-
for i in range(n_sentences)
|
494 |
-
for original_language in langs_eval.itertuples()
|
495 |
-
for model in models
|
496 |
-
if original_language.in_benchmark
|
497 |
-
and (
|
498 |
-
model == model_fast
|
499 |
-
or original_language.bcp_47 in langs_eval_detailed.bcp_47.values
|
500 |
-
)
|
501 |
-
]
|
502 |
-
results = await tqdm_asyncio.gather(*results, miniters=1)
|
503 |
-
results = pd.DataFrame([r for rs in results for r in rs])
|
504 |
-
results = (
|
505 |
-
results.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()
|
506 |
-
)
|
507 |
-
lang_results = (
|
508 |
-
results.groupby(["bcp_47", "task", "metric"])
|
509 |
-
.agg({"score": "mean", "model": "nunique"})
|
510 |
-
.reset_index()
|
511 |
-
)
|
512 |
-
lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
|
513 |
-
model_results = (
|
514 |
-
results.groupby(["model", "task", "metric"])
|
515 |
-
.agg({"score": "mean", "bcp_47": "nunique"})
|
516 |
-
.reset_index()
|
517 |
-
)
|
518 |
-
task_results = (
|
519 |
-
results.groupby(["task", "metric"])
|
520 |
-
.agg({"score": "mean", "bcp_47": "nunique", "model": "nunique"})
|
521 |
-
.reset_index()
|
522 |
-
)
|
523 |
-
all_results = {
|
524 |
-
"tasks": task_results.replace({np.nan: None}).to_dict(orient="records"),
|
525 |
-
"models": model_results.replace({np.nan: None}).to_dict(orient="records"),
|
526 |
-
"languages": lang_results.replace({np.nan: None}).to_dict(orient="records"),
|
527 |
-
"scores": results.replace({np.nan: None}).to_dict(orient="records"),
|
528 |
-
}
|
529 |
-
with open("results.json", "w") as f:
|
530 |
-
json.dump(all_results, f, indent=2, ensure_ascii=False)
|
531 |
-
model_results["task_metric"] = model_results["task"] + "_" + model_results["metric"]
|
532 |
-
model_results = model_results.drop(columns=["task", "metric"])
|
533 |
-
model_table = model_results.pivot(
|
534 |
-
index="model", columns="task_metric", values="score"
|
535 |
-
).fillna(0)
|
536 |
-
model_table["average"] = model_table.mean(axis=1)
|
537 |
-
model_table = model_table.sort_values(by="average", ascending=False)
|
538 |
-
model_table = model_table.round(2).reset_index()
|
539 |
-
model_table["provider"] = model_table["model"].str.split("/").str[0].apply(fmt_name)
|
540 |
-
model_table["model"] = model_table["model"].str.split("/").str[1].apply(fmt_name)
|
541 |
-
model_table["rank"] = model_table.index + 1
|
542 |
-
model_table = model_table[
|
543 |
-
["rank", "provider", "model", "average", *model_table.columns[1:-3]]
|
544 |
-
]
|
545 |
-
all_tables = {
|
546 |
-
"model_table": model_table.to_dict(orient="records"),
|
547 |
-
}
|
548 |
-
with open("frontend/public/results.json", "w") as f:
|
549 |
-
json.dump(all_tables, f, indent=2, ensure_ascii=False)
|
550 |
-
|
551 |
-
|
552 |
-
if __name__ == "__main__":
|
553 |
-
download_fleurs()
|
554 |
-
asyncio.run(main())
|
|
|
|
|
|
|
|
|
1 |
import random
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
import evaluate
|
|
|
4 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
5 |
from joblib.memory import Memory
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
from transformers import NllbTokenizer
|
7 |
+
from languages import languages, script_name
|
8 |
+
from datasets_.flores import flores_sentences
|
9 |
+
from models import complete, transcribe
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
cache = Memory(location=".cache", verbose=0).cache
|
11 |
bleu = evaluate.load("bleu")
|
12 |
chrf = evaluate.load("chrf")
|
13 |
wer = evaluate.load("wer")
|
14 |
tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
# sample languages to translate to
|
18 |
target_languages = languages[languages["in_benchmark"]].sample(
|
19 |
+
frac=1, weights="speakers", replace=True, random_state=42
|
20 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
@cache
|
23 |
async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
|
|
|
25 |
0
|
26 |
]
|
27 |
target_language = target_languages.iloc[sentence_nr]
|
28 |
+
original_sentence = flores_sentences(original_language)[sentence_nr].strip()
|
29 |
+
target_sentence = flores_sentences(target_language)[sentence_nr].strip()
|
30 |
script = script_name(target_language.flores_path.split("_")[1])
|
31 |
reply = await complete(
|
32 |
model=model,
|
|
|
70 |
@cache
|
71 |
async def classify_and_evaluate(model, language_bcp_47, nr):
|
72 |
language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
|
73 |
+
sentences = pd.DataFrame(flores_sentences(language), columns=["text"])
|
74 |
sentences = pd.concat([metadata, sentences], axis=1)
|
75 |
sentences = sentences.dropna(subset=["topic"])
|
76 |
sentences["topic"] = sentences["topic"].str.lower()
|
|
|
139 |
@cache
|
140 |
async def mlm_and_evaluate(model, language_bcp_47, nr):
|
141 |
language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
|
142 |
+
sentences = pd.DataFrame(flores_sentences(language), columns=["text"])
|
143 |
sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
|
144 |
examples = sentences.sample(n=10, random_state=42)
|
145 |
test_sentences = sentences[~sentences["text"].isin(examples["text"])].sample(
|
|
|
177 |
}
|
178 |
]
|
179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
@cache
|
181 |
async def transcribe_and_evaluate(model, language_bcp_47, nr):
|
182 |
language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
|
|
|
214 |
classify_and_evaluate,
|
215 |
mlm_and_evaluate,
|
216 |
# transcribe_and_evaluate,
|
217 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
frontend/public/results.json
CHANGED
@@ -4,41 +4,51 @@
|
|
4 |
"rank": 1,
|
5 |
"provider": "Google",
|
6 |
"model": "Gemini 2.0 Flash 001",
|
7 |
-
"average": 0.
|
8 |
"classification_accuracy": 0.87,
|
9 |
"language_modeling_chrf": 0.96,
|
10 |
-
"translation_bleu": 0.
|
11 |
-
"translation_chrf": 0.
|
12 |
},
|
13 |
{
|
14 |
"rank": 2,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
"provider": "OpenAI",
|
16 |
"model": "GPT 4o Mini",
|
17 |
-
"average": 0.
|
18 |
-
"classification_accuracy": 0.
|
19 |
"language_modeling_chrf": 0.95,
|
20 |
-
"translation_bleu": 0.
|
21 |
-
"translation_chrf": 0.
|
22 |
},
|
23 |
{
|
24 |
-
"rank":
|
25 |
"provider": "MistralAI",
|
26 |
"model": "Mistral Small 24b Instruct 2501",
|
27 |
-
"average": 0.
|
28 |
-
"classification_accuracy": 0.
|
29 |
-
"language_modeling_chrf": 0.
|
30 |
-
"translation_bleu": 0.
|
31 |
-
"translation_chrf": 0.
|
32 |
},
|
33 |
{
|
34 |
-
"rank":
|
35 |
"provider": "Meta Llama",
|
36 |
"model": "Llama 3.3 70b Instruct",
|
37 |
-
"average": 0.
|
38 |
-
"classification_accuracy": 0.
|
39 |
"language_modeling_chrf": 0.94,
|
40 |
-
"translation_bleu": 0.
|
41 |
-
"translation_chrf": 0.
|
42 |
}
|
43 |
]
|
44 |
}
|
|
|
4 |
"rank": 1,
|
5 |
"provider": "Google",
|
6 |
"model": "Gemini 2.0 Flash 001",
|
7 |
+
"average": 0.72,
|
8 |
"classification_accuracy": 0.87,
|
9 |
"language_modeling_chrf": 0.96,
|
10 |
+
"translation_bleu": 0.45,
|
11 |
+
"translation_chrf": 0.58
|
12 |
},
|
13 |
{
|
14 |
"rank": 2,
|
15 |
+
"provider": "Google",
|
16 |
+
"model": "Gemma 3 27b It",
|
17 |
+
"average": 0.65,
|
18 |
+
"classification_accuracy": 0.72,
|
19 |
+
"language_modeling_chrf": 0.96,
|
20 |
+
"translation_bleu": 0.37,
|
21 |
+
"translation_chrf": 0.54
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"rank": 3,
|
25 |
"provider": "OpenAI",
|
26 |
"model": "GPT 4o Mini",
|
27 |
+
"average": 0.6,
|
28 |
+
"classification_accuracy": 0.52,
|
29 |
"language_modeling_chrf": 0.95,
|
30 |
+
"translation_bleu": 0.39,
|
31 |
+
"translation_chrf": 0.55
|
32 |
},
|
33 |
{
|
34 |
+
"rank": 4,
|
35 |
"provider": "MistralAI",
|
36 |
"model": "Mistral Small 24b Instruct 2501",
|
37 |
+
"average": 0.58,
|
38 |
+
"classification_accuracy": 0.55,
|
39 |
+
"language_modeling_chrf": 0.86,
|
40 |
+
"translation_bleu": 0.38,
|
41 |
+
"translation_chrf": 0.52
|
42 |
},
|
43 |
{
|
44 |
+
"rank": 5,
|
45 |
"provider": "Meta Llama",
|
46 |
"model": "Llama 3.3 70b Instruct",
|
47 |
+
"average": 0.56,
|
48 |
+
"classification_accuracy": 0.5,
|
49 |
"language_modeling_chrf": 0.94,
|
50 |
+
"translation_bleu": 0.31,
|
51 |
+
"translation_chrf": 0.48
|
52 |
}
|
53 |
]
|
54 |
}
|
results.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|