David Pomerenke committed on
Commit da6e1bc · 1 Parent(s): 4718a02

Refactor eval code into files

evals/__init__.py ADDED
@@ -0,0 +1 @@
+
app.py → evals/app.py RENAMED
File without changes
evals/datasets_/commonvoice.py ADDED
@@ -0,0 +1,32 @@
+ import re
+ from datetime import date
+
+ import pandas as pd
+ from joblib.memory import Memory
+ from langcodes import standardize_tag
+ from requests import get
+
+ cache = Memory(location=".cache", verbose=0).cache
+
+
+ # load CommonVoice stats
+ @cache # cache for 1 day
+ def get_commonvoice_stats(date: date):
+     return get("https://commonvoice.mozilla.org/api/v1/stats/languages").json()
+
+
+ commonvoice = pd.DataFrame(get_commonvoice_stats(date.today())).rename(
+     columns={"locale": "commonvoice_locale", "validatedHours": "commonvoice_hours"}
+ )[["commonvoice_locale", "commonvoice_hours"]]
+ # ignore country (language is language) (in practice this is only relevant to zh-CN/zh-TW/zh-HK)
+ commonvoice["bcp_47"] = commonvoice["commonvoice_locale"].apply(
+     lambda x: re.sub(r"-[A-Z]{2}$", "", x)
+ )
+ commonvoice["bcp_47"] = commonvoice["bcp_47"].apply(
+     lambda x: standardize_tag(x, macro=True)
+ ) # this does not really seem to get macrolanguages though, e.g. not for Quechua
+ commonvoice = (
+     commonvoice.groupby("bcp_47")
+     .agg({"commonvoice_hours": "sum", "commonvoice_locale": "first"})
+     .reset_index()
+ )
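Note on the "@cache # cache for 1 day" pattern above: joblib's Memory.cache keys results on the function's arguments, so passing date.today() as an argument makes the cached CommonVoice response expire naturally once the date changes. A minimal, self-contained sketch of the same idea (the function and payload here are illustrative, not part of this repo):

import time
from datetime import date
from joblib.memory import Memory

cache = Memory(location=".cache", verbose=0).cache

@cache
def fetch_stats(day: date):
    # stands in for the HTTP call; only re-runs when `day` changes
    return {"fetched_on": str(day), "at": time.time()}

a = fetch_stats(date.today())  # computed and written to .cache
b = fetch_stats(date.today())  # served from .cache, identical payload
assert a == b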
evals/datasets_/fleurs.py ADDED
@@ -0,0 +1,37 @@
+ import pandas as pd
+ from langcodes import standardize_tag
+ from pathlib import Path
+ import tarfile
+ import requests
+
+ fleurs_tags = "af_za,am_et,ar_eg,as_in,ast_es,az_az,be_by,bg_bg,bn_in,bs_ba,ca_es,ceb_ph,ckb_iq,cmn_hans_cn,cs_cz,cy_gb,da_dk,de_de,el_gr,en_us,es_419,et_ee,fa_ir,ff_sn,fi_fi,fil_ph,fr_fr,ga_ie,gl_es,gu_in,ha_ng,he_il,hi_in,hr_hr,hu_hu,hy_am,id_id,ig_ng,is_is,it_it,ja_jp,jv_id,ka_ge,kam_ke,kea_cv,kk_kz,km_kh,kn_in,ko_kr,ky_kg,lb_lu,lg_ug,ln_cd,lo_la,lt_lt,luo_ke,lv_lv,mi_nz,mk_mk,ml_in,mn_mn,mr_in,ms_my,mt_mt,my_mm,nb_no,ne_np,nl_nl,nso_za,ny_mw,oc_fr,om_et,or_in,pa_in,pl_pl,ps_af,pt_br,ro_ro,ru_ru,sd_in,sk_sk,sl_si,sn_zw,so_so,sr_rs,sv_se,sw_ke,ta_in,te_in,tg_tj,th_th,tr_tr,uk_ua,umb_ao,ur_pk,uz_uz,vi_vn,wo_sn,xh_za,yo_ng,yue_hant_hk,zu_za"
+
+ fleurs = pd.DataFrame(fleurs_tags.split(","), columns=["fleurs_tag"])
+ fleurs["bcp_47"] = fleurs["fleurs_tag"].apply(
+     lambda x: standardize_tag(x.rsplit("_")[0], macro=True)
+ )
+
+ def download_file(url, path):
+     response = requests.get(url)
+     with open(path, "wb") as f:
+         f.write(response.content)
+
+
+ def download_fleurs(transcription_langs_eval):
+     # the huggingface loader does not allow loading only the dev set, so do it manually
+     for language in transcription_langs_eval.itertuples():
+         tar_url = f"https://huggingface.co/datasets/google/fleurs/resolve/main/data/{language.fleurs_tag}/audio/dev.tar.gz"
+         tar_path = Path(f"data/fleurs/{language.fleurs_tag}/audio/dev.tar.gz")
+         audio_path = Path(f"data/fleurs/{language.fleurs_tag}/audio")
+         if not audio_path.exists():
+             print(f"Downloading {tar_url} to {tar_path}")
+             tar_path.parent.mkdir(parents=True, exist_ok=True)
+             download_file(tar_url, tar_path)
+             with tarfile.open(tar_path, "r:gz") as tar:
+                 tar.extractall(path=audio_path)
+         tsv_url = f"https://huggingface.co/datasets/google/fleurs/resolve/main/data/{language.fleurs_tag}/dev.tsv"
+         tsv_path = Path(f"data/fleurs/{language.fleurs_tag}/dev.tsv")
+         if not tsv_path.exists():
+             print(f"Downloading {tsv_url} to {tsv_path}")
+             tsv_path.parent.mkdir(parents=True, exist_ok=True)
+             download_file(tsv_url, tsv_path)
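For reference, download_fleurs only needs a DataFrame with a fleurs_tag column (as built above) and mirrors the FLEURS dev split into data/fleurs/<tag>/. A hedged usage sketch, assuming this commit's module layout and a working directory at the repo root (it downloads real data):

from datasets_.fleurs import fleurs, download_fleurs

# fetch dev audio and transcripts for the first two FLEURS languages only
download_fleurs(fleurs.head(2))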
evals/datasets_/flores.py ADDED
@@ -0,0 +1,40 @@
+ from langcodes import Language, standardize_tag
+ import pandas as pd
+ import os
+ import re
+
+ flores_dir = "data/floresp-v2.0-rc.3/dev"
+
+ def flores_sentences(language):
+     return open(f"{flores_dir}/dev.{language.flores_path}").readlines()
+
+ def aggregate_flores_paths(flores_paths):
+     # takes a list of paths from the same language but different scripts
+     # returns the one with the largest writing population
+     if len(flores_paths) == 1:
+         return flores_paths.values[0]
+     populations = [
+         Language.get(standardize_tag(x, macro=True)).writing_population()
+         for x in flores_paths.values
+     ]
+     return flores_paths.values[populations.index(max(populations))]
+
+
+
+ flores = pd.DataFrame(
+     [f.split(".")[1] for f in os.listdir(flores_dir)],
+     columns=["flores_path"],
+ )
+ flores["bcp_47"] = flores["flores_path"].apply(
+     lambda x: standardize_tag(x, macro=True),
+ )
+ # ignore script (language is language)
+ flores["bcp_47"] = flores["bcp_47"].apply(
+     lambda x: re.sub(r"-[A-Z][a-z]+$", "", x)
+ )
+ flores = (
+     flores.groupby("bcp_47")
+     .agg({"flores_path": aggregate_flores_paths})
+     .reset_index()
+ )
+
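aggregate_flores_paths is used as the groupby aggregator above: when one BCP-47 tag maps to several FLORES+ files (same language, different scripts), it keeps the file whose script has the larger writing population according to langcodes. A small illustrative check (the Serbian tags are just an example):

import pandas as pd
from datasets_.flores import aggregate_flores_paths

paths = pd.Series(["srp_Cyrl", "srp_Latn"])
# prints whichever of the two scripts langcodes reports as more widely written
print(aggregate_flores_paths(paths))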
evals/languages.py ADDED
@@ -0,0 +1,66 @@
+ import re
+
+ import pandas as pd
+ from datasets_.commonvoice import commonvoice
+ from datasets_.fleurs import fleurs
+ from datasets_.flores import flores
+ from joblib.memory import Memory
+ from langcodes import Language, standardize_tag
+ from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
+
+ cache = Memory(location=".cache", verbose=0).cache
+
+ # load general language data
+ languages = {
+     lang: pop
+     for lang, pop in LANGUAGE_SPEAKING_POPULATION.items()
+     if not re.match(r".*-[A-Z]{2}$", lang)
+ }
+ languages = pd.DataFrame(list(languages.items()), columns=["bcp_47", "speakers"])
+ languages["language_name"] = languages["bcp_47"].apply(
+     lambda x: Language.get(x).display_name()
+ )
+
+ # load script codes and names
+ scripts = pd.read_csv("data/ScriptCodes.csv").rename(
+     columns={"Code": "iso15924", "English Name": "script_name"}
+ )
+
+
+ def population(bcp_47):
+     items = {
+         re.sub(r"^[a-z]+-", "", lang): pop
+         for lang, pop in LANGUAGE_SPEAKING_POPULATION.items()
+         if re.match(rf"^{bcp_47}-[A-Z]{{2}}$", lang)
+     }
+     return items
+
+
+ glottolog = pd.read_csv(
+     "data/glottolog_languoid.csv/languoid.csv", na_values=[""], keep_default_na=False
+ ) # Min _Nan_ Chinese is not N/A!
+ glottolog["bcp_47"] = glottolog["iso639P3code"].apply(
+     lambda x: standardize_tag(x, macro=True) if not pd.isna(x) else None
+ )
+
+
+ @cache
+ def language_family(bcp_47):
+     languoid = glottolog[glottolog["bcp_47"] == bcp_47].iloc[0]
+     if pd.isna(languoid["family_id"]):
+         return None
+     family = glottolog[glottolog["id"] == languoid["family_id"]].iloc[0]
+     return family["name"]
+
+
+ def script_name(iso15924):
+     return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
+
+
+ # merge data
+ # always "left" because keep it simple for now
+ languages = pd.merge(languages, flores, on="bcp_47", how="left")
+ languages = pd.merge(languages, fleurs, on="bcp_47", how="left")
+ languages = pd.merge(languages, commonvoice, on="bcp_47", how="left")
+ languages["in_benchmark"] = languages["bcp_47"].isin(flores["bcp_47"])
+ languages = languages.sort_values(by="speakers", ascending=False)
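After these merges, languages holds one row per BCP-47 language, sorted by speaker count, with NaN wherever FLORES+, FLEURS, or CommonVoice has no coverage. A quick way to inspect it (assumes the data/ files used above are present):

from languages import languages

print(
    languages[
        ["bcp_47", "language_name", "speakers", "in_benchmark",
         "flores_path", "fleurs_tag", "commonvoice_hours"]
    ].head(10)
)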
evals/main.py ADDED
@@ -0,0 +1,110 @@
+ import asyncio
+ import json
+
+ import numpy as np
+ import pandas as pd
+ from rich import print
+ from tqdm.asyncio import tqdm_asyncio
+ from languages import languages
+ from tasks import tasks
+ from models import models, model_fast
+
+ # ===== config =====
+
+ n_sentences = 30
+ langs_eval = languages.iloc[:10]
+ langs_eval_detailed = languages.iloc[:2]
+ transcription_langs_eval = languages.iloc[:10]
+ transcription_langs_eval_detailed = languages.iloc[:5]
+
+ # ===== run evaluation and aggregate results =====
+
+ async def evaluate():
+     print("running evaluations")
+     results = [
+         task(model, original_language.bcp_47, i)
+         for task in tasks
+         for i in range(n_sentences)
+         for original_language in langs_eval.itertuples()
+         for model in models
+         if original_language.in_benchmark
+         and (
+             model == model_fast
+             or original_language.bcp_47 in langs_eval_detailed.bcp_47.values
+         )
+     ]
+     return await tqdm_asyncio.gather(*results, miniters=1)
+
+ def aggregate(results):
+     results = pd.DataFrame([r for rs in results for r in rs])
+     results = (
+         results.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()
+     )
+     lang_results = (
+         results.groupby(["bcp_47", "task", "metric"])
+         .agg({"score": "mean", "model": "nunique"})
+         .reset_index()
+     )
+     lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
+     model_results = (
+         results.groupby(["model", "task", "metric"])
+         .agg({"score": "mean", "bcp_47": "nunique"})
+         .reset_index()
+     )
+     task_results = (
+         results.groupby(["task", "metric"])
+         .agg({"score": "mean", "bcp_47": "nunique", "model": "nunique"})
+         .reset_index()
+     )
+     return results, lang_results, model_results, task_results
+
+ def mean(lst):
+     return sum(lst) / len(lst) if lst else None
+
+
+ def fmt_name(s):
+     return " ".join(w.capitalize() for w in s.split("-")).replace("Gpt", "GPT").replace("ai", "AI")
+
+ def serialize(df):
+     return df.replace({np.nan: None}).to_dict(orient="records")
+
+ def make_model_table(model_results):
+     model_results["task_metric"] = model_results["task"] + "_" + model_results["metric"]
+     model_results = model_results.drop(columns=["task", "metric"])
+     model_table = model_results.pivot(
+         index="model", columns="task_metric", values="score"
+     ).fillna(0)
+     model_table["average"] = model_table.mean(axis=1)
+     model_table = model_table.sort_values(by="average", ascending=False)
+     model_table = model_table.round(2).reset_index()
+     model_table["provider"] = model_table["model"].str.split("/").str[0].apply(fmt_name)
+     model_table["model"] = model_table["model"].str.split("/").str[1].apply(fmt_name)
+     model_table["rank"] = model_table.index + 1
+     model_table = model_table[
+         ["rank", "provider", "model", "average", *model_table.columns[1:-3]]
+     ]
+     return model_table
+
+
+ async def main():
+     results = await evaluate()
+     results, lang_results, model_results, task_results = aggregate(results)
+     all_results = {
+         "tasks": serialize(task_results),
+         "models": serialize(model_results),
+         "languages": serialize(lang_results),
+         "scores": serialize(results),
+     }
+     with open("results.json", "w") as f:
+         json.dump(all_results, f, indent=2, ensure_ascii=False)
+
+     model_table = make_model_table(model_results)
+     all_tables = {
+         "model_table": serialize(model_table),
+     }
+     with open("frontend/public/results.json", "w") as f:
+         json.dump(all_tables, f, indent=2, ensure_ascii=False)
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
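make_model_table pivots the long per-(model, task, metric) scores into one row per model, adds the row mean as "average", and ranks by it. A toy sketch of the expected input and output shape (the scores are made up; note that importing main also pulls in languages, tasks, and models, so the repo's data files and API keys must be in place):

import pandas as pd
from main import make_model_table

toy = pd.DataFrame({
    "model": ["openai/gpt-4o-mini"] * 2 + ["google/gemini-2.0-flash-001"] * 2,
    "task": ["translation", "classification"] * 2,
    "metric": ["chrf", "accuracy"] * 2,
    "score": [0.50, 0.60, 0.55, 0.70],
    "bcp_47": [3, 3, 3, 3],
})
# one row per model: rank, provider, model, average, classification_accuracy, translation_chrf
print(make_model_table(toy))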
evals/models.py ADDED
@@ -0,0 +1,82 @@
+ from os import getenv
+
+ from aiolimiter import AsyncLimiter
+ from dotenv import load_dotenv
+ from elevenlabs import AsyncElevenLabs
+ from huggingface_hub import AsyncInferenceClient
+ from joblib.memory import Memory
+ from openai import AsyncOpenAI
+
+ # for development purposes, all languages will be evaluated on the fast models
+ # and only a sample of languages will be evaluated on all models
+ models = [
+     "openai/gpt-4o-mini", # 0.6$/M tokens
+     # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
+     "meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
+     "mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
+     "google/gemini-2.0-flash-001", # 0.4$/M tokens
+     # "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
+     # "deepseek/deepseek-chat", # 0.9$/M tokens
+     # "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
+     "google/gemma-3-27b-it", # 0.2$/M tokens
+ ]
+ model_fast = "meta-llama/llama-3.3-70b-instruct"
+
+ transcription_models = [
+     "elevenlabs/scribe_v1",
+     "openai/whisper-large-v3",
+     # "openai/whisper-small",
+     # "facebook/seamless-m4t-v2-large",
+ ]
+ transcription_model_fast = "elevenlabs/scribe_v1"
+
+ load_dotenv()
+ client = AsyncOpenAI(
+     base_url="https://openrouter.ai/api/v1",
+     api_key=getenv("OPENROUTER_API_KEY"),
+ )
+
+ cache = Memory(location=".cache", verbose=0).cache
+ openrouter_rate_limit = AsyncLimiter(max_rate=20, time_period=1)
+ elevenlabs_rate_limit = AsyncLimiter(max_rate=2, time_period=1)
+ huggingface_rate_limit = AsyncLimiter(max_rate=5, time_period=1)
+
+
+ @cache
+ async def complete(**kwargs):
+     async with openrouter_rate_limit:
+         response = await client.chat.completions.create(**kwargs)
+     if not response.choices:
+         raise Exception(response)
+     return response
+
+
+ @cache
+ async def transcribe_elevenlabs(path, model):
+     modelname = model.split("/")[-1]
+     client = AsyncElevenLabs(api_key=getenv("ELEVENLABS_API_KEY"))
+     async with elevenlabs_rate_limit:
+         with open(path, "rb") as file:
+             response = await client.speech_to_text.convert(
+                 model_id=modelname, file=file
+             )
+     return response.text
+
+
+ @cache
+ async def transcribe_huggingface(path, model):
+     client = AsyncInferenceClient(api_key=getenv("HUGGINGFACE_ACCESS_TOKEN"))
+     async with huggingface_rate_limit:
+         output = await client.automatic_speech_recognition(model=model, audio=path)
+     return output.text
+
+
+ async def transcribe(path, model="elevenlabs/scribe_v1"):
+     provider, modelname = model.split("/")
+     match provider:
+         case "elevenlabs":
+             return await transcribe_elevenlabs(path, modelname)
+         case "openai" | "facebook":
+             return await transcribe_huggingface(path, model)
+         case _:
+             raise ValueError(f"Model {model} not supported")
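Both entry points here are async, rate-limited, and disk-cached: complete wraps OpenRouter's OpenAI-compatible chat API, and transcribe dispatches on the provider prefix of the model id. A hedged usage sketch (requires OPENROUTER_API_KEY in .env; the prompt is purely illustrative):

import asyncio
from models import complete, model_fast

async def demo():
    reply = await complete(
        model=model_fast,
        messages=[{"role": "user", "content": "Say hello in Swahili."}],
        temperature=0,
        max_tokens=30,
    )
    # response object follows the OpenAI chat completions schema
    print(reply.choices[0].message.content)

asyncio.run(demo())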
evals.py → evals/tasks.py RENAMED
@@ -1,249 +1,23 @@
- import asyncio
- import json
- import os
  import random
- import re
- import tarfile
- from datetime import date
- from os import getenv
- from pathlib import Path

  import evaluate
- import numpy as np
  import pandas as pd
- import requests
- from aiolimiter import AsyncLimiter
- from dotenv import load_dotenv
- from elevenlabs import AsyncElevenLabs
- from huggingface_hub import AsyncInferenceClient
  from joblib.memory import Memory
- from langcodes import Language, standardize_tag
- from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
- from openai import AsyncOpenAI
- from requests import get
- from rich import print
- from tqdm.asyncio import tqdm_asyncio
  from transformers import NllbTokenizer
+ from languages import languages, script_name
+ from datasets_.flores import flores_sentences
+ from models import complete, transcribe
-
- # ===== config =====
-
- # for development purposes, all languages will be evaluated on the fast models
- # and only a sample of languages will be evaluated on all models
- models = [
-     "openai/gpt-4o-mini", # 0.6$/M tokens
-     # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
-     "meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
-     "mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
-     "google/gemini-2.0-flash-001", # 0.4$/M tokens
-     # "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
-     # "deepseek/deepseek-chat", # 0.9$/M tokens
-     # "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
- ]
- model_fast = "meta-llama/llama-3.3-70b-instruct"
- n_languages = 50
- n_detailed_languages = 10
- n_sentences = 30
-
- transcription_models = [
-     "elevenlabs/scribe_v1",
-     "openai/whisper-large-v3",
-     # "openai/whisper-small",
-     # "facebook/seamless-m4t-v2-large",
- ]
- transcription_model_fast = "elevenlabs/scribe_v1"
- transcription_n_languages = 10
- transcription_n_detailed_languages = 5
-
- # ===== setup =====
-
- load_dotenv()
- client = AsyncOpenAI(
-     base_url="https://openrouter.ai/api/v1",
-     api_key=getenv("OPENROUTER_API_KEY"),
- )
  cache = Memory(location=".cache", verbose=0).cache
  bleu = evaluate.load("bleu")
  chrf = evaluate.load("chrf")
  wer = evaluate.load("wer")
  tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
- openrouter_rate_limit = AsyncLimiter(max_rate=20, time_period=1)
- elevenlabs_rate_limit = AsyncLimiter(max_rate=2, time_period=1)
- huggingface_rate_limit = AsyncLimiter(max_rate=5, time_period=1)
-
- # ===== load metadata =====
-
- # load general language data
- languages = {
-     lang: pop
-     for lang, pop in LANGUAGE_SPEAKING_POPULATION.items()
-     if not re.match(r".*-[A-Z]{2}$", lang)
- }
- languages = pd.DataFrame(list(languages.items()), columns=["bcp_47", "speakers"])
- languages["language_name"] = languages["bcp_47"].apply(
-     lambda x: Language.get(x).display_name()
- )
-
- # load script codes and names
- scripts = pd.read_csv("data/ScriptCodes.csv").rename(
-     columns={"Code": "iso15924", "English Name": "script_name"}
- )
-
-
- def population(bcp_47):
-     items = {
-         re.sub(r"^[a-z]+-", "", lang): pop
-         for lang, pop in LANGUAGE_SPEAKING_POPULATION.items()
-         if re.match(rf"^{bcp_47}-[A-Z]{{2}}$", lang)
-     }
-     return items
-
-
- glottolog = pd.read_csv(
-     "data/glottolog_languoid.csv/languoid.csv", na_values=[""], keep_default_na=False
- ) # Min _Nan_ Chinese is not N/A!
- glottolog["bcp_47"] = glottolog["iso639P3code"].apply(
-     lambda x: standardize_tag(x, macro=True) if not pd.isna(x) else None
- )
-
-
- @cache
- def language_family(bcp_47):
-     languoid = glottolog[glottolog["bcp_47"] == bcp_47].iloc[0]
-     if pd.isna(languoid["family_id"]):
-         return None
-     family = glottolog[glottolog["id"] == languoid["family_id"]].iloc[0]
-     return family["name"]
-
-
- def script_name(iso15924):
-     return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
-
-
- def aggregate_flores_paths(flores_paths):
-     # takes a list of paths from the same language but different scripts
-     # returns the one with the largest writing population
-     if len(flores_paths) == 1:
-         return flores_paths.values[0]
-     populations = [
-         Language.get(standardize_tag(x, macro=True)).writing_population()
-         for x in flores_paths.values
-     ]
-     return flores_paths.values[populations.index(max(populations))]
-
-
- # load benchmark languages and scripts
- benchmark_dir = "data/floresp-v2.0-rc.3/dev"
- benchmark_languages = pd.DataFrame(
-     [f.split(".")[1] for f in os.listdir(benchmark_dir)],
-     columns=["flores_path"],
- )
- benchmark_languages["bcp_47"] = benchmark_languages["flores_path"].apply(
-     lambda x: standardize_tag(x, macro=True),
- )
- # ignore script (language is language)
- benchmark_languages["bcp_47"] = benchmark_languages["bcp_47"].apply(
-     lambda x: re.sub(r"-[A-Z][a-z]+$", "", x)
- )
- benchmark_languages = (
-     benchmark_languages.groupby("bcp_47")
-     .agg({"flores_path": aggregate_flores_paths})
-     .reset_index()
- )
-
- fleurs_tags = "af_za,am_et,ar_eg,as_in,ast_es,az_az,be_by,bg_bg,bn_in,bs_ba,ca_es,ceb_ph,ckb_iq,cmn_hans_cn,cs_cz,cy_gb,da_dk,de_de,el_gr,en_us,es_419,et_ee,fa_ir,ff_sn,fi_fi,fil_ph,fr_fr,ga_ie,gl_es,gu_in,ha_ng,he_il,hi_in,hr_hr,hu_hu,hy_am,id_id,ig_ng,is_is,it_it,ja_jp,jv_id,ka_ge,kam_ke,kea_cv,kk_kz,km_kh,kn_in,ko_kr,ky_kg,lb_lu,lg_ug,ln_cd,lo_la,lt_lt,luo_ke,lv_lv,mi_nz,mk_mk,ml_in,mn_mn,mr_in,ms_my,mt_mt,my_mm,nb_no,ne_np,nl_nl,nso_za,ny_mw,oc_fr,om_et,or_in,pa_in,pl_pl,ps_af,pt_br,ro_ro,ru_ru,sd_in,sk_sk,sl_si,sn_zw,so_so,sr_rs,sv_se,sw_ke,ta_in,te_in,tg_tj,th_th,tr_tr,uk_ua,umb_ao,ur_pk,uz_uz,vi_vn,wo_sn,xh_za,yo_ng,yue_hant_hk,zu_za"
- fleurs = pd.DataFrame(fleurs_tags.split(","), columns=["fleurs_tag"])
- fleurs["bcp_47"] = fleurs["fleurs_tag"].apply(
-     lambda x: standardize_tag(x.rsplit("_")[0], macro=True)
- )
-
-
- # load CommonVoice stats
- @cache # cache for 1 day
- def get_commonvoice_stats(date: date):
-     return get("https://commonvoice.mozilla.org/api/v1/stats/languages").json()
-
-
- commonvoice_stats = pd.DataFrame(get_commonvoice_stats(date.today())).rename(
-     columns={"locale": "commonvoice_locale", "validatedHours": "commonvoice_hours"}
- )[["commonvoice_locale", "commonvoice_hours"]]
- # ignore country (language is language) (in practive this is only relevant to zh-CN/zh-TW/zh-HK)
- commonvoice_stats["bcp_47"] = commonvoice_stats["commonvoice_locale"].apply(
-     lambda x: re.sub(r"-[A-Z]{2}$", "", x)
- )
- commonvoice_stats["bcp_47"] = commonvoice_stats["bcp_47"].apply(
-     lambda x: standardize_tag(x, macro=True)
- ) # this does not really seem to get macrolanguages though, e.g. not for Quechua
- commonvoice_stats = (
-     commonvoice_stats.groupby("bcp_47")
-     .agg({"commonvoice_hours": "sum", "commonvoice_locale": "first"})
-     .reset_index()
- )

- # merge data
- languages = pd.merge(
-     languages, benchmark_languages, on="bcp_47", how="left"
- ) # "left" because keep it simple for now
- languages = pd.merge(
-     languages, fleurs, on="bcp_47", how="left"
- ) # "left" because keep it simple for now
- languages = pd.merge(
-     languages, commonvoice_stats, on="bcp_47", how="left"
- ) # "left" because keep it simple for now
- languages["in_benchmark"] = languages["bcp_47"].isin(benchmark_languages["bcp_47"])
-
- languages = languages.sort_values(by="speakers", ascending=False)

  # sample languages to translate to
  target_languages = languages[languages["in_benchmark"]].sample(
-     n=n_sentences, weights="speakers", replace=True, random_state=42
+     frac=1, weights="speakers", replace=True, random_state=42
  )
- langs_eval = languages.iloc[:n_languages]
- langs_eval_detailed = languages.iloc[:n_detailed_languages]
- transcription_langs_eval = languages.iloc[:transcription_n_languages]
- transcription_langs_eval_detailed = languages.iloc[:transcription_n_detailed_languages]
-
-
- def download_file(url, path):
-     response = requests.get(url)
-     with open(path, "wb") as f:
-         f.write(response.content)
-
-
- def download_fleurs():
-     # the huggingface loader does not allow loading only the dev set, so do it manually
-     for language in transcription_langs_eval.itertuples():
-         tar_url = f"https://huggingface.co/datasets/google/fleurs/resolve/main/data/{language.fleurs_tag}/audio/dev.tar.gz"
-         tar_path = Path(f"data/fleurs/{language.fleurs_tag}/audio/dev.tar.gz")
-         audio_path = Path(f"data/fleurs/{language.fleurs_tag}/audio")
-         if not audio_path.exists():
-             print(f"Downloading {tar_url} to {tar_path}")
-             tar_path.parent.mkdir(parents=True, exist_ok=True)
-             download_file(tar_url, tar_path)
-             with tarfile.open(tar_path, "r:gz") as tar:
-                 tar.extractall(path=audio_path)
-         tsv_url = f"https://huggingface.co/datasets/google/fleurs/resolve/main/data/{language.fleurs_tag}/dev.tsv"
-         tsv_path = Path(f"data/fleurs/{language.fleurs_tag}/dev.tsv")
-         if not tsv_path.exists():
-             print(f"Downloading {tsv_url} to {tsv_path}")
-             tsv_path.parent.mkdir(parents=True, exist_ok=True)
-             download_file(tsv_url, tsv_path)
-
-
- # ===== define tasks and metrics =====
-
-
- @cache
- async def complete(**kwargs):
-     async with openrouter_rate_limit:
-         response = await client.chat.completions.create(**kwargs)
-     if not response.choices:
-         raise Exception(response)
-     return response
-
-
- def load_sentences(language):
-     return open(f"{benchmark_dir}/dev.{language.flores_path}").readlines()
-

  @cache
  async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
@@ -251,8 +25,8 @@ async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
          0
      ]
      target_language = target_languages.iloc[sentence_nr]
-     original_sentence = load_sentences(original_language)[sentence_nr].strip()
-     target_sentence = load_sentences(target_language)[sentence_nr].strip()
+     original_sentence = flores_sentences(original_language)[sentence_nr].strip()
+     target_sentence = flores_sentences(target_language)[sentence_nr].strip()
      script = script_name(target_language.flores_path.split("_")[1])
      reply = await complete(
          model=model,
@@ -296,7 +70,7 @@ metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
  @cache
  async def classify_and_evaluate(model, language_bcp_47, nr):
      language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
-     sentences = pd.DataFrame(load_sentences(language), columns=["text"])
+     sentences = pd.DataFrame(flores_sentences(language), columns=["text"])
      sentences = pd.concat([metadata, sentences], axis=1)
      sentences = sentences.dropna(subset=["topic"])
      sentences["topic"] = sentences["topic"].str.lower()
@@ -365,7 +139,7 @@ def corrupt_sentence(sentence):
  @cache
  async def mlm_and_evaluate(model, language_bcp_47, nr):
      language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
-     sentences = pd.DataFrame(load_sentences(language), columns=["text"])
+     sentences = pd.DataFrame(flores_sentences(language), columns=["text"])
      sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
      examples = sentences.sample(n=10, random_state=42)
      test_sentences = sentences[~sentences["text"].isin(examples["text"])].sample(
@@ -403,38 +177,6 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
          }
      ]

-
- @cache
- async def transcribe_elevenlabs(path, model):
-     modelname = model.split("/")[-1]
-     client = AsyncElevenLabs(api_key=getenv("ELEVENLABS_API_KEY"))
-     async with elevenlabs_rate_limit:
-         with open(path, "rb") as file:
-             response = await client.speech_to_text.convert(
-                 model_id=modelname, file=file
-             )
-     return response.text
-
-
- @cache
- async def transcribe_huggingface(path, model):
-     client = AsyncInferenceClient(api_key=getenv("HUGGINGFACE_ACCESS_TOKEN"))
-     async with huggingface_rate_limit:
-         output = await client.automatic_speech_recognition(model=model, audio=path)
-     return output.text
-
-
- async def transcribe(path, model="elevenlabs/scribe_v1"):
-     provider, modelname = model.split("/")
-     match provider:
-         case "elevenlabs":
-             return await transcribe_elevenlabs(path, modelname)
-         case "openai" | "facebook":
-             return await transcribe_huggingface(path, model)
-         case _:
-             raise ValueError(f"Model {model} not supported")
-
-
  @cache
  async def transcribe_and_evaluate(model, language_bcp_47, nr):
      language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
@@ -472,83 +214,4 @@ tasks = [
      classify_and_evaluate,
      mlm_and_evaluate,
      # transcribe_and_evaluate,
- ]
+ ]
-
- # ===== run evaluation and aggregate results =====
-
-
- def mean(lst):
-     return sum(lst) / len(lst) if lst else None
-
-
- def fmt_name(s):
-     return " ".join(w.capitalize() for w in s.split("-")).replace("Gpt", "GPT").replace("ai", "AI")
-
-
- async def main():
-     print("running evaluations")
-     results = [
-         task(model, original_language.bcp_47, i)
-         for task in tasks
-         for i in range(n_sentences)
-         for original_language in langs_eval.itertuples()
-         for model in models
-         if original_language.in_benchmark
-         and (
-             model == model_fast
-             or original_language.bcp_47 in langs_eval_detailed.bcp_47.values
-         )
-     ]
-     results = await tqdm_asyncio.gather(*results, miniters=1)
-     results = pd.DataFrame([r for rs in results for r in rs])
-     results = (
-         results.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()
-     )
-     lang_results = (
-         results.groupby(["bcp_47", "task", "metric"])
-         .agg({"score": "mean", "model": "nunique"})
-         .reset_index()
-     )
-     lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
-     model_results = (
-         results.groupby(["model", "task", "metric"])
-         .agg({"score": "mean", "bcp_47": "nunique"})
-         .reset_index()
-     )
-     task_results = (
-         results.groupby(["task", "metric"])
-         .agg({"score": "mean", "bcp_47": "nunique", "model": "nunique"})
-         .reset_index()
-     )
-     all_results = {
-         "tasks": task_results.replace({np.nan: None}).to_dict(orient="records"),
-         "models": model_results.replace({np.nan: None}).to_dict(orient="records"),
-         "languages": lang_results.replace({np.nan: None}).to_dict(orient="records"),
-         "scores": results.replace({np.nan: None}).to_dict(orient="records"),
-     }
-     with open("results.json", "w") as f:
-         json.dump(all_results, f, indent=2, ensure_ascii=False)
-     model_results["task_metric"] = model_results["task"] + "_" + model_results["metric"]
-     model_results = model_results.drop(columns=["task", "metric"])
-     model_table = model_results.pivot(
-         index="model", columns="task_metric", values="score"
-     ).fillna(0)
-     model_table["average"] = model_table.mean(axis=1)
-     model_table = model_table.sort_values(by="average", ascending=False)
-     model_table = model_table.round(2).reset_index()
-     model_table["provider"] = model_table["model"].str.split("/").str[0].apply(fmt_name)
-     model_table["model"] = model_table["model"].str.split("/").str[1].apply(fmt_name)
-     model_table["rank"] = model_table.index + 1
-     model_table = model_table[
-         ["rank", "provider", "model", "average", *model_table.columns[1:-3]]
-     ]
-     all_tables = {
-         "model_table": model_table.to_dict(orient="records"),
-     }
-     with open("frontend/public/results.json", "w") as f:
-         json.dump(all_tables, f, indent=2, ensure_ascii=False)
-
-
- if __name__ == "__main__":
-     download_fleurs()
-     asyncio.run(main())
frontend/public/results.json CHANGED
@@ -4,41 +4,51 @@
        "rank": 1,
        "provider": "Google",
        "model": "Gemini 2.0 Flash 001",
-       "average": 0.68,
+       "average": 0.72,
        "classification_accuracy": 0.87,
        "language_modeling_chrf": 0.96,
-       "translation_bleu": 0.36,
-       "translation_chrf": 0.53
+       "translation_bleu": 0.45,
+       "translation_chrf": 0.58
      },
      {
        "rank": 2,
+       "provider": "Google",
+       "model": "Gemma 3 27b It",
+       "average": 0.65,
+       "classification_accuracy": 0.72,
+       "language_modeling_chrf": 0.96,
+       "translation_bleu": 0.37,
+       "translation_chrf": 0.54
+     },
+     {
+       "rank": 3,
        "provider": "OpenAI",
        "model": "GPT 4o Mini",
-       "average": 0.56,
-       "classification_accuracy": 0.51,
+       "average": 0.6,
+       "classification_accuracy": 0.52,
        "language_modeling_chrf": 0.95,
-       "translation_bleu": 0.31,
-       "translation_chrf": 0.47
+       "translation_bleu": 0.39,
+       "translation_chrf": 0.55
      },
      {
-       "rank": 3,
+       "rank": 4,
        "provider": "MistralAI",
        "model": "Mistral Small 24b Instruct 2501",
-       "average": 0.54,
-       "classification_accuracy": 0.57,
-       "language_modeling_chrf": 0.9,
-       "translation_bleu": 0.26,
-       "translation_chrf": 0.42
+       "average": 0.58,
+       "classification_accuracy": 0.55,
+       "language_modeling_chrf": 0.86,
+       "translation_bleu": 0.38,
+       "translation_chrf": 0.52
      },
      {
-       "rank": 4,
+       "rank": 5,
        "provider": "Meta Llama",
        "model": "Llama 3.3 70b Instruct",
-       "average": 0.53,
-       "classification_accuracy": 0.51,
+       "average": 0.56,
+       "classification_accuracy": 0.5,
        "language_modeling_chrf": 0.94,
-       "translation_bleu": 0.25,
-       "translation_chrf": 0.43
+       "translation_bleu": 0.31,
+       "translation_chrf": 0.48
      }
    ]
  }
results.json CHANGED
The diff for this file is too large to render. See raw diff