David Pomerenke
committed on
Commit
·
2f9dee1
1
Parent(s):
019cada
Only run tasks for which there is no result yet
Browse files
- evals/backend.py +3 -5
- evals/datasets_/flores.py +5 -2
- evals/main.py +19 -18
- evals/tasks.py +18 -10
- languages.json +0 -0
- models.json +222 -0
- results.json +0 -0
evals/backend.py
CHANGED
@@ -11,11 +11,9 @@ from fastapi.middleware.gzip import GZipMiddleware
|
|
11 |
from fastapi.responses import JSONResponse
|
12 |
from fastapi.staticfiles import StaticFiles
|
13 |
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
languages = pd.DataFrame(results["languages"])
|
18 |
-
models = pd.DataFrame(results["models"])
|
19 |
|
20 |
|
21 |
def mean(lst):
|
|
|
11 |
from fastapi.responses import JSONResponse
|
12 |
from fastapi.staticfiles import StaticFiles
|
13 |
|
14 |
+
scores = pd.read_json("results.json")
|
15 |
+
languages = pd.read_json("languages.json")
|
16 |
+
models = pd.read_json("models.json")
|
|
|
|
|
17 |
|
18 |
|
19 |
def mean(lst):
|
evals/datasets_/flores.py
CHANGED
@@ -5,8 +5,11 @@ import re
|
|
5 |
|
6 |
flores_dir = "data/floresp-v2.0-rc.3/dev"
|
7 |
|
8 |
-
def flores_sentences(language):
|
9 |
-
|
|
|
|
|
|
|
10 |
|
11 |
def aggregate_flores_paths(flores_paths):
|
12 |
# takes a list of paths from the same language but different scripts
|
|
|
5 |
|
6 |
flores_dir = "data/floresp-v2.0-rc.3/dev"
|
7 |
|
8 |
+
def flores_sentences(language) -> list[str] | None:
|
9 |
+
try:
|
10 |
+
return open(f"{flores_dir}/dev.{language.flores_path}").readlines()
|
11 |
+
except FileNotFoundError:
|
12 |
+
return None
|
13 |
|
14 |
def aggregate_flores_paths(flores_paths):
|
15 |
# takes a list of paths from the same language but different scripts
|
evals/main.py
CHANGED
@@ -20,31 +20,32 @@ n_models = 25
|
|
20 |
|
21 |
async def evaluate():
|
22 |
print("running evaluations")
|
|
|
23 |
results = [
|
24 |
task(model, lang.bcp_47, i)
|
25 |
-
for task in tasks
|
26 |
for i in range(n_sentences)
|
27 |
for lang in languages.iloc[:n_languages].itertuples()
|
28 |
for model in models["id"].iloc[:n_models]
|
29 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
]
|
31 |
-
|
32 |
-
|
33 |
-
def serialize(df):
|
34 |
-
return df.replace({np.nan: None, pd.NA: None}).to_dict(orient="records")
|
35 |
-
|
36 |
-
async def main():
|
37 |
-
models["creation_date"] = models["creation_date"].apply(lambda x: x.isoformat())
|
38 |
-
results = await evaluate()
|
39 |
results = [r for group in results for r in group]
|
40 |
-
results =
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
json.dump(results, f, indent=2, ensure_ascii=False)
|
47 |
|
48 |
|
49 |
if __name__ == "__main__":
|
50 |
-
asyncio.run(
|
|
|
20 |
|
21 |
async def evaluate():
|
22 |
print("running evaluations")
|
23 |
+
old_results = pd.read_json("results.json")
|
24 |
results = [
|
25 |
task(model, lang.bcp_47, i)
|
26 |
+
for task_name, task in tasks.items()
|
27 |
for i in range(n_sentences)
|
28 |
for lang in languages.iloc[:n_languages].itertuples()
|
29 |
for model in models["id"].iloc[:n_models]
|
30 |
+
if len(
|
31 |
+
old_results[
|
32 |
+
(old_results["model"] == model)
|
33 |
+
& (old_results["bcp_47"] == lang.bcp_47)
|
34 |
+
& (old_results["task"] == task_name)
|
35 |
+
& (old_results["sentence_nr"] == i)
|
36 |
+
]
|
37 |
+
)
|
38 |
+
== 0
|
39 |
]
|
40 |
+
results = await tqdm_asyncio.gather(*results, miniters=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
results = [r for group in results for r in group]
|
42 |
+
results = pd.DataFrame(results)
|
43 |
+
results = pd.concat([old_results, results])
|
44 |
+
args = dict(orient="records", indent=2, force_ascii=False)
|
45 |
+
results.to_json("results.json", **args)
|
46 |
+
pd.DataFrame(models).to_json("models.json", **args)
|
47 |
+
pd.DataFrame(languages).to_json("languages.json", **args)
|
|
|
48 |
|
49 |
|
50 |
if __name__ == "__main__":
|
51 |
+
results = asyncio.run(evaluate())
|
evals/tasks.py
CHANGED
@@ -33,6 +33,8 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
|
|
33 |
pass
|
34 |
case "to":
|
35 |
original_language, target_language = target_language, original_language
|
|
|
|
|
36 |
original_sentence = flores_sentences(original_language)[sentence_nr].strip()
|
37 |
target_sentence = flores_sentences(target_language)[sentence_nr].strip()
|
38 |
script = script_name(target_language.flores_path.split("_")[1])
|
@@ -79,7 +81,10 @@ metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
|
|
79 |
@cache
|
80 |
async def classify_and_evaluate(model, bcp_47, nr):
|
81 |
language = languages[languages["bcp_47"] == bcp_47].iloc[0]
|
82 |
-
sentences =
|
|
|
|
|
|
|
83 |
sentences = pd.concat([metadata, sentences], axis=1)
|
84 |
sentences = sentences.dropna(subset=["topic"])
|
85 |
sentences["topic"] = sentences["topic"].str.lower()
|
@@ -159,7 +164,10 @@ def corrupt_sentence(sentence):
|
|
159 |
@cache
|
160 |
async def mlm_and_evaluate(model, language_bcp_47, nr):
|
161 |
language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
|
162 |
-
sentences =
|
|
|
|
|
|
|
163 |
sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
|
164 |
examples = sentences.sample(n=10, random_state=42)
|
165 |
test_sentences = sentences[~sentences["text"].isin(examples["text"])].sample(
|
@@ -278,11 +286,11 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
|
|
278 |
]
|
279 |
|
280 |
|
281 |
-
tasks =
|
282 |
-
partial(translate_and_evaluate, mode="from"),
|
283 |
-
partial(translate_and_evaluate, mode="to"),
|
284 |
-
classify_and_evaluate,
|
285 |
-
# mlm_and_evaluate,
|
286 |
-
mmlu_and_evaluate,
|
287 |
-
# transcribe_and_evaluate,
|
288 |
-
|
|
|
33 |
pass
|
34 |
case "to":
|
35 |
original_language, target_language = target_language, original_language
|
36 |
+
if not flores_sentences(original_language) or not flores_sentences(target_language):
|
37 |
+
return []
|
38 |
original_sentence = flores_sentences(original_language)[sentence_nr].strip()
|
39 |
target_sentence = flores_sentences(target_language)[sentence_nr].strip()
|
40 |
script = script_name(target_language.flores_path.split("_")[1])
|
|
|
81 |
@cache
|
82 |
async def classify_and_evaluate(model, bcp_47, nr):
|
83 |
language = languages[languages["bcp_47"] == bcp_47].iloc[0]
|
84 |
+
sentences = flores_sentences(language)
|
85 |
+
if not sentences:
|
86 |
+
return []
|
87 |
+
sentences = pd.DataFrame(sentences, columns=["text"])
|
88 |
sentences = pd.concat([metadata, sentences], axis=1)
|
89 |
sentences = sentences.dropna(subset=["topic"])
|
90 |
sentences["topic"] = sentences["topic"].str.lower()
|
|
|
164 |
@cache
|
165 |
async def mlm_and_evaluate(model, language_bcp_47, nr):
|
166 |
language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
|
167 |
+
sentences = flores_sentences(language)
|
168 |
+
if not sentences:
|
169 |
+
return []
|
170 |
+
sentences = pd.DataFrame(sentences, columns=["text"])
|
171 |
sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
|
172 |
examples = sentences.sample(n=10, random_state=42)
|
173 |
test_sentences = sentences[~sentences["text"].isin(examples["text"])].sample(
|
|
|
286 |
]
|
287 |
|
288 |
|
289 |
+
tasks = {
|
290 |
+
"translation_from": partial(translate_and_evaluate, mode="from"),
|
291 |
+
"translation_to": partial(translate_and_evaluate, mode="to"),
|
292 |
+
"classification": classify_and_evaluate,
|
293 |
+
# "mlm": mlm_and_evaluate,
|
294 |
+
"mmlu": mmlu_and_evaluate,
|
295 |
+
# "asr": transcribe_and_evaluate,
|
296 |
+
}
|
languages.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
models.json
ADDED
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"id":"meta-llama\/llama-4-maverick",
|
4 |
+
"name":"Llama 4 Maverick (free)",
|
5 |
+
"provider_name":"Meta",
|
6 |
+
"cost":0.0,
|
7 |
+
"hf_id":"meta-llama\/Llama-4-Maverick-17B-128E-Instruct",
|
8 |
+
"size":401583781376.0,
|
9 |
+
"type":"Open",
|
10 |
+
"license":"Other",
|
11 |
+
"creation_date":1743465600000
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"id":"meta-llama\/llama-3.3-70b-instruct",
|
15 |
+
"name":"Llama 3.3 70B Instruct (free)",
|
16 |
+
"provider_name":"Meta",
|
17 |
+
"cost":0.0,
|
18 |
+
"hf_id":"meta-llama\/Llama-3.3-70B-Instruct",
|
19 |
+
"size":70553706496.0,
|
20 |
+
"type":"Open",
|
21 |
+
"license":"Llama3.3",
|
22 |
+
"creation_date":1732579200000
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"id":"meta-llama\/llama-3.1-70b-instruct",
|
26 |
+
"name":"Llama 3.1 70B Instruct",
|
27 |
+
"provider_name":"Meta",
|
28 |
+
"cost":0.28,
|
29 |
+
"hf_id":"meta-llama\/Llama-3.1-70B-Instruct",
|
30 |
+
"size":70553706496.0,
|
31 |
+
"type":"Open",
|
32 |
+
"license":"Llama3.1",
|
33 |
+
"creation_date":1721088000000
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"id":"meta-llama\/llama-3-70b-instruct",
|
37 |
+
"name":"Llama 3 70B Instruct",
|
38 |
+
"provider_name":"Meta",
|
39 |
+
"cost":0.4,
|
40 |
+
"hf_id":"meta-llama\/Meta-Llama-3-70B-Instruct",
|
41 |
+
"size":70553706496.0,
|
42 |
+
"type":"Open",
|
43 |
+
"license":"Llama3",
|
44 |
+
"creation_date":1713312000000
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"id":"openai\/gpt-4.1-mini",
|
48 |
+
"name":"GPT-4.1 Mini",
|
49 |
+
"provider_name":"OpenAI",
|
50 |
+
"cost":1.6,
|
51 |
+
"hf_id":null,
|
52 |
+
"size":null,
|
53 |
+
"type":"Commercial",
|
54 |
+
"license":null,
|
55 |
+
"creation_date":1744588800000
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"id":"openai\/gpt-4.1-nano",
|
59 |
+
"name":"GPT-4.1 Nano",
|
60 |
+
"provider_name":"OpenAI",
|
61 |
+
"cost":0.4,
|
62 |
+
"hf_id":null,
|
63 |
+
"size":null,
|
64 |
+
"type":"Commercial",
|
65 |
+
"license":null,
|
66 |
+
"creation_date":1744588800000
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"id":"openai\/gpt-4o-mini",
|
70 |
+
"name":"GPT-4o-mini",
|
71 |
+
"provider_name":"OpenAI",
|
72 |
+
"cost":0.6,
|
73 |
+
"hf_id":null,
|
74 |
+
"size":null,
|
75 |
+
"type":"Commercial",
|
76 |
+
"license":null,
|
77 |
+
"creation_date":1721260800000
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"id":"openai\/gpt-3.5-turbo-0613",
|
81 |
+
"name":"GPT-3.5 Turbo (older v0613)",
|
82 |
+
"provider_name":"OpenAI",
|
83 |
+
"cost":2.0,
|
84 |
+
"hf_id":null,
|
85 |
+
"size":null,
|
86 |
+
"type":"Commercial",
|
87 |
+
"license":null,
|
88 |
+
"creation_date":1706140800000
|
89 |
+
},
|
90 |
+
{
|
91 |
+
"id":"openai\/gpt-3.5-turbo",
|
92 |
+
"name":"GPT-3.5 Turbo",
|
93 |
+
"provider_name":"OpenAI",
|
94 |
+
"cost":1.5,
|
95 |
+
"hf_id":null,
|
96 |
+
"size":null,
|
97 |
+
"type":"Commercial",
|
98 |
+
"license":null,
|
99 |
+
"creation_date":1685232000000
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"id":"mistralai\/mistral-small-3.1-24b-instruct",
|
103 |
+
"name":"Mistral Small 3.1 24B (free)",
|
104 |
+
"provider_name":"Mistral",
|
105 |
+
"cost":0.0,
|
106 |
+
"hf_id":"mistralai\/Mistral-Small-3.1-24B-Instruct-2503",
|
107 |
+
"size":24011361280.0,
|
108 |
+
"type":"Open",
|
109 |
+
"license":"Apache 2.0",
|
110 |
+
"creation_date":1741651200000
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"id":"mistralai\/mistral-saba",
|
114 |
+
"name":"Saba",
|
115 |
+
"provider_name":"Mistral",
|
116 |
+
"cost":0.6,
|
117 |
+
"hf_id":null,
|
118 |
+
"size":null,
|
119 |
+
"type":"Commercial",
|
120 |
+
"license":null,
|
121 |
+
"creation_date":1739750400000
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"id":"mistralai\/mistral-nemo",
|
125 |
+
"name":"Mistral Nemo (free)",
|
126 |
+
"provider_name":"Mistral",
|
127 |
+
"cost":0.0,
|
128 |
+
"hf_id":"mistralai\/Mistral-Nemo-Instruct-2407",
|
129 |
+
"size":12247782400.0,
|
130 |
+
"type":"Open",
|
131 |
+
"license":"Apache 2.0",
|
132 |
+
"creation_date":1721174400000
|
133 |
+
},
|
134 |
+
{
|
135 |
+
"id":"google\/gemini-2.5-flash-preview",
|
136 |
+
"name":"Gemini 2.5 Flash Preview",
|
137 |
+
"provider_name":"Google",
|
138 |
+
"cost":0.6,
|
139 |
+
"hf_id":null,
|
140 |
+
"size":null,
|
141 |
+
"type":"Commercial",
|
142 |
+
"license":null,
|
143 |
+
"creation_date":1744848000000
|
144 |
+
},
|
145 |
+
{
|
146 |
+
"id":"google\/gemini-2.0-flash-lite-001",
|
147 |
+
"name":"Gemini 2.0 Flash Lite",
|
148 |
+
"provider_name":"Google",
|
149 |
+
"cost":0.3,
|
150 |
+
"hf_id":null,
|
151 |
+
"size":null,
|
152 |
+
"type":"Commercial",
|
153 |
+
"license":null,
|
154 |
+
"creation_date":1740441600000
|
155 |
+
},
|
156 |
+
{
|
157 |
+
"id":"google\/gemma-3-27b-it",
|
158 |
+
"name":"Gemma 3 27B (free)",
|
159 |
+
"provider_name":"Google",
|
160 |
+
"cost":0.0,
|
161 |
+
"hf_id":"google\/gemma-3-27b-it",
|
162 |
+
"size":27432406640.0,
|
163 |
+
"type":"Open",
|
164 |
+
"license":"Gemma",
|
165 |
+
"creation_date":1740787200000
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"id":"deepseek\/deepseek-chat-v3-0324",
|
169 |
+
"name":"DeepSeek V3 0324 (free)",
|
170 |
+
"provider_name":"DeepSeek",
|
171 |
+
"cost":0.0,
|
172 |
+
"hf_id":"deepseek-ai\/DeepSeek-V3-0324",
|
173 |
+
"size":684531386000.0,
|
174 |
+
"type":"Open",
|
175 |
+
"license":"Mit",
|
176 |
+
"creation_date":1742774400000
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"id":"deepseek\/deepseek-chat",
|
180 |
+
"name":"DeepSeek V3 (free)",
|
181 |
+
"provider_name":"DeepSeek",
|
182 |
+
"cost":0.0,
|
183 |
+
"hf_id":"deepseek-ai\/DeepSeek-V3",
|
184 |
+
"size":684531386000.0,
|
185 |
+
"type":"Open",
|
186 |
+
"license":"",
|
187 |
+
"creation_date":1735084800000
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"id":"microsoft\/phi-4",
|
191 |
+
"name":"Phi 4",
|
192 |
+
"provider_name":"Microsoft",
|
193 |
+
"cost":0.14,
|
194 |
+
"hf_id":"microsoft\/phi-4",
|
195 |
+
"size":14659507200.0,
|
196 |
+
"type":"Open",
|
197 |
+
"license":"Mit",
|
198 |
+
"creation_date":1733875200000
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"id":"microsoft\/phi-4-multimodal-instruct",
|
202 |
+
"name":"Phi 4 Multimodal Instruct",
|
203 |
+
"provider_name":"Microsoft",
|
204 |
+
"cost":0.1,
|
205 |
+
"hf_id":"microsoft\/Phi-4-multimodal-instruct",
|
206 |
+
"size":5574460384.0,
|
207 |
+
"type":"Open",
|
208 |
+
"license":"Mit",
|
209 |
+
"creation_date":1740355200000
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"id":"amazon\/nova-micro-v1",
|
213 |
+
"name":"Nova Micro 1.0",
|
214 |
+
"provider_name":"Amazon",
|
215 |
+
"cost":0.14,
|
216 |
+
"hf_id":null,
|
217 |
+
"size":null,
|
218 |
+
"type":"Commercial",
|
219 |
+
"license":null,
|
220 |
+
"creation_date":1733356800000
|
221 |
+
}
|
222 |
+
]
|
results.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|