David Pomerenke committed on
Commit
2f9dee1
·
1 Parent(s): 019cada

Only run tasks for which there is no result yet

Browse files
Files changed (7) hide show
  1. evals/backend.py +3 -5
  2. evals/datasets_/flores.py +5 -2
  3. evals/main.py +19 -18
  4. evals/tasks.py +18 -10
  5. languages.json +0 -0
  6. models.json +222 -0
  7. results.json +0 -0
evals/backend.py CHANGED
@@ -11,11 +11,9 @@ from fastapi.middleware.gzip import GZipMiddleware
11
  from fastapi.responses import JSONResponse
12
  from fastapi.staticfiles import StaticFiles
13
 
14
- with open("results.json", "r") as f:
15
- results = json.load(f)
16
- scores = pd.DataFrame(results["scores"])
17
- languages = pd.DataFrame(results["languages"])
18
- models = pd.DataFrame(results["models"])
19
 
20
 
21
  def mean(lst):
 
11
  from fastapi.responses import JSONResponse
12
  from fastapi.staticfiles import StaticFiles
13
 
14
+ scores = pd.read_json("results.json")
15
+ languages = pd.read_json("languages.json")
16
+ models = pd.read_json("models.json")
 
 
17
 
18
 
19
  def mean(lst):
evals/datasets_/flores.py CHANGED
@@ -5,8 +5,11 @@ import re
5
 
6
  flores_dir = "data/floresp-v2.0-rc.3/dev"
7
 
8
- def flores_sentences(language):
9
- return open(f"{flores_dir}/dev.{language.flores_path}").readlines()
 
 
 
10
 
11
  def aggregate_flores_paths(flores_paths):
12
  # takes a list of paths from the same language but different scripts
 
5
 
6
  flores_dir = "data/floresp-v2.0-rc.3/dev"
7
 
8
+ def flores_sentences(language) -> list[str] | None:
9
+ try:
10
+ return open(f"{flores_dir}/dev.{language.flores_path}").readlines()
11
+ except FileNotFoundError:
12
+ return None
13
 
14
  def aggregate_flores_paths(flores_paths):
15
  # takes a list of paths from the same language but different scripts
evals/main.py CHANGED
@@ -20,31 +20,32 @@ n_models = 25
20
 
21
  async def evaluate():
22
  print("running evaluations")
 
23
  results = [
24
  task(model, lang.bcp_47, i)
25
- for task in tasks
26
  for i in range(n_sentences)
27
  for lang in languages.iloc[:n_languages].itertuples()
28
  for model in models["id"].iloc[:n_models]
29
- if lang.in_benchmark # TODO
 
 
 
 
 
 
 
 
30
  ]
31
- return await tqdm_asyncio.gather(*results, miniters=1)
32
-
33
- def serialize(df):
34
- return df.replace({np.nan: None, pd.NA: None}).to_dict(orient="records")
35
-
36
- async def main():
37
- models["creation_date"] = models["creation_date"].apply(lambda x: x.isoformat())
38
- results = await evaluate()
39
  results = [r for group in results for r in group]
40
- results = {
41
- "languages": serialize(languages),
42
- "models": serialize(models),
43
- "scores": results,
44
- }
45
- with open("results.json", "w") as f:
46
- json.dump(results, f, indent=2, ensure_ascii=False)
47
 
48
 
49
  if __name__ == "__main__":
50
- asyncio.run(main())
 
20
 
21
  async def evaluate():
22
  print("running evaluations")
23
+ old_results = pd.read_json("results.json")
24
  results = [
25
  task(model, lang.bcp_47, i)
26
+ for task_name, task in tasks.items()
27
  for i in range(n_sentences)
28
  for lang in languages.iloc[:n_languages].itertuples()
29
  for model in models["id"].iloc[:n_models]
30
+ if len(
31
+ old_results[
32
+ (old_results["model"] == model)
33
+ & (old_results["bcp_47"] == lang.bcp_47)
34
+ & (old_results["task"] == task_name)
35
+ & (old_results["sentence_nr"] == i)
36
+ ]
37
+ )
38
+ == 0
39
  ]
40
+ results = await tqdm_asyncio.gather(*results, miniters=1)
 
 
 
 
 
 
 
41
  results = [r for group in results for r in group]
42
+ results = pd.DataFrame(results)
43
+ results = pd.concat([old_results, results])
44
+ args = dict(orient="records", indent=2, force_ascii=False)
45
+ results.to_json("results.json", **args)
46
+ pd.DataFrame(models).to_json("models.json", **args)
47
+ pd.DataFrame(languages).to_json("languages.json", **args)
 
48
 
49
 
50
  if __name__ == "__main__":
51
+ results = asyncio.run(evaluate())
evals/tasks.py CHANGED
@@ -33,6 +33,8 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
33
  pass
34
  case "to":
35
  original_language, target_language = target_language, original_language
 
 
36
  original_sentence = flores_sentences(original_language)[sentence_nr].strip()
37
  target_sentence = flores_sentences(target_language)[sentence_nr].strip()
38
  script = script_name(target_language.flores_path.split("_")[1])
@@ -79,7 +81,10 @@ metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
79
  @cache
80
  async def classify_and_evaluate(model, bcp_47, nr):
81
  language = languages[languages["bcp_47"] == bcp_47].iloc[0]
82
- sentences = pd.DataFrame(flores_sentences(language), columns=["text"])
 
 
 
83
  sentences = pd.concat([metadata, sentences], axis=1)
84
  sentences = sentences.dropna(subset=["topic"])
85
  sentences["topic"] = sentences["topic"].str.lower()
@@ -159,7 +164,10 @@ def corrupt_sentence(sentence):
159
  @cache
160
  async def mlm_and_evaluate(model, language_bcp_47, nr):
161
  language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
162
- sentences = pd.DataFrame(flores_sentences(language), columns=["text"])
 
 
 
163
  sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
164
  examples = sentences.sample(n=10, random_state=42)
165
  test_sentences = sentences[~sentences["text"].isin(examples["text"])].sample(
@@ -278,11 +286,11 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
278
  ]
279
 
280
 
281
- tasks = [
282
- partial(translate_and_evaluate, mode="from"),
283
- partial(translate_and_evaluate, mode="to"),
284
- classify_and_evaluate,
285
- # mlm_and_evaluate,
286
- mmlu_and_evaluate,
287
- # transcribe_and_evaluate,
288
- ]
 
33
  pass
34
  case "to":
35
  original_language, target_language = target_language, original_language
36
+ if not flores_sentences(original_language) or not flores_sentences(target_language):
37
+ return []
38
  original_sentence = flores_sentences(original_language)[sentence_nr].strip()
39
  target_sentence = flores_sentences(target_language)[sentence_nr].strip()
40
  script = script_name(target_language.flores_path.split("_")[1])
 
81
  @cache
82
  async def classify_and_evaluate(model, bcp_47, nr):
83
  language = languages[languages["bcp_47"] == bcp_47].iloc[0]
84
+ sentences = flores_sentences(language)
85
+ if not sentences:
86
+ return []
87
+ sentences = pd.DataFrame(sentences, columns=["text"])
88
  sentences = pd.concat([metadata, sentences], axis=1)
89
  sentences = sentences.dropna(subset=["topic"])
90
  sentences["topic"] = sentences["topic"].str.lower()
 
164
  @cache
165
  async def mlm_and_evaluate(model, language_bcp_47, nr):
166
  language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
167
+ sentences = flores_sentences(language)
168
+ if not sentences:
169
+ return []
170
+ sentences = pd.DataFrame(sentences, columns=["text"])
171
  sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
172
  examples = sentences.sample(n=10, random_state=42)
173
  test_sentences = sentences[~sentences["text"].isin(examples["text"])].sample(
 
286
  ]
287
 
288
 
289
+ tasks = {
290
+ "translation_from": partial(translate_and_evaluate, mode="from"),
291
+ "translation_to": partial(translate_and_evaluate, mode="to"),
292
+ "classification": classify_and_evaluate,
293
+ # "mlm": mlm_and_evaluate,
294
+ "mmlu": mmlu_and_evaluate,
295
+ # "asr": transcribe_and_evaluate,
296
+ }
languages.json ADDED
The diff for this file is too large to render. See raw diff
 
models.json ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id":"meta-llama\/llama-4-maverick",
4
+ "name":"Llama 4 Maverick (free)",
5
+ "provider_name":"Meta",
6
+ "cost":0.0,
7
+ "hf_id":"meta-llama\/Llama-4-Maverick-17B-128E-Instruct",
8
+ "size":401583781376.0,
9
+ "type":"Open",
10
+ "license":"Other",
11
+ "creation_date":1743465600000
12
+ },
13
+ {
14
+ "id":"meta-llama\/llama-3.3-70b-instruct",
15
+ "name":"Llama 3.3 70B Instruct (free)",
16
+ "provider_name":"Meta",
17
+ "cost":0.0,
18
+ "hf_id":"meta-llama\/Llama-3.3-70B-Instruct",
19
+ "size":70553706496.0,
20
+ "type":"Open",
21
+ "license":"Llama3.3",
22
+ "creation_date":1732579200000
23
+ },
24
+ {
25
+ "id":"meta-llama\/llama-3.1-70b-instruct",
26
+ "name":"Llama 3.1 70B Instruct",
27
+ "provider_name":"Meta",
28
+ "cost":0.28,
29
+ "hf_id":"meta-llama\/Llama-3.1-70B-Instruct",
30
+ "size":70553706496.0,
31
+ "type":"Open",
32
+ "license":"Llama3.1",
33
+ "creation_date":1721088000000
34
+ },
35
+ {
36
+ "id":"meta-llama\/llama-3-70b-instruct",
37
+ "name":"Llama 3 70B Instruct",
38
+ "provider_name":"Meta",
39
+ "cost":0.4,
40
+ "hf_id":"meta-llama\/Meta-Llama-3-70B-Instruct",
41
+ "size":70553706496.0,
42
+ "type":"Open",
43
+ "license":"Llama3",
44
+ "creation_date":1713312000000
45
+ },
46
+ {
47
+ "id":"openai\/gpt-4.1-mini",
48
+ "name":"GPT-4.1 Mini",
49
+ "provider_name":"OpenAI",
50
+ "cost":1.6,
51
+ "hf_id":null,
52
+ "size":null,
53
+ "type":"Commercial",
54
+ "license":null,
55
+ "creation_date":1744588800000
56
+ },
57
+ {
58
+ "id":"openai\/gpt-4.1-nano",
59
+ "name":"GPT-4.1 Nano",
60
+ "provider_name":"OpenAI",
61
+ "cost":0.4,
62
+ "hf_id":null,
63
+ "size":null,
64
+ "type":"Commercial",
65
+ "license":null,
66
+ "creation_date":1744588800000
67
+ },
68
+ {
69
+ "id":"openai\/gpt-4o-mini",
70
+ "name":"GPT-4o-mini",
71
+ "provider_name":"OpenAI",
72
+ "cost":0.6,
73
+ "hf_id":null,
74
+ "size":null,
75
+ "type":"Commercial",
76
+ "license":null,
77
+ "creation_date":1721260800000
78
+ },
79
+ {
80
+ "id":"openai\/gpt-3.5-turbo-0613",
81
+ "name":"GPT-3.5 Turbo (older v0613)",
82
+ "provider_name":"OpenAI",
83
+ "cost":2.0,
84
+ "hf_id":null,
85
+ "size":null,
86
+ "type":"Commercial",
87
+ "license":null,
88
+ "creation_date":1706140800000
89
+ },
90
+ {
91
+ "id":"openai\/gpt-3.5-turbo",
92
+ "name":"GPT-3.5 Turbo",
93
+ "provider_name":"OpenAI",
94
+ "cost":1.5,
95
+ "hf_id":null,
96
+ "size":null,
97
+ "type":"Commercial",
98
+ "license":null,
99
+ "creation_date":1685232000000
100
+ },
101
+ {
102
+ "id":"mistralai\/mistral-small-3.1-24b-instruct",
103
+ "name":"Mistral Small 3.1 24B (free)",
104
+ "provider_name":"Mistral",
105
+ "cost":0.0,
106
+ "hf_id":"mistralai\/Mistral-Small-3.1-24B-Instruct-2503",
107
+ "size":24011361280.0,
108
+ "type":"Open",
109
+ "license":"Apache 2.0",
110
+ "creation_date":1741651200000
111
+ },
112
+ {
113
+ "id":"mistralai\/mistral-saba",
114
+ "name":"Saba",
115
+ "provider_name":"Mistral",
116
+ "cost":0.6,
117
+ "hf_id":null,
118
+ "size":null,
119
+ "type":"Commercial",
120
+ "license":null,
121
+ "creation_date":1739750400000
122
+ },
123
+ {
124
+ "id":"mistralai\/mistral-nemo",
125
+ "name":"Mistral Nemo (free)",
126
+ "provider_name":"Mistral",
127
+ "cost":0.0,
128
+ "hf_id":"mistralai\/Mistral-Nemo-Instruct-2407",
129
+ "size":12247782400.0,
130
+ "type":"Open",
131
+ "license":"Apache 2.0",
132
+ "creation_date":1721174400000
133
+ },
134
+ {
135
+ "id":"google\/gemini-2.5-flash-preview",
136
+ "name":"Gemini 2.5 Flash Preview",
137
+ "provider_name":"Google",
138
+ "cost":0.6,
139
+ "hf_id":null,
140
+ "size":null,
141
+ "type":"Commercial",
142
+ "license":null,
143
+ "creation_date":1744848000000
144
+ },
145
+ {
146
+ "id":"google\/gemini-2.0-flash-lite-001",
147
+ "name":"Gemini 2.0 Flash Lite",
148
+ "provider_name":"Google",
149
+ "cost":0.3,
150
+ "hf_id":null,
151
+ "size":null,
152
+ "type":"Commercial",
153
+ "license":null,
154
+ "creation_date":1740441600000
155
+ },
156
+ {
157
+ "id":"google\/gemma-3-27b-it",
158
+ "name":"Gemma 3 27B (free)",
159
+ "provider_name":"Google",
160
+ "cost":0.0,
161
+ "hf_id":"google\/gemma-3-27b-it",
162
+ "size":27432406640.0,
163
+ "type":"Open",
164
+ "license":"Gemma",
165
+ "creation_date":1740787200000
166
+ },
167
+ {
168
+ "id":"deepseek\/deepseek-chat-v3-0324",
169
+ "name":"DeepSeek V3 0324 (free)",
170
+ "provider_name":"DeepSeek",
171
+ "cost":0.0,
172
+ "hf_id":"deepseek-ai\/DeepSeek-V3-0324",
173
+ "size":684531386000.0,
174
+ "type":"Open",
175
+ "license":"Mit",
176
+ "creation_date":1742774400000
177
+ },
178
+ {
179
+ "id":"deepseek\/deepseek-chat",
180
+ "name":"DeepSeek V3 (free)",
181
+ "provider_name":"DeepSeek",
182
+ "cost":0.0,
183
+ "hf_id":"deepseek-ai\/DeepSeek-V3",
184
+ "size":684531386000.0,
185
+ "type":"Open",
186
+ "license":"",
187
+ "creation_date":1735084800000
188
+ },
189
+ {
190
+ "id":"microsoft\/phi-4",
191
+ "name":"Phi 4",
192
+ "provider_name":"Microsoft",
193
+ "cost":0.14,
194
+ "hf_id":"microsoft\/phi-4",
195
+ "size":14659507200.0,
196
+ "type":"Open",
197
+ "license":"Mit",
198
+ "creation_date":1733875200000
199
+ },
200
+ {
201
+ "id":"microsoft\/phi-4-multimodal-instruct",
202
+ "name":"Phi 4 Multimodal Instruct",
203
+ "provider_name":"Microsoft",
204
+ "cost":0.1,
205
+ "hf_id":"microsoft\/Phi-4-multimodal-instruct",
206
+ "size":5574460384.0,
207
+ "type":"Open",
208
+ "license":"Mit",
209
+ "creation_date":1740355200000
210
+ },
211
+ {
212
+ "id":"amazon\/nova-micro-v1",
213
+ "name":"Nova Micro 1.0",
214
+ "provider_name":"Amazon",
215
+ "cost":0.14,
216
+ "hf_id":null,
217
+ "size":null,
218
+ "type":"Commercial",
219
+ "license":null,
220
+ "creation_date":1733356800000
221
+ }
222
+ ]
results.json CHANGED
The diff for this file is too large to render. See raw diff