Spaces:

fair-forward
/

evals-for-every-language

Running

App Files Community

David Pomerenke committed 8 days ago

Commit

2f9dee1

1 Parent(s): 019cada

Only run tasks for which there is no result yet

Browse files

Files changed (7) hide show

evals/backend.py +3 -5
evals/datasets_/flores.py +5 -2
evals/main.py +19 -18
evals/tasks.py +18 -10
languages.json +0 -0
models.json +222 -0
results.json +0 -0

evals/backend.py CHANGED Viewed

@@ -11,11 +11,9 @@ from fastapi.middleware.gzip import GZipMiddleware
 from fastapi.responses import JSONResponse
 from fastapi.staticfiles import StaticFiles
-with open("results.json", "r") as f:
-    results = json.load(f)
-scores = pd.DataFrame(results["scores"])
-languages = pd.DataFrame(results["languages"])
-models = pd.DataFrame(results["models"])
 def mean(lst):

 from fastapi.responses import JSONResponse
 from fastapi.staticfiles import StaticFiles
+scores = pd.read_json("results.json")
+languages = pd.read_json("languages.json")
+models = pd.read_json("models.json")
 def mean(lst):

evals/datasets_/flores.py CHANGED Viewed

@@ -5,8 +5,11 @@ import re
 flores_dir = "data/floresp-v2.0-rc.3/dev"
-def flores_sentences(language):
-    return open(f"{flores_dir}/dev.{language.flores_path}").readlines()
 def aggregate_flores_paths(flores_paths):
     # takes a list of paths from the same language but different scripts

 flores_dir = "data/floresp-v2.0-rc.3/dev"
+def flores_sentences(language) -> list[str] | None:
+    try:
+        return open(f"{flores_dir}/dev.{language.flores_path}").readlines()
+    except FileNotFoundError:
+        return None
 def aggregate_flores_paths(flores_paths):
     # takes a list of paths from the same language but different scripts

evals/main.py CHANGED Viewed

@@ -20,31 +20,32 @@ n_models = 25
 async def evaluate():
     print("running evaluations")
     results = [
         task(model, lang.bcp_47, i)
-        for task in tasks
         for i in range(n_sentences)
         for lang in languages.iloc[:n_languages].itertuples()
         for model in models["id"].iloc[:n_models]
-        if lang.in_benchmark # TODO
     ]
-    return await tqdm_asyncio.gather(*results, miniters=1)
-def serialize(df):
-    return df.replace({np.nan: None, pd.NA: None}).to_dict(orient="records")
-async def main():
-    models["creation_date"] = models["creation_date"].apply(lambda x: x.isoformat())
-    results = await evaluate()
     results = [r for group in results for r in group]
-    results = {
-        "languages": serialize(languages),
-        "models": serialize(models),
-        "scores": results,
-    }
-    with open("results.json", "w") as f:
-        json.dump(results, f, indent=2, ensure_ascii=False)
 if __name__ == "__main__":
-    asyncio.run(main())

 async def evaluate():
     print("running evaluations")
+    old_results = pd.read_json("results.json")
     results = [
         task(model, lang.bcp_47, i)
+        for task_name, task in tasks.items()
         for i in range(n_sentences)
         for lang in languages.iloc[:n_languages].itertuples()
         for model in models["id"].iloc[:n_models]
+        if len(
+            old_results[
+                (old_results["model"] == model)
+                & (old_results["bcp_47"] == lang.bcp_47)
+                & (old_results["task"] == task_name)
+                & (old_results["sentence_nr"] == i)
+            ]
+        )
+        == 0
     ]
+    results = await tqdm_asyncio.gather(*results, miniters=1)
     results = [r for group in results for r in group]
+    results = pd.DataFrame(results)
+    results = pd.concat([old_results, results])
+    args = dict(orient="records", indent=2, force_ascii=False)
+    results.to_json("results.json", **args)
+    pd.DataFrame(models).to_json("models.json", **args)
+    pd.DataFrame(languages).to_json("languages.json", **args)
 if __name__ == "__main__":
+    results = asyncio.run(evaluate())

evals/tasks.py CHANGED Viewed

@@ -33,6 +33,8 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
             pass
         case "to":
             original_language, target_language = target_language, original_language
     original_sentence = flores_sentences(original_language)[sentence_nr].strip()
     target_sentence = flores_sentences(target_language)[sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
@@ -79,7 +81,10 @@ metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
 @cache
 async def classify_and_evaluate(model, bcp_47, nr):
     language = languages[languages["bcp_47"] == bcp_47].iloc[0]
-    sentences = pd.DataFrame(flores_sentences(language), columns=["text"])
     sentences = pd.concat([metadata, sentences], axis=1)
     sentences = sentences.dropna(subset=["topic"])
     sentences["topic"] = sentences["topic"].str.lower()
@@ -159,7 +164,10 @@ def corrupt_sentence(sentence):
 @cache
 async def mlm_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
-    sentences = pd.DataFrame(flores_sentences(language), columns=["text"])
     sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
     examples = sentences.sample(n=10, random_state=42)
     test_sentences = sentences[~sentences["text"].isin(examples["text"])].sample(
@@ -278,11 +286,11 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
     ]
-tasks = [
-    partial(translate_and_evaluate, mode="from"),
-    partial(translate_and_evaluate, mode="to"),
-    classify_and_evaluate,
-    # mlm_and_evaluate,
-    mmlu_and_evaluate,
-    # transcribe_and_evaluate,
-]

             pass
         case "to":
             original_language, target_language = target_language, original_language
+    if not flores_sentences(original_language) or not flores_sentences(target_language):
+        return []
     original_sentence = flores_sentences(original_language)[sentence_nr].strip()
     target_sentence = flores_sentences(target_language)[sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
 @cache
 async def classify_and_evaluate(model, bcp_47, nr):
     language = languages[languages["bcp_47"] == bcp_47].iloc[0]
+    sentences = flores_sentences(language)
+    if not sentences:
+        return []
+    sentences = pd.DataFrame(sentences, columns=["text"])
     sentences = pd.concat([metadata, sentences], axis=1)
     sentences = sentences.dropna(subset=["topic"])
     sentences["topic"] = sentences["topic"].str.lower()
 @cache
 async def mlm_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
+    sentences = flores_sentences(language)
+    if not sentences:
+        return []
+    sentences = pd.DataFrame(sentences, columns=["text"])
     sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
     examples = sentences.sample(n=10, random_state=42)
     test_sentences = sentences[~sentences["text"].isin(examples["text"])].sample(
     ]
+tasks = {
+    "translation_from": partial(translate_and_evaluate, mode="from"),
+    "translation_to": partial(translate_and_evaluate, mode="to"),
+    "classification": classify_and_evaluate,
+    # "mlm": mlm_and_evaluate,
+    "mmlu": mmlu_and_evaluate,
+    # "asr": transcribe_and_evaluate,
+}

languages.json ADDED Viewed

The diff for this file is too large to render. See raw diff

models.json ADDED Viewed

	@@ -0,0 +1,222 @@

+[
+  {
+    "id":"meta-llama\/llama-4-maverick",
+    "name":"Llama 4 Maverick (free)",
+    "provider_name":"Meta",
+    "cost":0.0,
+    "hf_id":"meta-llama\/Llama-4-Maverick-17B-128E-Instruct",
+    "size":401583781376.0,
+    "type":"Open",
+    "license":"Other",
+    "creation_date":1743465600000
+  },
+  {
+    "id":"meta-llama\/llama-3.3-70b-instruct",
+    "name":"Llama 3.3 70B Instruct (free)",
+    "provider_name":"Meta",
+    "cost":0.0,
+    "hf_id":"meta-llama\/Llama-3.3-70B-Instruct",
+    "size":70553706496.0,
+    "type":"Open",
+    "license":"Llama3.3",
+    "creation_date":1732579200000
+  },
+  {
+    "id":"meta-llama\/llama-3.1-70b-instruct",
+    "name":"Llama 3.1 70B Instruct",
+    "provider_name":"Meta",
+    "cost":0.28,
+    "hf_id":"meta-llama\/Llama-3.1-70B-Instruct",
+    "size":70553706496.0,
+    "type":"Open",
+    "license":"Llama3.1",
+    "creation_date":1721088000000
+  },
+  {
+    "id":"meta-llama\/llama-3-70b-instruct",
+    "name":"Llama 3 70B Instruct",
+    "provider_name":"Meta",
+    "cost":0.4,
+    "hf_id":"meta-llama\/Meta-Llama-3-70B-Instruct",
+    "size":70553706496.0,
+    "type":"Open",
+    "license":"Llama3",
+    "creation_date":1713312000000
+  },
+  {
+    "id":"openai\/gpt-4.1-mini",
+    "name":"GPT-4.1 Mini",
+    "provider_name":"OpenAI",
+    "cost":1.6,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1744588800000
+  },
+  {
+    "id":"openai\/gpt-4.1-nano",
+    "name":"GPT-4.1 Nano",
+    "provider_name":"OpenAI",
+    "cost":0.4,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1744588800000
+  },
+  {
+    "id":"openai\/gpt-4o-mini",
+    "name":"GPT-4o-mini",
+    "provider_name":"OpenAI",
+    "cost":0.6,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1721260800000
+  },
+  {
+    "id":"openai\/gpt-3.5-turbo-0613",
+    "name":"GPT-3.5 Turbo (older v0613)",
+    "provider_name":"OpenAI",
+    "cost":2.0,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1706140800000
+  },
+  {
+    "id":"openai\/gpt-3.5-turbo",
+    "name":"GPT-3.5 Turbo",
+    "provider_name":"OpenAI",
+    "cost":1.5,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1685232000000
+  },
+  {
+    "id":"mistralai\/mistral-small-3.1-24b-instruct",
+    "name":"Mistral Small 3.1 24B (free)",
+    "provider_name":"Mistral",
+    "cost":0.0,
+    "hf_id":"mistralai\/Mistral-Small-3.1-24B-Instruct-2503",
+    "size":24011361280.0,
+    "type":"Open",
+    "license":"Apache 2.0",
+    "creation_date":1741651200000
+  },
+  {
+    "id":"mistralai\/mistral-saba",
+    "name":"Saba",
+    "provider_name":"Mistral",
+    "cost":0.6,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1739750400000
+  },
+  {
+    "id":"mistralai\/mistral-nemo",
+    "name":"Mistral Nemo (free)",
+    "provider_name":"Mistral",
+    "cost":0.0,
+    "hf_id":"mistralai\/Mistral-Nemo-Instruct-2407",
+    "size":12247782400.0,
+    "type":"Open",
+    "license":"Apache 2.0",
+    "creation_date":1721174400000
+  },
+  {
+    "id":"google\/gemini-2.5-flash-preview",
+    "name":"Gemini 2.5 Flash Preview",
+    "provider_name":"Google",
+    "cost":0.6,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1744848000000
+  },
+  {
+    "id":"google\/gemini-2.0-flash-lite-001",
+    "name":"Gemini 2.0 Flash Lite",
+    "provider_name":"Google",
+    "cost":0.3,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1740441600000
+  },
+  {
+    "id":"google\/gemma-3-27b-it",
+    "name":"Gemma 3 27B (free)",
+    "provider_name":"Google",
+    "cost":0.0,
+    "hf_id":"google\/gemma-3-27b-it",
+    "size":27432406640.0,
+    "type":"Open",
+    "license":"Gemma",
+    "creation_date":1740787200000
+  },
+  {
+    "id":"deepseek\/deepseek-chat-v3-0324",
+    "name":"DeepSeek V3 0324 (free)",
+    "provider_name":"DeepSeek",
+    "cost":0.0,
+    "hf_id":"deepseek-ai\/DeepSeek-V3-0324",
+    "size":684531386000.0,
+    "type":"Open",
+    "license":"Mit",
+    "creation_date":1742774400000
+  },
+  {
+    "id":"deepseek\/deepseek-chat",
+    "name":"DeepSeek V3 (free)",
+    "provider_name":"DeepSeek",
+    "cost":0.0,
+    "hf_id":"deepseek-ai\/DeepSeek-V3",
+    "size":684531386000.0,
+    "type":"Open",
+    "license":"",
+    "creation_date":1735084800000
+  },
+  {
+    "id":"microsoft\/phi-4",
+    "name":"Phi 4",
+    "provider_name":"Microsoft",
+    "cost":0.14,
+    "hf_id":"microsoft\/phi-4",
+    "size":14659507200.0,
+    "type":"Open",
+    "license":"Mit",
+    "creation_date":1733875200000
+  },
+  {
+    "id":"microsoft\/phi-4-multimodal-instruct",
+    "name":"Phi 4 Multimodal Instruct",
+    "provider_name":"Microsoft",
+    "cost":0.1,
+    "hf_id":"microsoft\/Phi-4-multimodal-instruct",
+    "size":5574460384.0,
+    "type":"Open",
+    "license":"Mit",
+    "creation_date":1740355200000
+  },
+  {
+    "id":"amazon\/nova-micro-v1",
+    "name":"Nova Micro 1.0",
+    "provider_name":"Amazon",
+    "cost":0.14,
+    "hf_id":null,
+    "size":null,
+    "type":"Commercial",
+    "license":null,
+    "creation_date":1733356800000
+  }
+]

results.json CHANGED Viewed

The diff for this file is too large to render. See raw diff