File size: 1,264 Bytes
da6e1bc
 
 
4d13673
 
 
 
da6e1bc
8274634
da6e1bc
 
 
 
8274634
260c1a3
 
da6e1bc
 
 
3ed02d5
da6e1bc
 
 
8274634
da6e1bc
 
ce2acb0
 
260c1a3
da6e1bc
 
 
4d13673
 
723f963
da6e1bc
4d13673
da6e1bc
d91b022
4d13673
 
 
 
 
da6e1bc
2c21cf7
da6e1bc
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import asyncio
import json

import numpy as np
import pandas as pd
from tqdm.asyncio import tqdm_asyncio

from languages import languages
from models import models
from tasks import tasks

# ===== config =====

# Number of sentence indices (0 .. n_sentences - 1) passed to every task.
n_sentences = 10
# Only the first n_languages rows of `languages` are considered for evaluation.
n_languages = 40
# Only the first n_models entries of models["id"] are evaluated.
n_models = 25

# ===== run evaluation and aggregate results =====


async def evaluate():
    """Run every task for each selected sentence index, language and model.

    One call ``task(model_id, lang.bcp_47, i)`` is created per combination
    of task, sentence index, benchmark language and model id; all of them
    are then awaited together behind a tqdm progress bar. The selection is
    controlled by the module-level ``n_sentences``, ``n_languages`` and
    ``n_models`` settings.

    Returns:
        Whatever ``tqdm_asyncio.gather`` returns for the scheduled calls
        (one entry per call).
    """
    print("running evaluations")
    # These selections do not depend on the task or the sentence index, so
    # slice/filter them once instead of once per inner-loop iteration.
    # NOTE: the in_benchmark filter is applied *after* taking the first
    # n_languages rows, so fewer than n_languages languages may remain.
    # TODO (carried over from the original code): revisit this filter.
    benchmark_langs = [
        lang
        for lang in languages.iloc[:n_languages].itertuples()
        if lang.in_benchmark
    ]
    model_ids = list(models["id"].iloc[:n_models])
    # Same nesting order as before: task -> sentence index -> language -> model.
    pending = [
        task(model_id, lang.bcp_47, i)
        for task in tasks
        for i in range(n_sentences)
        for lang in benchmark_langs
        for model_id in model_ids
    ]
    return await tqdm_asyncio.gather(*pending, miniters=1)

def serialize(df):
    """Turn a DataFrame into a list of per-row dicts for JSON output.

    ``np.nan`` and ``pd.NA`` cells are replaced with ``None`` first, so
    they are written as JSON ``null``.

    Args:
        df: The pandas DataFrame to convert.

    Returns:
        A list with one ``{column: value}`` dict per row.
    """
    missing_to_none = {np.nan: None, pd.NA: None}
    cleaned = df.replace(missing_to_none)
    return cleaned.to_dict(orient="records")

async def main():
    """Run the evaluation and write everything to ``results.json``.

    The written JSON object has three keys: ``"languages"`` and
    ``"models"`` (the respective tables serialized row by row) and
    ``"scores"`` (the flattened evaluation results).
    """
    # Convert each creation_date to its ISO-format string so it can be
    # written as JSON (presumably these are date/datetime-like values).
    # NOTE: this rewrites the shared `models` table in place.
    models["creation_date"] = models["creation_date"].apply(lambda x: x.isoformat())
    grouped_scores = await evaluate()
    # Each evaluation call yields a group of records; flatten into one list.
    scores = [record for group in grouped_scores for record in group]
    payload = {
        "languages": serialize(languages),
        "models": serialize(models),
        "scores": scores,
    }
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # must be opened as UTF-8 explicitly rather than with the platform
    # default encoding (which may be unable to encode them).
    with open("results.json", "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)


# Run the async pipeline only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())