"""Run every task over a grid of languages and models and dump the results.

The script schedules one job per (task, sentence index, language, model),
awaits them all with a progress bar, and writes the language table, the model
table and the collected scores to ``results.json``.
"""

import asyncio
import json
from itertools import product

import numpy as np
import pandas as pd
from tqdm.asyncio import tqdm_asyncio

from languages import languages
from models import models
from tasks import tasks

# ===== config =====

n_sentences = 10  # sentence indices 0..n_sentences-1 are passed to each task
n_languages = 40  # only the first n_languages rows of `languages` are used
n_models = 25  # only the first n_models entries of models["id"] are used

# ===== run evaluation and aggregate results =====


async def evaluate():
    """Schedule and await every (task, sentence, language, model) job.

    Each job is created by calling ``task(model, lang.bcp_47, i)``; the
    resulting awaitables are handed to ``tqdm_asyncio.gather``.

    Returns:
        Whatever ``tqdm_asyncio.gather`` returns for the scheduled jobs
        (one entry per job).
    """
    print("running evaluations")
    # The language and model selections do not depend on the task or the
    # sentence index, so compute them once up front. The first n_languages
    # rows are taken *before* filtering on in_benchmark, as before.
    lang_codes = [
        lang.bcp_47
        for lang in languages.iloc[:n_languages].itertuples()
        if lang.in_benchmark  # TODO
    ]
    model_ids = list(models["id"].iloc[:n_models])
    # product() varies the rightmost iterable fastest, i.e. the same nesting
    # order as: for task / for i / for lang / for model.
    jobs = [
        task(model, code, i)
        for task, i, code, model in product(
            tasks, range(n_sentences), lang_codes, model_ids
        )
    ]
    return await tqdm_asyncio.gather(*jobs, miniters=1)


def serialize(df):
    """Return ``df`` as a list of row dicts with missing values as ``None``.

    ``np.nan`` and ``pd.NA`` are replaced by ``None`` so that ``json.dump``
    emits ``null`` for them.
    """
    return df.replace({np.nan: None, pd.NA: None}).to_dict(orient="records")


async def main():
    """Run the evaluation and write ``results.json``.

    Side effects: overwrites ``models["creation_date"]`` in place with the
    result of calling ``.isoformat()`` on each value, and writes
    ``results.json`` in the current working directory.
    """
    # Replace each creation date with its isoformat() string before dumping.
    models["creation_date"] = models["creation_date"].apply(lambda x: x.isoformat())
    grouped = await evaluate()
    # Each job yields an iterable of records; flatten them into one list.
    scores = [record for group in grouped for record in group]
    payload = {
        "languages": serialize(languages),
        "models": serialize(models),
        "scores": scores,
    }
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding must be pinned to UTF-8 instead of the platform default.
    with open("results.json", "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    asyncio.run(main())