import json
import os

import numpy as np
import pandas as pd
import uvicorn
from countries import make_country_table
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.gzip import GZipMiddleware
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles

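# Load precomputed evaluation results and unpack them into DataFrames.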
with open("results.json", "r") as f: |
|
results = json.load(f) |
|
scores = pd.DataFrame(results["scores"]) |
|
languages = pd.DataFrame(results["languages"]) |
|
models = pd.DataFrame(results["models"]) |
|


def mean(lst):
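    """Arithmetic mean of a list, or None for an empty list."""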
    return sum(lst) / len(lst) if lst else None


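# "<task>_<metric>" columns produced by the pivots below; they feed the overall average.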
task_metrics = [
    "translation_from_bleu",
    "translation_to_bleu",
    "classification_accuracy",
    "mmlu_accuracy",
]


def make_model_table(df, models):
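    """Aggregate scores per model, pivot task metrics into columns, and rank models by average score."""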
    df = (
        df.groupby(["model", "task", "metric"])
        .agg({"score": "mean", "bcp_47": "nunique"})
        .reset_index()
    )
    df["task_metric"] = df["task"] + "_" + df["metric"]
    df = df.drop(columns=["task", "metric"])
    df = df.pivot(index="model", columns="task_metric", values="score").fillna(0)
    df["average"] = df[task_metrics].mean(axis=1)
    df = df.sort_values(by="average", ascending=False).reset_index()
    df = pd.merge(df, models, left_on="model", right_on="id", how="left")
    df["rank"] = df.index + 1
    df = df[
        [
            "rank",
            "model",
            "name",
            "provider_name",
            "hf_id",
            "creation_date",
            "size",
            "type",
            "license",
            "cost",
            "average",
            *task_metrics,
        ]
    ]
    return df


def make_language_table(df, languages):
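    """Aggregate scores per language, pivot task metrics into columns, and sort languages by speaker count."""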
    df = (
        df.groupby(["bcp_47", "task", "metric"])
        .agg({"score": "mean", "model": "nunique"})
        .reset_index()
    )
    df["task_metric"] = df["task"] + "_" + df["metric"]
    df = df.drop(columns=["task", "metric"])
    df = (
        df.pivot(index="bcp_47", columns="task_metric", values="score")
        .fillna(0)
        .reset_index()
    )
    df["average"] = df[task_metrics].mean(axis=1)
    df = pd.merge(languages, df, on="bcp_47", how="outer")
    df = df.sort_values(by="speakers", ascending=False)
    df = df[
        [
            "bcp_47",
            "language_name",
            "autonym",
            "speakers",
            "family",
            "average",
            "in_benchmark",
            *task_metrics,
        ]
    ]
    return df


app = FastAPI()

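# Allow cross-origin requests from the frontend and gzip-compress responses above 1000 bytes.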
app.add_middleware(CORSMiddleware, allow_origins=["*"])
app.add_middleware(GZipMiddleware, minimum_size=1000)


def serialize(df):
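    """Convert a DataFrame to a list of records, replacing NaN with None so it is JSON-serializable."""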
    return df.replace({np.nan: None}).to_dict(orient="records")


@app.post("/api/data") |
|
async def data(request: Request): |
|
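    """Assemble the leaderboard tables; the model and country tables honor the optional language filter, while the language and dataset tables always cover all languages."""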
    body = await request.body()
    data = json.loads(body)
    selected_languages = data.get("selectedLanguages", [])
    df = scores.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()

    language_table = make_language_table(df, languages)
    datasets_df = pd.read_json("datasets.json")
    if selected_languages:
        df = df[df["bcp_47"].isin([lang["bcp_47"] for lang in selected_languages])]
    if len(df) == 0:
        model_table = pd.DataFrame()
        countries = pd.DataFrame()
    else:
        model_table = make_model_table(df, models)
        countries = make_country_table(make_language_table(df, languages))
    all_tables = {
        "model_table": serialize(model_table),
        "language_table": serialize(language_table),
        "dataset_table": serialize(datasets_df),
        "countries": serialize(countries),
    }
    return JSONResponse(content=all_tables)


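# Serve the built frontend; the API route registered above takes precedence over the static mount at "/".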
app.mount("/", StaticFiles(directory="frontend/build", html=True), name="frontend") |
|
|
|
if __name__ == "__main__": |
|
uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 8000))) |
|
|