David Pomerenke committed on
Commit
3ed02d5
·
1 Parent(s): 2f01096

Params and license metadata from HF API

Browse files
evals/main.py CHANGED
@@ -19,6 +19,7 @@ transcription_langs_eval_detailed = languages.iloc[:5]
19
 
20
  # ===== run evaluation and aggregate results =====
21
 
 
22
  async def evaluate():
23
  print("running evaluations")
24
  results = [
@@ -26,7 +27,7 @@ async def evaluate():
26
  for task in tasks
27
  for i in range(n_sentences)
28
  for original_language in langs_eval.itertuples()
29
- for model in models
30
  if original_language.in_benchmark
31
  and (
32
  model == model_fast
@@ -35,6 +36,7 @@ async def evaluate():
35
  ]
36
  return await tqdm_asyncio.gather(*results, miniters=1)
37
 
 
38
  def aggregate(results):
39
  results = pd.DataFrame([r for rs in results for r in rs])
40
  results = (
@@ -58,32 +60,39 @@ def aggregate(results):
58
  )
59
  return results, lang_results, model_results, task_results
60
 
 
61
  def mean(lst):
62
  return sum(lst) / len(lst) if lst else None
63
 
64
 
65
  def fmt_name(s):
66
- return " ".join(w.capitalize() for w in s.split("-")).replace("Gpt", "GPT").replace("ai", "AI")
 
 
 
 
 
67
 
68
  def serialize(df):
69
  return df.replace({np.nan: None}).to_dict(orient="records")
70
 
71
- def make_model_table(model_results):
72
- model_results["task_metric"] = model_results["task"] + "_" + model_results["metric"]
73
- model_results = model_results.drop(columns=["task", "metric"])
74
- model_table = model_results.pivot(
75
- index="model", columns="task_metric", values="score"
76
- ).fillna(0)
77
- model_table["average"] = model_table.mean(axis=1)
78
- model_table = model_table.sort_values(by="average", ascending=False)
79
- model_table = model_table.round(2).reset_index()
80
- model_table["provider"] = model_table["model"].str.split("/").str[0].apply(fmt_name)
81
- model_table["model"] = model_table["model"].str.split("/").str[1].apply(fmt_name)
82
- model_table["rank"] = model_table.index + 1
83
- model_table = model_table[
84
- ["rank", "provider", "model", "average", *model_table.columns[1:-3]]
85
- ]
86
- return model_table
 
87
 
88
 
89
  async def main():
@@ -97,7 +106,7 @@ async def main():
97
  }
98
  with open("results.json", "w") as f:
99
  json.dump(all_results, f, indent=2, ensure_ascii=False)
100
-
101
  model_table = make_model_table(model_results)
102
  all_tables = {
103
  "model_table": serialize(model_table),
 
19
 
20
  # ===== run evaluation and aggregate results =====
21
 
22
+
23
  async def evaluate():
24
  print("running evaluations")
25
  results = [
 
27
  for task in tasks
28
  for i in range(n_sentences)
29
  for original_language in langs_eval.itertuples()
30
+ for model in models["id"]
31
  if original_language.in_benchmark
32
  and (
33
  model == model_fast
 
36
  ]
37
  return await tqdm_asyncio.gather(*results, miniters=1)
38
 
39
+
40
  def aggregate(results):
41
  results = pd.DataFrame([r for rs in results for r in rs])
42
  results = (
 
60
  )
61
  return results, lang_results, model_results, task_results
62
 
63
+
64
def mean(lst):
    """Return the arithmetic mean of ``lst``, or ``None`` if ``lst`` is empty."""
    # Guard first: an empty sequence has no mean and would divide by zero.
    if not lst:
        return None
    total = sum(lst)
    return total / len(lst)
66
 
67
 
68
def fmt_name(s):
    """Turn a hyphen-separated identifier into a display name.

    The identifier is split on ``-``, every part is capitalised (first
    character upper-case, the rest lower-case) and the parts are joined with
    single spaces. Two acronyms are then restored: ``Gpt`` becomes ``GPT``,
    and an ``ai`` that is not followed by a lowercase letter becomes ``AI``
    (``openai`` -> ``OpenAI``). An ``ai`` in the middle of a word is left
    alone (``haiku`` -> ``Haiku``).

    Parameters
    ----------
    s : str
        Identifier such as ``"gpt-4o-mini"`` or ``"mistralai"``.

    Returns
    -------
    str
        The formatted display name.
    """
    import re  # local import so the function does not rely on file-level imports

    name = " ".join(part.capitalize() for part in s.split("-"))
    name = name.replace("Gpt", "GPT")
    # A plain str.replace("ai", "AI") would also rewrite "Haiku" to "HAIku";
    # the negative lookahead limits the change to an "ai" that ends a run of
    # lowercase letters.
    return re.sub(r"ai(?![a-z])", "AI", name)
74
+
75
 
76
def serialize(df):
    """Convert a DataFrame into a list of per-row dicts.

    ``NaN`` entries are swapped for ``None`` before the conversion, then the
    frame is exported with ``to_dict(orient="records")``.
    """
    without_nan = df.replace({np.nan: None})
    records = without_nan.to_dict(orient="records")
    return records
78
 
79
+
80
def make_model_table(df):
    """Build the ranked per-model leaderboard table.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format scores. The columns ``model``, ``task``, ``metric`` and
        ``score`` are read. The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per model, ordered by descending ``average``, with the
        columns ``rank``, ``provider``, ``model``, ``hf_id``,
        ``creation_date``, ``size``, ``type``, ``license``, ``average`` and
        one column per ``<task>_<metric>`` combination. The metadata columns
        come from a left merge with the module-level ``models`` table.
    """
    # assign() returns a new frame, so the caller's DataFrame does not gain a
    # stray "task_metric" column as a side effect.
    scores = df.assign(task_metric=df["task"] + "_" + df["metric"]).drop(
        columns=["task", "metric"]
    )
    task_metrics = scores["task_metric"].unique()

    # Combinations without a score are filled with 0 and therefore pull the
    # average down.
    table = scores.pivot(index="model", columns="task_metric", values="score").fillna(0)
    table["average"] = table[task_metrics].mean(axis=1)
    table = table.sort_values(by="average", ascending=False).reset_index()

    score_columns = [*task_metrics, "average"]
    table[score_columns] = table[score_columns].round(2)

    # how="left" keeps every scored model (and the sort order) even when no
    # metadata row matches.
    table = pd.merge(table, models, left_on="model", right_on="id", how="left")
    table["creation_date"] = table["creation_date"].dt.strftime("%Y-%m-%d")

    # Model ids are read as "<provider>/<name>"; split once and reuse both parts.
    id_parts = table["model"].str.split("/")
    table["provider"] = id_parts.str[0].apply(fmt_name)
    table["model"] = id_parts.str[1].apply(fmt_name)

    # The frame is already sorted and freshly indexed, so position + 1 is the rank.
    table["rank"] = table.index + 1
    return table[
        [
            "rank",
            "provider",
            "model",
            "hf_id",
            "creation_date",
            "size",
            "type",
            "license",
            "average",
            *task_metrics,
        ]
    ]
96
 
97
 
98
  async def main():
 
106
  }
107
  with open("results.json", "w") as f:
108
  json.dump(all_results, f, indent=2, ensure_ascii=False)
109
+
110
  model_table = make_model_table(model_results)
111
  all_tables = {
112
  "model_table": serialize(model_table),
evals/models.py CHANGED
@@ -1,11 +1,13 @@
1
  from os import getenv
2
 
 
3
  from aiolimiter import AsyncLimiter
4
  from dotenv import load_dotenv
5
  from elevenlabs import AsyncElevenLabs
6
- from huggingface_hub import AsyncInferenceClient
7
  from joblib.memory import Memory
8
  from openai import AsyncOpenAI
 
9
 
10
  # for development purposes, all languages will be evaluated on the fast models
11
  # and only a sample of languages will be evaluated on all models
@@ -80,3 +82,35 @@ async def transcribe(path, model="elevenlabs/scribe_v1"):
80
  return await transcribe_huggingface(path, model)
81
  case _:
82
  raise ValueError(f"Model {model} not supported")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from os import getenv
2
 
3
+ import pandas as pd
4
  from aiolimiter import AsyncLimiter
5
  from dotenv import load_dotenv
6
  from elevenlabs import AsyncElevenLabs
7
+ from huggingface_hub import AsyncInferenceClient, HfApi
8
  from joblib.memory import Memory
9
  from openai import AsyncOpenAI
10
+ from requests import HTTPError
11
 
12
  # for development purposes, all languages will be evaluated on the fast models
13
  # and only a sample of languages will be evaluated on all models
 
82
  return await transcribe_huggingface(path, model)
83
  case _:
84
  raise ValueError(f"Model {model} not supported")
85
+
86
+
87
# Wrap the model ids in a one-column table so that further per-model
# columns can be added to it.
models = pd.DataFrame(data=models, columns=["id"])

# Client object used for the Hugging Face Hub lookups.
api = HfApi()
90
+
91
def _format_license(raw):
    """Return a display string for a model-card license value, or ``None``."""
    if isinstance(raw, (list, tuple)):
        # NOTE(review): some cards may list several licenses - confirm the
        # desired rendering; they are joined with ", " here.
        parts = [_format_license(item) for item in raw]
        return ", ".join(p for p in parts if p) or None
    if not isinstance(raw, str):
        return None
    # Title-case first and restore the acronym afterwards: str.title()
    # lower-cases every non-initial letter, so upper-casing "mit" before the
    # call would come back as "Mit".
    return raw.replace("_", " ").title().replace("Mit", "MIT")


def get_metadata(id):
    """Look up Hugging Face Hub metadata for one model id.

    Parameters
    ----------
    id : str
        Model identifier, passed unchanged to ``api.model_info``.

    Returns
    -------
    dict
        A dict with the keys ``hf_id``, ``creation_date``, ``size``, ``type``
        and ``license``. If the lookup raises ``HTTPError``, ``type`` is
        ``"Commercial"`` and every other value is ``None``. Otherwise
        ``type`` is ``"Open"``; ``size`` is ``None`` when the repository
        reports no safetensors summary, and ``license`` is ``None`` when the
        model card has no license entry.
    """
    try:
        info = api.model_info(id)
    except HTTPError:
        # A model the Hub cannot return is reported as a commercial model.
        return {
            "hf_id": None,
            "creation_date": None,
            "size": None,
            "type": "Commercial",
            "license": None,
        }

    # Both the model card and the safetensors summary can be absent; a
    # missing value is reported as None instead of raising AttributeError.
    raw_license = getattr(info.card_data, "license", None)
    safetensors = info.safetensors
    return {
        "hf_id": info.id,
        "creation_date": info.created_at,
        "size": safetensors.total if safetensors is not None else None,
        "type": "Open",
        "license": _format_license(raw_license),
    }
110
+
111
# Fetch the metadata once per model and fan the resulting dicts out into
# columns, so that each model costs a single Hub request.
metadata = models["id"].apply(get_metadata)
for column in ("hf_id", "creation_date", "size", "type", "license"):
    models[column] = metadata.str[column]
# Convert to datetimes so the column supports the .dt accessor.
models["creation_date"] = pd.to_datetime(models["creation_date"])
frontend/public/results.json CHANGED
@@ -4,6 +4,11 @@
4
  "rank": 1,
5
  "provider": "Google",
6
  "model": "Gemini 2.0 Flash 001",
 
 
 
 
 
7
  "average": 0.72,
8
  "classification_accuracy": 0.87,
9
  "language_modeling_chrf": 0.96,
@@ -14,6 +19,11 @@
14
  "rank": 2,
15
  "provider": "Google",
16
  "model": "Gemma 3 27b It",
 
 
 
 
 
17
  "average": 0.65,
18
  "classification_accuracy": 0.72,
19
  "language_modeling_chrf": 0.96,
@@ -24,6 +34,11 @@
24
  "rank": 3,
25
  "provider": "OpenAI",
26
  "model": "GPT 4o Mini",
 
 
 
 
 
27
  "average": 0.6,
28
  "classification_accuracy": 0.52,
29
  "language_modeling_chrf": 0.95,
@@ -34,6 +49,11 @@
34
  "rank": 4,
35
  "provider": "MistralAI",
36
  "model": "Mistral Small 24b Instruct 2501",
 
 
 
 
 
37
  "average": 0.58,
38
  "classification_accuracy": 0.55,
39
  "language_modeling_chrf": 0.86,
@@ -44,6 +64,11 @@
44
  "rank": 5,
45
  "provider": "Meta Llama",
46
  "model": "Llama 3.3 70b Instruct",
 
 
 
 
 
47
  "average": 0.56,
48
  "classification_accuracy": 0.5,
49
  "language_modeling_chrf": 0.94,
 
4
  "rank": 1,
5
  "provider": "Google",
6
  "model": "Gemini 2.0 Flash 001",
7
+ "hf_id": null,
8
+ "creation_date": null,
9
+ "size": null,
10
+ "type": "Commercial",
11
+ "license": null,
12
  "average": 0.72,
13
  "classification_accuracy": 0.87,
14
  "language_modeling_chrf": 0.96,
 
19
  "rank": 2,
20
  "provider": "Google",
21
  "model": "Gemma 3 27b It",
22
+ "hf_id": "google/gemma-3-27b-it",
23
+ "creation_date": "2025-03-01",
24
+ "size": 27432406640.0,
25
+ "type": "Open",
26
+ "license": "Gemma",
27
  "average": 0.65,
28
  "classification_accuracy": 0.72,
29
  "language_modeling_chrf": 0.96,
 
34
  "rank": 3,
35
  "provider": "OpenAI",
36
  "model": "GPT 4o Mini",
37
+ "hf_id": null,
38
+ "creation_date": null,
39
+ "size": null,
40
+ "type": "Commercial",
41
+ "license": null,
42
  "average": 0.6,
43
  "classification_accuracy": 0.52,
44
  "language_modeling_chrf": 0.95,
 
49
  "rank": 4,
50
  "provider": "MistralAI",
51
  "model": "Mistral Small 24b Instruct 2501",
52
+ "hf_id": "mistralai/Mistral-Small-24B-Instruct-2501",
53
+ "creation_date": "2025-01-28",
54
+ "size": 23572403200.0,
55
+ "type": "Open",
56
+ "license": "Apache-2.0",
57
  "average": 0.58,
58
  "classification_accuracy": 0.55,
59
  "language_modeling_chrf": 0.86,
 
64
  "rank": 5,
65
  "provider": "Meta Llama",
66
  "model": "Llama 3.3 70b Instruct",
67
+ "hf_id": "meta-llama/Llama-3.3-70B-Instruct",
68
+ "creation_date": "2024-11-26",
69
+ "size": 70553706496.0,
70
+ "type": "Open",
71
+ "license": "Llama3.3",
72
  "average": 0.56,
73
  "classification_accuracy": 0.5,
74
  "language_modeling_chrf": 0.94,
frontend/src/components/ModelTable.js CHANGED
@@ -32,15 +32,40 @@ const ModelTable = ({ data }) => {
32
  );
33
  };
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  return (
36
- <DataTable value={table} header={<>AI Models</>} sortField="average" removableSort filters={filters} filterDisplay="menu">
37
  <Column field="rank" body={rankBodyTemplate} />
38
- <Column field="provider" header="Provider" filter filterElement={providerRowFilterTemplate} showFilterMatchModes={false} />
39
- <Column field="model" header="Model" filter showFilterMatchModes={false} />
40
- <Column field="average" header="Average" sortable />
41
- <Column field="translation_chrf" header="Translation" sortable />
42
- <Column field="classification_accuracy" header="Classification" sortable />
43
- <Column field="language_modeling_chrf" header="Language Modeling" sortable />
 
 
44
  </DataTable>
45
  );
46
  };
 
32
  );
33
  };
34
 
35
+ const sizeBodyTemplate = (rowData) => {
36
+ const size = rowData.size;
37
+ if (size === null) {
38
+ return <div>N/A</div>;
39
+ }
40
+ let sizeStr;
41
+ if (size < 1000) {
42
+ sizeStr = size.toFixed(0) + "";
43
+ } else if (size < 1000 * 1000) {
44
+ sizeStr = (size / 1000).toFixed(0) + "K";
45
+ } else if (size < 1000 * 1000 * 1000) {
46
+ sizeStr = (size / 1000 / 1000).toFixed(0) + "M";
47
+ } else {
48
+ sizeStr = (size / 1000 / 1000 / 1000).toFixed(0) + "B";
49
+ }
50
+ return <div>{sizeStr}</div>;
51
+ };
52
+
53
+ const modelBodyTemplate = (rowData) => {
54
+ // bold
55
+ return <div style={{ fontWeight: 'bold' }}>{rowData.model}</div>;
56
+ };
57
+
58
  return (
59
+ <DataTable value={table} header={<>AI Models</>} sortField="average" removableSort filters={filters} filterDisplay="menu" scrollable scrollHeight="500px">
60
  <Column field="rank" body={rankBodyTemplate} />
61
+ <Column field="provider" header="Provider" filter filterElement={providerRowFilterTemplate} showFilterMatchModes={false} style={{ minWidth: '5rem' }} />
62
+ <Column field="model" header="Model" filter showFilterMatchModes={false} style={{ minWidth: '15rem' }} body={modelBodyTemplate} />
63
+ <Column field="type" header="Type" style={{ minWidth: '10rem' }} />
64
+ <Column field="size" header="Size" sortable body={sizeBodyTemplate} style={{ minWidth: '5rem' }} />
65
+ <Column field="average" header="Average" sortable style={{ minWidth: '5rem' }} />
66
+ <Column field="translation_chrf" header="Translation" sortable style={{ minWidth: '5rem' }} />
67
+ <Column field="classification_accuracy" header="Classification" sortable style={{ minWidth: '5rem' }} />
68
+ <Column field="language_modeling_chrf" header="Language Modeling" sortable style={{ minWidth: '5rem' }} />
69
  </DataTable>
70
  );
71
  };