Commit 1301ce8 · 1 parent: 634c9ed · commit message: "add"
Files changed:
- app.py +9 -4
- src/about.py +13 -0
- src/display/css_html_js.py +0 -18
- src/display/utils.py +8 -6
- src/leaderboard/read_evals.py +23 -16
- src/populate.py +4 -4
app.py
CHANGED
```diff
@@ -50,7 +50,7 @@ def restart_space():
 # restart_space()
 
 
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS, task="Overall")
 
 (
     finished_eval_queue_df,
@@ -111,10 +111,15 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
+            # leaderboard = init_leaderboard(LEADERBOARD_DF)
+            with gr.Tabs():
+                with gr.TabItem("Overall", elem_id="overall", id=0):
+                    pdb.set_trace()
+                    leaderboard = init_leaderboard(LEADERBOARD_DF)
 
-
-
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
     # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
     #     with gr.Column():
```
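For orientation, here is a minimal sketch of how the new per-task sub-tabs could be generalized beyond "Overall", using the `get_leaderboard_df(..., task=...)` signature and the `init_leaderboard` helper from this commit. The extra task names are hypothetical, the committed `pdb.set_trace()` debug call is left out, and the fragment assumes it sits inside app.py's `with demo:` block where `get_leaderboard_df`, `init_leaderboard`, `EVAL_RESULTS_PATH`, `EVAL_REQUESTS_PATH`, `COLS`, and `BENCHMARK_COLS` are already in scope:

```python
import gradio as gr

# Hypothetical tab names; the commit itself only wires up "Overall".
TASK_TABS = ["Overall", "Sudoku", "Drop Quote"]

with gr.Tabs(elem_classes="tab-buttons") as tabs:
    with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
        with gr.Tabs():
            for i, task_name in enumerate(TASK_TABS):
                with gr.TabItem(task_name, elem_id=task_name.lower().replace(" ", "-"), id=i):
                    # One dataframe per task, same column set everywhere.
                    task_df = get_leaderboard_df(
                        EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS, task=task_name
                    )
                    leaderboard = init_leaderboard(task_df)
```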
src/about.py
CHANGED
```diff
@@ -19,6 +19,19 @@ class Tasks(Enum):
     task4 = Task("sudoku", "EM", "Sudoku")
     task5 = Task("drop_quote", "EM", "Drop Quote")
 
+@dataclass
+class Metric:
+    short: str
+    col_name: str
+
+class Metrics(Enum):
+    CR = Metric("CR", "Completion Rate")
+    S_Acc = Metric("S-Acc", "Subtask Accuracy")
+    EM = Metric("EM", "Exact Match")
+    PM_05 = Metric("PM-0.5", "Partial Match (0.5)")
+    Tokens = Metric("Tokens", "Tokens")
+
+
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
```
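A short, self-contained sketch of how the new `Metric`/`Metrics` pair behaves: each enum member maps a short metric key (as stored in result files) to a display column name. The definitions below are copied from the diff; only the demo loop at the bottom is illustrative.

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class Metric:
    short: str      # key used in the result JSON files
    col_name: str   # column header shown on the leaderboard

class Metrics(Enum):
    CR = Metric("CR", "Completion Rate")
    S_Acc = Metric("S-Acc", "Subtask Accuracy")
    EM = Metric("EM", "Exact Match")
    PM_05 = Metric("PM-0.5", "Partial Match (0.5)")
    Tokens = Metric("Tokens", "Tokens")

# Illustration only: print the short-key -> column-name mapping.
for m in Metrics:
    print(f"{m.value.short:>7} -> {m.value.col_name}")
```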
src/display/css_html_js.py
CHANGED
```diff
@@ -1,22 +1,4 @@
 custom_css = """
-/* Sort-button color, size, etc. */
-.gr-datatable .sorting:before,
-.gr-datatable .sorting:after {
-    color: #007bff !important; /* sort-icon color */
-    font-size: 16px !important; /* icon size */
-}
-
-/* Change color on hover */
-.gr-datatable .sorting:hover:before,
-.gr-datatable .sorting:hover:after {
-    color: #ff4500 !important; /* color when hovered */
-}
-
-/* Active sort icon */
-.gr-datatable .sorting_asc:before,
-.gr-datatable .sorting_desc:before {
-    color: #28a745 !important; /* green marks the current sort state */
-}
 
 .markdown-text {
     font-size: 16px !important;
```
src/display/utils.py
CHANGED
```diff
@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks
+from src.about import Tasks, Metrics
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -25,13 +25,15 @@ auto_eval_column_dict = []
 # Init
 # auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", True)])
 # auto_eval_column_dict.append(["model", ColumnContent, field(default_factory=lambda: ColumnContent("Model", "markdown", True, never_hidden=True))])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("EM ⬆️", "number", True)])
+# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("EM ⬆️", "number", True)])
 # auto_eval_column_dict.append(["average", ColumnContent, field(default_factory=lambda: ColumnContent("Average ⬆️", "number", True))])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-
+# for task in Tasks:
+#     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+for metric in Metrics:
+    auto_eval_column_dict.append([metric.name, ColumnContent, ColumnContent(metric.value.col_name, "number", True)])
 # Model information
 # auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 # auto_eval_column_dict.append(["model_type", ColumnContent, field(default_factory=lambda: ColumnContent("Type", "str", False))])
@@ -39,7 +41,7 @@ for task in Tasks:
 # auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 # auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
 # auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
 # auto_eval_column_dict.append(["params", ColumnContent, field(default_factory=lambda: ColumnContent("#Params (B)", "number", False))])
 # auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
 # auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
```
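For context, in the stock leaderboard template this `auto_eval_column_dict` list is normally converted into a frozen dataclass, so each entry becomes an attribute of `AutoEvalColumn`. The sketch below shows that pattern with the new metric columns. The `make_dataclass` conversion and the exact `ColumnContent` field names are assumptions based on the template, not shown in this diff, and the `frozen=True` on `ColumnContent` is added here only to keep the defaults hashable; the sketch expects to run inside this repo (it imports `Metrics` from `src.about`).

```python
from dataclasses import dataclass, make_dataclass

from src.about import Metrics  # added in this commit

# Assumed ColumnContent shape, matching how entries are appended above.
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", True)])
for metric in Metrics:
    auto_eval_column_dict.append([metric.name, ColumnContent, ColumnContent(metric.value.col_name, "number", True)])

# Template-style conversion: one frozen dataclass whose attributes are the columns.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print(AutoEvalColumn.EM.name)  # -> "Exact Match"
```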
src/leaderboard/read_evals.py
CHANGED
```diff
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, Metrics
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -34,7 +34,7 @@ class EvalResult:
     link: str = ''
 
     @classmethod
-    def init_from_json_file(self, json_filepath):
+    def init_from_json_file(self, json_filepath, task):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
@@ -73,16 +73,21 @@ class EvalResult:
 
         # Extract results available in this file (some results are split in several files)
         results = {}
-        for task in Tasks:
-            task = task.value
+        # for task in Tasks:
+        #     task = task.value
 
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([float(v.get(task.metric, None)) for k, v in data["results"].items() if task.benchmark == k.lower()])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
+        #     # We average all scores of a given metric (not all metrics are present in all files)
+        #     accs = np.array([float(v.get(task.metric, None)) for k, v in data["results"].items() if task.benchmark == k.lower()])
+        #     if accs.size == 0 or any([acc is None for acc in accs]):
+        #         continue
 
-            mean_acc = np.mean(accs)
-            results[task.benchmark] = mean_acc
+        #     mean_acc = np.mean(accs)
+        #     results[task.benchmark] = mean_acc
+        # import pdb; pdb.set_trace()
+        for metric in Metrics:
+            metric = metric.value
+
+            results[metric.short] = data["results"][task][metric.short]
 
         return self(
             eval_name=result_key,
@@ -118,7 +123,7 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             # AutoEvalColumn.precision.name: self.precision.value.name,
@@ -128,15 +133,17 @@ class EvalResult:
             # AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model, self.link),
             # AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
+            # AutoEvalColumn.average.name: average,
             # AutoEvalColumn.license.name: self.license,
             # AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
             # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+        # for task in Tasks:
+        #     data_dict[task.value.col_name] = self.results[task.value.benchmark]
+        for metric in Metrics:
+            data_dict[metric.value.col_name] = self.results[metric.value.short]
 
         return data_dict
 
@@ -164,7 +171,7 @@ class EvalResult:
     # return request_file
 
 
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_raw_eval_results(results_path: str, requests_path: str, task: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
    model_result_filepaths = []
 
@@ -185,7 +192,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        eval_result = EvalResult.init_from_json_file(model_result_filepath, task)
        # eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
```
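The new lookup `results[metric.short] = data["results"][task][metric.short]` implies a per-task, per-metric layout for the result JSON files. A hypothetical example of a file that `init_from_json_file(..., task="Overall")` could parse under that assumption (the numbers are made up):

```python
# Hypothetical result-file contents; only the nested "results"[task][metric] layout
# is implied by the diff, the values below are illustrative.
data = {
    "results": {
        "Overall": {
            "CR": 0.82,       # Completion Rate
            "S-Acc": 0.74,    # Subtask Accuracy
            "EM": 0.61,       # Exact Match
            "PM-0.5": 0.68,   # Partial Match (0.5)
            "Tokens": 1234,   # Tokens
        },
        # presumably one such block per task, e.g. "Sudoku", "Drop Quote", ...
    }
}

task = "Overall"
results = {}
for short in ("CR", "S-Acc", "EM", "PM-0.5", "Tokens"):
    # Mirrors: results[metric.short] = data["results"][task][metric.short]
    results[short] = data["results"][task][short]
print(results)
```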
src/populate.py
CHANGED
```diff
@@ -8,17 +8,17 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, task) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
+    raw_data = get_raw_eval_results(results_path, requests_path, task)
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[
+    df = df.sort_values(by=["Exact Match"], ascending=False)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    # df = df[has_no_nan_values(df, benchmark_cols)]
     return df
 
 
```
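A small, self-contained sketch of the dataframe post-processing that `get_leaderboard_df` now performs (sort by the "Exact Match" column, trim to the requested columns, round to two decimals), using toy records in place of real eval results:

```python
import pandas as pd

# Toy records standing in for [v.to_dict() for v in raw_data]; values are illustrative.
all_data_json = [
    {"Model": "model-a", "#Params (B)": 7.0, "Exact Match": 0.612, "Completion Rate": 0.801},
    {"Model": "model-b", "#Params (B)": 13.0, "Exact Match": 0.743, "Completion Rate": 0.852},
]
cols = ["Model", "#Params (B)", "Exact Match", "Completion Rate"]

df = pd.DataFrame.from_records(all_data_json)
df = df.sort_values(by=["Exact Match"], ascending=False)  # best EM first
df = df[cols].round(decimals=2)                           # fixed column order, 2 decimals
print(df)
```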