UltraRonin committed
Commit 1301ce8 · 1 Parent(s): 634c9ed
app.py CHANGED
@@ -50,7 +50,7 @@ def restart_space():
 # restart_space()
 
 
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS, task="Overall")
 
 (
     finished_eval_queue_df,
@@ -111,10 +111,15 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
+            # leaderboard = init_leaderboard(LEADERBOARD_DF)
+            with gr.Tabs():
+                with gr.TabItem("Overall", elem_id="overall", id=0):
+                    pdb.set_trace()
+                    leaderboard = init_leaderboard(LEADERBOARD_DF)
 
-        # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
         #     with gr.Column():
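For orientation, here is a minimal, self-contained sketch of the nested-tab layout this hunk introduces. `gr.Dataframe` and the dummy DataFrame stand in for this Space's `init_leaderboard` and `get_leaderboard_df(..., task="Overall")`; they are illustrative assumptions, not code from this commit.

```python
import gradio as gr
import pandas as pd

# Dummy stand-in for LEADERBOARD_DF = get_leaderboard_df(..., task="Overall")
LEADERBOARD_DF = pd.DataFrame({"Model": ["model-a", "model-b"], "Exact Match": [71.3, 65.8]})

with gr.Blocks() as demo:
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            # Inner tab set: one sub-tab per task, "Overall" first.
            with gr.Tabs():
                with gr.TabItem("Overall", elem_id="overall", id=0):
                    gr.Dataframe(LEADERBOARD_DF)  # init_leaderboard(LEADERBOARD_DF) in the real app
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown("About text goes here (LLM_BENCHMARKS_TEXT in the real app).")

if __name__ == "__main__":
    demo.launch()
```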
src/about.py CHANGED
@@ -19,6 +19,19 @@ class Tasks(Enum):
     task4 = Task("sudoku", "EM", "Sudoku")
     task5 = Task("drop_quote", "EM", "Drop Quote")
 
+@dataclass
+class Metric:
+    short: str
+    col_name: str
+
+class Metrics(Enum):
+    CR = Metric("CR", "Completion Rate")
+    S_Acc = Metric("S-Acc", "Subtask Accuracy")
+    EM = Metric("EM", "Exact Match")
+    PM_05 = Metric("PM-0.5", "Partial Match (0.5)")
+    Tokens = Metric("Tokens", "Tokens")
+
+
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
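For reference, the new Metric/Metrics pair can be exercised on its own as below; the imports are repeated so the snippet is self-contained (about.py itself is assumed to already import `dataclass` and `Enum` for `Task`/`Tasks`, as in the stock leaderboard template).

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class Metric:
    short: str     # key used in the per-task results JSON, e.g. "EM"
    col_name: str  # human-readable leaderboard column, e.g. "Exact Match"

class Metrics(Enum):
    CR = Metric("CR", "Completion Rate")
    S_Acc = Metric("S-Acc", "Subtask Accuracy")
    EM = Metric("EM", "Exact Match")
    PM_05 = Metric("PM-0.5", "Partial Match (0.5)")
    Tokens = Metric("Tokens", "Tokens")

# Map JSON keys to display column names, the way utils.py and read_evals.py consume the enum.
print({m.value.short: m.value.col_name for m in Metrics})
# {'CR': 'Completion Rate', 'S-Acc': 'Subtask Accuracy', 'EM': 'Exact Match', ...}
```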
src/display/css_html_js.py CHANGED
@@ -1,22 +1,4 @@
1
  custom_css = """
2
- /* 修改排序按钮颜色、大小等 */
3
- .gr-datatable .sorting:before,
4
- .gr-datatable .sorting:after {
5
- color: #007bff !important; /* 修改排序图标颜色 */
6
- font-size: 16px !important; /* 调整大小 */
7
- }
8
-
9
- /* 鼠标悬停时改变颜色 */
10
- .gr-datatable .sorting:hover:before,
11
- .gr-datatable .sorting:hover:after {
12
- color: #ff4500 !important; /* 悬停时变色 */
13
- }
14
-
15
- /* 激活的排序图标 */
16
- .gr-datatable .sorting_asc:before,
17
- .gr-datatable .sorting_desc:before {
18
- color: #28a745 !important; /* 绿色代表当前排序状态 */
19
- }
20
 
21
  .markdown-text {
22
  font-size: 16px !important;
 
1
  custom_css = """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  .markdown-text {
4
  font-size: 16px !important;
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks
+from src.about import Tasks, Metrics
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -25,13 +25,15 @@ auto_eval_column_dict = []
 # Init
 # auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", True)])
 # auto_eval_column_dict.append(["model", ColumnContent, field(default_factory=lambda: ColumnContent("Model", "markdown", True, never_hidden=True))])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("EM ⬆️", "number", True)])
+# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("EM ⬆️", "number", True)])
 # auto_eval_column_dict.append(["average", ColumnContent, field(default_factory=lambda: ColumnContent("Average ⬆️", "number", True))])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-    # auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda task=task: ColumnContent(task.value.col_name, "number", True))])
+# for task in Tasks:
+#     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+for metric in Metrics:
+    auto_eval_column_dict.append([metric.name, ColumnContent, ColumnContent(metric.value.col_name, "number", True)])
 # Model information
 # auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 # auto_eval_column_dict.append(["model_type", ColumnContent, field(default_factory=lambda: ColumnContent("Type", "str", False))])
@@ -39,7 +41,7 @@ for task in Tasks:
 # auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 # auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
 # auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
 # auto_eval_column_dict.append(["params", ColumnContent, field(default_factory=lambda: ColumnContent("#Params (B)", "number", False))])
 # auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
 # auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
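As a sketch of where the per-metric columns above end up: the stock leaderboard template turns `auto_eval_column_dict` into a frozen dataclass via `make_dataclass`, so each metric becomes an addressable column. The abbreviated `Metrics` and the frozen `ColumnContent` below are assumptions made so the snippet runs standalone; the real definitions live in src/about.py and src/display/utils.py.

```python
from dataclasses import dataclass, fields, make_dataclass
from enum import Enum

@dataclass(frozen=True)  # frozen here so instances are valid dataclass defaults on Python 3.11+
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

@dataclass
class Metric:
    short: str
    col_name: str

class Metrics(Enum):  # abbreviated; the full set is defined in src/about.py
    CR = Metric("CR", "Completion Rate")
    EM = Metric("EM", "Exact Match")

auto_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["params", ColumnContent, ColumnContent("#Params (B)", "number", True)],
]
for metric in Metrics:
    auto_eval_column_dict.append([metric.name, ColumnContent, ColumnContent(metric.value.col_name, "number", True)])

# Freeze the list into a class so columns are addressable as attributes, e.g. AutoEvalColumn.EM.name.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print([f.default.name for f in fields(AutoEvalColumn)])
# ['Model', '#Params (B)', 'Completion Rate', 'Exact Match']
print(AutoEvalColumn.EM.name)  # 'Exact Match'
```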
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, Metrics
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -34,7 +34,7 @@ class EvalResult:
     link: str = ''
 
     @classmethod
-    def init_from_json_file(self, json_filepath):
+    def init_from_json_file(self, json_filepath, task):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
@@ -73,16 +73,21 @@ class EvalResult:
 
         # Extract results available in this file (some results are split in several files)
         results = {}
-        for task in Tasks:
-            task = task.value
+        # for task in Tasks:
+        #     task = task.value
 
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([float(v.get(task.metric, None)) for k, v in data["results"].items() if task.benchmark == k.lower()])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
+        #     # We average all scores of a given metric (not all metrics are present in all files)
+        #     accs = np.array([float(v.get(task.metric, None)) for k, v in data["results"].items() if task.benchmark == k.lower()])
+        #     if accs.size == 0 or any([acc is None for acc in accs]):
+        #         continue
 
-            mean_acc = np.mean(accs)
-            results[task.benchmark] = mean_acc
+        #     mean_acc = np.mean(accs)
+        #     results[task.benchmark] = mean_acc
+        # import pdb; pdb.set_trace()
+        for metric in Metrics:
+            metric = metric.value
+
+            results[metric.short] = data["results"][task][metric.short]
 
         return self(
             eval_name=result_key,
@@ -118,7 +123,7 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             # AutoEvalColumn.precision.name: self.precision.value.name,
@@ -128,15 +133,17 @@ class EvalResult:
             # AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model, self.link),
             # AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
+            # AutoEvalColumn.average.name: average,
             # AutoEvalColumn.license.name: self.license,
             # AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
             # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+        # for task in Tasks:
+        #     data_dict[task.value.col_name] = self.results[task.value.benchmark]
+        for metric in Metrics:
+            data_dict[metric.value.col_name] = self.results[metric.value.short]
 
         return data_dict
 
@@ -164,7 +171,7 @@ class EvalResult:
 #     return request_file
 
 
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_raw_eval_results(results_path: str, requests_path: str, task: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
 
@@ -185,7 +192,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        eval_result = EvalResult.init_from_json_file(model_result_filepath, task)
         # eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
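The new lookup `data["results"][task][metric.short]` implies that each result file stores one block per task with one entry per metric short name. A sketch of that assumed layout (contents and scores are illustrative only):

```python
# Assumed shape of a single model's result JSON, inferred from the lookup above.
example_result = {
    "results": {
        "Overall": {
            "CR": 62.4,      # Completion Rate
            "S-Acc": 55.1,   # Subtask Accuracy
            "EM": 48.7,      # Exact Match
            "PM-0.5": 53.0,  # Partial Match (0.5)
            "Tokens": 1842,  # Tokens
        },
        # ...presumably one block per task, which is what would feed extra tabs beyond "Overall"
    }
}

task = "Overall"
scores = {short: example_result["results"][task][short]
          for short in ("CR", "S-Acc", "EM", "PM-0.5", "Tokens")}
print(scores)  # the same dict that init_from_json_file builds into `results`
```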
src/populate.py CHANGED
@@ -8,17 +8,17 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, task) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
+    raw_data = get_raw_eval_results(results_path, requests_path, task)
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df.sort_values(by=["Exact Match"], ascending=False)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    # df = df[has_no_nan_values(df, benchmark_cols)]
     return df
 
 
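Taken together, app.py can now build one leaderboard DataFrame per task tab. A minimal usage sketch, assuming the paths and column lists come from the stock leaderboard template modules (src/envs.py and src/display/utils.py) and that each task key exists in every results file:

```python
from src.display.utils import COLS, BENCHMARK_COLS            # assumed location, as in the stock template
from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH    # assumed location, as in the stock template
from src.populate import get_leaderboard_df

# "Overall" is the only tab wired up in this commit; more task keys would feed more sub-tabs.
TASK_TABS = ["Overall"]

leaderboard_dfs = {
    task: get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS, task=task)
    for task in TASK_TABS
}
print(leaderboard_dfs["Overall"].head())
```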