UltraRonin committed · Commit e38dcf1 · 1 Parent(s): 5733774
app.py CHANGED
@@ -28,29 +28,31 @@ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REP
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
  from src.submission.submit import add_new_eval


  def restart_space():
  API.restart_space(repo_id=REPO_ID)

  ### Space initialisation
- try:
- print(EVAL_REQUESTS_PATH)
- snapshot_download(
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
- )
- except Exception:
- restart_space()
- try:
- print(EVAL_RESULTS_PATH)
- snapshot_download(
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
- )
- except Exception:
- restart_space()


  LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

  (
  finished_eval_queue_df,
  running_eval_queue_df,
@@ -63,28 +65,28 @@ def init_leaderboard(dataframe):
  return Leaderboard(
  value=dataframe,
  datatype=[c.type for c in fields(AutoEvalColumn)],
- select_columns=SelectColumns(
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
- label="Select Columns to Display:",
- ),
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
- filter_columns=[
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
- ColumnFilter(
- AutoEvalColumn.params.name,
- type="slider",
- min=0.01,
- max=150,
- label="Select the number of parameters (B)",
- ),
- ColumnFilter(
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
- ),
- ],
- bool_checkboxgroup_label="Hide models",
  interactive=False,
  )

@@ -98,95 +100,95 @@ with demo:
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
  leaderboard = init_leaderboard(LEADERBOARD_DF)

- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
- with gr.Column():
- with gr.Row():
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
- with gr.Column():
- with gr.Accordion(
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
- open=False,
- ):
- with gr.Row():
- finished_eval_table = gr.components.Dataframe(
- value=finished_eval_queue_df,
- headers=EVAL_COLS,
- datatype=EVAL_TYPES,
- row_count=5,
- )
- with gr.Accordion(
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
- open=False,
- ):
- with gr.Row():
- running_eval_table = gr.components.Dataframe(
- value=running_eval_queue_df,
- headers=EVAL_COLS,
- datatype=EVAL_TYPES,
- row_count=5,
- )
-
- with gr.Accordion(
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
- open=False,
- ):
- with gr.Row():
- pending_eval_table = gr.components.Dataframe(
- value=pending_eval_queue_df,
- headers=EVAL_COLS,
- datatype=EVAL_TYPES,
- row_count=5,
- )
- with gr.Row():
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
- with gr.Row():
- with gr.Column():
- model_name_textbox = gr.Textbox(label="Model name")
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
- model_type = gr.Dropdown(
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
- label="Model type",
- multiselect=False,
- value=None,
- interactive=True,
- )
-
- with gr.Column():
- precision = gr.Dropdown(
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
- label="Precision",
- multiselect=False,
- value="float16",
- interactive=True,
- )
- weight_type = gr.Dropdown(
- choices=[i.value.name for i in WeightType],
- label="Weights type",
- multiselect=False,
- value="Original",
- interactive=True,
- )
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
- submit_button = gr.Button("Submit Eval")
- submission_result = gr.Markdown()
- submit_button.click(
- add_new_eval,
- [
- model_name_textbox,
- base_model_name_textbox,
- revision_name_textbox,
- precision,
- weight_type,
- model_type,
- ],
- submission_result,
- )

  with gr.Row():
  with gr.Accordion("📙 Citation", open=False):
 
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
  from src.submission.submit import add_new_eval

+ import pdb

  def restart_space():
  API.restart_space(repo_id=REPO_ID)

  ### Space initialisation
+ # try:
+ # print(EVAL_REQUESTS_PATH)
+ # snapshot_download(
+ # repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+ # )
+ # except Exception:
+ # restart_space()
+ # try:
+ # print(EVAL_RESULTS_PATH)
+ # snapshot_download(
+ # repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+ # )
+ # except Exception:
+ # restart_space()


  LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

+ pdb.set_trace()
  (
  finished_eval_queue_df,
  running_eval_queue_df,

  return Leaderboard(
  value=dataframe,
  datatype=[c.type for c in fields(AutoEvalColumn)],
+ # select_columns=SelectColumns(
+ # default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+ # cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+ # label="Select Columns to Display:",
+ # ),
+ # search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+ # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+ # filter_columns=[
+ # ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+ # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+ # ColumnFilter(
+ # AutoEvalColumn.params.name,
+ # type="slider",
+ # min=0.01,
+ # max=150,
+ # label="Select the number of parameters (B)",
+ # ),
+ # ColumnFilter(
+ # AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+ # ),
+ # ],
+ # bool_checkboxgroup_label="Hide models",
  interactive=False,
  )

  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
  leaderboard = init_leaderboard(LEADERBOARD_DF)

+ # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+ # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+ # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+ # with gr.Column():
+ # with gr.Row():
+ # gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+ # with gr.Column():
+ # with gr.Accordion(
+ # f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+ # open=False,
+ # ):
+ # with gr.Row():
+ # finished_eval_table = gr.components.Dataframe(
+ # value=finished_eval_queue_df,
+ # headers=EVAL_COLS,
+ # datatype=EVAL_TYPES,
+ # row_count=5,
+ # )
+ # with gr.Accordion(
+ # f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+ # open=False,
+ # ):
+ # with gr.Row():
+ # running_eval_table = gr.components.Dataframe(
+ # value=running_eval_queue_df,
+ # headers=EVAL_COLS,
+ # datatype=EVAL_TYPES,
+ # row_count=5,
+ # )
+
+ # with gr.Accordion(
+ # f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+ # open=False,
+ # ):
+ # with gr.Row():
+ # pending_eval_table = gr.components.Dataframe(
+ # value=pending_eval_queue_df,
+ # headers=EVAL_COLS,
+ # datatype=EVAL_TYPES,
+ # row_count=5,
+ # )
+ # with gr.Row():
+ # gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+ # with gr.Row():
+ # with gr.Column():
+ # model_name_textbox = gr.Textbox(label="Model name")
+ # revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+ # model_type = gr.Dropdown(
+ # choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+ # label="Model type",
+ # multiselect=False,
+ # value=None,
+ # interactive=True,
+ # )
+
+ # with gr.Column():
+ # precision = gr.Dropdown(
+ # choices=[i.value.name for i in Precision if i != Precision.Unknown],
+ # label="Precision",
+ # multiselect=False,
+ # value="float16",
+ # interactive=True,
+ # )
+ # weight_type = gr.Dropdown(
+ # choices=[i.value.name for i in WeightType],
+ # label="Weights type",
+ # multiselect=False,
+ # value="Original",
+ # interactive=True,
+ # )
+ # base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+ # submit_button = gr.Button("Submit Eval")
+ # submission_result = gr.Markdown()
+ # submit_button.click(
+ # add_new_eval,
+ # [
+ # model_name_textbox,
+ # base_model_name_textbox,
+ # revision_name_textbox,
+ # precision,
+ # weight_type,
+ # model_type,
+ # ],
+ # submission_result,
+ # )

  with gr.Row():
  with gr.Accordion("📙 Citation", open=False):
src/about.py CHANGED
@@ -12,8 +12,12 @@ class Task:
  # ---------------------------------------------------
  class Tasks(Enum):
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
- task0 = Task("anli_r1", "acc", "ANLI")
- task1 = Task("logiqa", "acc_norm", "LogiQA")

  NUM_FEWSHOT = 0 # Change with your few shot
  # ---------------------------------------------------
@@ -21,11 +25,11 @@ NUM_FEWSHOT = 0 # Change with your few shot


  # Your leaderboard name
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""

  # What does your leaderboard evaluate?
  INTRODUCTION_TEXT = """
- Intro text
  """

  # Which evaluations are you running? how can people reproduce what you have?

  # ---------------------------------------------------
  class Tasks(Enum):
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+ task0 = Task("acrostic", "EM", "Acrostic")
+ task1 = Task("crossword", "EM", "Crossword")
+ task2 = Task("cryptogram", "EM", "Cryptogram")
+ task3 = Task("logic_puzzle", "EM", "Logic Puzzle")
+ task4 = Task("sudoku", "EM", "Sudoku")
+ task5 = Task("drop_quote", "EM", "Drop Quote")

  NUM_FEWSHOT = 0 # Change with your few shot
  # ---------------------------------------------------


  # Your leaderboard name
+ TITLE = """<h1 align="center" id="space-title">LR<sup>2</sup>Bench: Evaluating Long-chain Reflective Reasoning Capabilities of Large Language Models via Constraint Satisfaction Problems</h1>"""

  # What does your leaderboard evaluate?
  INTRODUCTION_TEXT = """
+ <strong>LR<sup>2</sup>Bench</strong> is a novel benchmark designed to evaluate the <strong>L</strong>ong-chain <strong>R</strong>eflective <strong>R</strong>easoning capabilities of LLMs. LR<sup>2</sup>Bench comprises 850 samples across six Constraint Satisfaction Problems (CSPs) where reflective reasoning is crucial for deriving solutions that meet all given constraints. Each type of task focuses on distinct constraint patterns, such as knowledge-based, logical, and spatial constraints, providing a comprehensive evaluation of diverse problem-solving scenarios.
  """

  # Which evaluations are you running? how can people reproduce what you have?
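
Note: each Task triple above is (task_key in the results JSON, metric_key, display name). src/display/utils.py below turns every member into a numeric leaderboard column, and src/leaderboard/read_evals.py reads the matching metric. A minimal sketch of how the enum is consumed, assuming the template's usual "from src.about import Tasks" import:

    from src.about import Tasks

    # Each member's value is a Task(benchmark, metric, col_name), e.g. for sudoku:
    # Tasks.task4.value.benchmark == "sudoku", .metric == "EM", .col_name == "Sudoku"
    for task in Tasks:
        print(task.value.benchmark, task.value.metric, task.value.col_name)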
src/display/formatting.py CHANGED
@@ -2,9 +2,13 @@ def model_hyperlink(link, model_name):
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


- def make_clickable_model(model_name):
- link = f"https://huggingface.co/{model_name}"
- return model_hyperlink(link, model_name)


  def styled_error(error):

  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


+ def make_clickable_model(model_name, still_on_hub=False):
+ if still_on_hub:
+ link = f"https://huggingface.co/{model_name}"
+ return model_hyperlink(link, model_name)
+ else:
+ return f'<span>{model_name}</span>'
+


  def styled_error(error):
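
A short usage sketch of the new signature (the model names here are hypothetical): entries still on the Hub keep the dotted-underline hyperlink, everything else falls back to plain text.

    from src.display.formatting import make_clickable_model

    make_clickable_model("some-org/live-model", still_on_hub=True)     # -> <a href="https://huggingface.co/some-org/live-model" ...>
    make_clickable_model("some-org/removed-model", still_on_hub=False) # -> <span>some-org/removed-model</span>

Note that src/populate.py (below) still calls make_clickable_model(data["model"]) without the second argument, so queue entries now render as plain text rather than links.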
src/display/utils.py CHANGED
@@ -1,4 +1,4 @@
- from dataclasses import dataclass, make_dataclass
  from enum import Enum

  import pandas as pd
@@ -23,22 +23,27 @@ class ColumnContent:
  ## Leaderboard columns
  auto_eval_column_dict = []
  # Init
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
  #Scores
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
  for task in Tasks:
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
  # Model information
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

  # We use make dataclass to dynamically fill the scores from Tasks
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -101,6 +106,7 @@ class Precision(Enum):
  return Precision.Unknown

  # Column selection
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]

+ from dataclasses import dataclass, make_dataclass, field
  from enum import Enum

  import pandas as pd

  ## Leaderboard columns
  auto_eval_column_dict = []
  # Init
+ # auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+ # auto_eval_column_dict.append(["model", ColumnContent, field(default_factory=lambda: ColumnContent("Model", "markdown", True, never_hidden=True))])
  #Scores
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("EM ⬆️", "number", True)])
+ # auto_eval_column_dict.append(["average", ColumnContent, field(default_factory=lambda: ColumnContent("Average ⬆️", "number", True))])
  for task in Tasks:
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+ # auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda task=task: ColumnContent(task.value.col_name, "number", True))])
  # Model information
+ # auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+ # auto_eval_column_dict.append(["model_type", ColumnContent, field(default_factory=lambda: ColumnContent("Type", "str", False))])
+ # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+ # auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+ # auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+ # auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+ # auto_eval_column_dict.append(["params", ColumnContent, field(default_factory=lambda: ColumnContent("#Params (B)", "number", False))])
+ # auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+ # auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+ # auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

  # We use make dataclass to dynamically fill the scores from Tasks
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

  return Precision.Unknown

  # Column selection
+ # import pdb; pdb.set_trace()
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
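
The field(default_factory=...) variants left commented out above are the form newer Python versions require: since 3.11, dataclasses rejects an unhashable instance (such as a plain ColumnContent, an unfrozen dataclass) as a field default with "mutable default ... is not allowed". A minimal sketch of that variant, assuming the template's ColumnContent definition, which this commit does not change:

    from dataclasses import dataclass, field, make_dataclass

    @dataclass
    class ColumnContent:  # as defined earlier in this file by the leaderboard template
        name: str
        type: str
        displayed_by_default: bool
        hidden: bool = False
        never_hidden: bool = False

    # On Python 3.11+ a bare ColumnContent(...) default raises ValueError;
    # wrapping it in default_factory sidesteps the mutable-default check.
    AutoEvalColumn = make_dataclass(
        "AutoEvalColumn",
        [("model", ColumnContent, field(default_factory=lambda: ColumnContent("Model", "markdown", True, never_hidden=True)))],
        frozen=True,
    )

The plain-default form kept in this commit works on older Python versions.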
src/envs.py CHANGED
@@ -6,20 +6,22 @@ from huggingface_hub import HfApi
  # ----------------------------------
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org

- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
  # ----------------------------------

  REPO_ID = f"{OWNER}/leaderboard"
  QUEUE_REPO = f"{OWNER}/requests"
- RESULTS_REPO = f"{OWNER}/results"

  # If you setup a cache later, just change HF_HOME
  CACHE_PATH=os.getenv("HF_HOME", ".")

  # Local caches
- EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
- EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
- EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
- EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

  API = HfApi(token=TOKEN)

  # ----------------------------------
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org

+ OWNER = "UltraRonin" # Change to your org - don't forget to create a results and request dataset, with the correct format!
  # ----------------------------------

  REPO_ID = f"{OWNER}/leaderboard"
  QUEUE_REPO = f"{OWNER}/requests"
+ RESULTS_REPO = f"{OWNER}/LR2Bench"

  # If you setup a cache later, just change HF_HOME
  CACHE_PATH=os.getenv("HF_HOME", ".")

  # Local caches
+ # EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+ EVAL_REQUESTS_PATH = "/data_jhchen/Leaderboard/Requests"
+ # EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+ EVAL_RESULTS_PATH = "/data_jhchen/Leaderboard/LR2Bench"
+ # EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+ # EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

  API = HfApi(token=TOKEN)
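
The request/result paths are now hard-coded to absolute, machine-specific directories (and the corresponding snapshot_download calls in app.py are commented out), so this configuration only runs where /data_jhchen exists. A hypothetical, more portable variant that keeps the template defaults unless an override is provided (the environment-variable names below are illustrative, not part of this commit):

    import os

    CACHE_PATH = os.getenv("HF_HOME", ".")
    # Fall back to the local HF cache layout unless an explicit path is supplied.
    EVAL_REQUESTS_PATH = os.getenv("EVAL_REQUESTS_PATH", os.path.join(CACHE_PATH, "eval-queue"))
    EVAL_RESULTS_PATH = os.getenv("EVAL_RESULTS_PATH", os.path.join(CACHE_PATH, "eval-results"))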
src/leaderboard/read_evals.py CHANGED
@@ -20,16 +20,16 @@ class EvalResult:
  full_model: str # org/model (path on hub)
  org: str
  model: str
- revision: str # commit hash, "" if main
  results: dict
- precision: Precision = Precision.Unknown
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
- weight_type: WeightType = WeightType.Original # Original or Adapter
- architecture: str = "Unknown"
- license: str = "?"
- likes: int = 0
  num_params: int = 0
- date: str = "" # submission date of request file
  still_on_hub: bool = False

  @classmethod
@@ -41,7 +41,7 @@ class EvalResult:
  config = data.get("config")

  # Precision
- precision = Precision.from_str(config.get("model_dtype"))

  # Get model and org
  org_and_model = config.get("model_name", config.get("model_args", None))
@@ -50,21 +50,23 @@ class EvalResult:
  if len(org_and_model) == 1:
  org = None
  model = org_and_model[0]
- result_key = f"{model}_{precision.value.name}"
  else:
  org = org_and_model[0]
  model = org_and_model[1]
- result_key = f"{org}_{model}_{precision.value.name}"
  full_model = "/".join(org_and_model)

  still_on_hub, _, model_config = is_model_on_hub(
  full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
  )
- architecture = "?"
- if model_config is not None:
- architectures = getattr(model_config, "architectures", None)
- if architectures:
- architecture = ";".join(architectures)

  # Extract results available in this file (some results are split in several files)
  results = {}
@@ -72,11 +74,11 @@ class EvalResult:
  task = task.value

  # We average all scores of a given metric (not all metrics are present in all files)
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
  if accs.size == 0 or any([acc is None for acc in accs]):
  continue

- mean_acc = np.mean(accs) * 100.0
  results[task.benchmark] = mean_acc

  return self(
@@ -85,45 +87,47 @@ class EvalResult:
  org=org,
  model=model,
  results=results,
- precision=precision,
- revision= config.get("model_sha", ""),
  still_on_hub=still_on_hub,
- architecture=architecture
  )

- def update_with_request_file(self, requests_path):
- """Finds the relevant request file for the current model and updates info with it"""
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
- try:
- with open(request_file, "r") as f:
- request = json.load(f)
- self.model_type = ModelType.from_str(request.get("model_type", ""))
- self.weight_type = WeightType[request.get("weight_type", "Original")]
- self.license = request.get("license", "?")
- self.likes = request.get("likes", 0)
- self.num_params = request.get("params", 0)
- self.date = request.get("submitted_time", "")
- except Exception:
- print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")

  def to_dict(self):
  """Converts the Eval Result to a dict compatible with our dataframe display"""
  average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
  data_dict = {
  "eval_name": self.eval_name, # not a column, just a save name,
- AutoEvalColumn.precision.name: self.precision.value.name,
- AutoEvalColumn.model_type.name: self.model_type.value.name,
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
- AutoEvalColumn.architecture.name: self.architecture,
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
- AutoEvalColumn.revision.name: self.revision,
  AutoEvalColumn.average.name: average,
- AutoEvalColumn.license.name: self.license,
- AutoEvalColumn.likes.name: self.likes,
  AutoEvalColumn.params.name: self.num_params,
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
  }

  for task in Tasks:
@@ -132,26 +136,27 @@ class EvalResult:
  return data_dict


- def get_request_file_for_model(requests_path, model_name, precision):
- """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
- request_files = os.path.join(
- requests_path,
- f"{model_name}_eval_request_*.json",
- )
- request_files = glob.glob(request_files)
-
- # Select correct request file (precision)
- request_file = ""
- request_files = sorted(request_files, reverse=True)
- for tmp_request_file in request_files:
- with open(tmp_request_file, "r") as f:
- req_content = json.load(f)
- if (
- req_content["status"] in ["FINISHED"]
- and req_content["precision"] == precision.split(".")[-1]
- ):
- request_file = tmp_request_file
- return request_file


  def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
@@ -176,7 +181,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
  for model_result_filepath in model_result_filepaths:
  # Creation of result
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
- eval_result.update_with_request_file(requests_path)

  # Store results of same eval together
  eval_name = eval_result.eval_name
 
  full_model: str # org/model (path on hub)
  org: str
  model: str
+ # revision: str # commit hash, "" if main
  results: dict
+ # precision: Precision = Precision.Unknown
+ # model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
+ # weight_type: WeightType = WeightType.Original # Original or Adapter
+ # architecture: str = "Unknown"
+ # license: str = "?"
+ # likes: int = 0
  num_params: int = 0
+ # date: str = "" # submission date of request file
  still_on_hub: bool = False

  @classmethod

  config = data.get("config")

  # Precision
+ # precision = Precision.from_str(config.get("model_dtype"))

  # Get model and org
  org_and_model = config.get("model_name", config.get("model_args", None))

  if len(org_and_model) == 1:
  org = None
  model = org_and_model[0]
+ # result_key = f"{model}_{precision.value.name}"
+ result_key = f"{model}"
  else:
  org = org_and_model[0]
  model = org_and_model[1]
+ # result_key = f"{org}_{model}_{precision.value.name}"
+ result_key = f"{org}_{model}"
  full_model = "/".join(org_and_model)

  still_on_hub, _, model_config = is_model_on_hub(
  full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
  )
+ # architecture = "?"
+ # if model_config is not None:
+ # architectures = getattr(model_config, "architectures", None)
+ # if architectures:
+ # architecture = ";".join(architectures)

  # Extract results available in this file (some results are split in several files)
  results = {}

  task = task.value

  # We average all scores of a given metric (not all metrics are present in all files)
+ accs = np.array([float(v.get(task.metric, None)) for k, v in data["results"].items() if task.benchmark == k.lower()])
  if accs.size == 0 or any([acc is None for acc in accs]):
  continue

+ mean_acc = np.mean(accs)
  results[task.benchmark] = mean_acc

  return self(

  org=org,
  model=model,
  results=results,
+ # precision=precision,
+ # revision= config.get("model_sha", ""),
  still_on_hub=still_on_hub,
+ # architecture=architecture
  )

+ # def update_with_request_file(self, requests_path):
+ # """Finds the relevant request file for the current model and updates info with it"""
+ # # request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+ # request_file = get_request_file_for_model(requests_path, self.full_model)
+
+ # try:
+ # with open(request_file, "r") as f:
+ # request = json.load(f)
+ # # self.model_type = ModelType.from_str(request.get("model_type", ""))
+ # # self.weight_type = WeightType[request.get("weight_type", "Original")]
+ # # self.license = request.get("license", "?")
+ # # self.likes = request.get("likes", 0)
+ # self.num_params = request.get("params", 0)
+ # # self.date = request.get("submitted_time", "")
+ # except Exception:
+ # # print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+ # print(f"Could not find request file for {self.org}/{self.model}")

  def to_dict(self):
  """Converts the Eval Result to a dict compatible with our dataframe display"""
  average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
  data_dict = {
  "eval_name": self.eval_name, # not a column, just a save name,
+ # AutoEvalColumn.precision.name: self.precision.value.name,
+ # AutoEvalColumn.model_type.name: self.model_type.value.name,
+ # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+ # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+ # AutoEvalColumn.architecture.name: self.architecture,
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model, self.still_on_hub),
+ # AutoEvalColumn.revision.name: self.revision,
  AutoEvalColumn.average.name: average,
+ # AutoEvalColumn.license.name: self.license,
+ # AutoEvalColumn.likes.name: self.likes,
  AutoEvalColumn.params.name: self.num_params,
+ # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
  }

  for task in Tasks:

  return data_dict


+ # def get_request_file_for_model(requests_path, model_name, precision):
+ # def get_request_file_for_model(requests_path, model_name):
+ # """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
+ # request_files = os.path.join(
+ # requests_path,
+ # f"{model_name}_eval_request_*.json",
+ # )
+ # request_files = glob.glob(request_files)
+
+ # # Select correct request file (precision)
+ # request_file = ""
+ # request_files = sorted(request_files, reverse=True)
+ # for tmp_request_file in request_files:
+ # with open(tmp_request_file, "r") as f:
+ # req_content = json.load(f)
+ # if (
+ # req_content["status"] in ["FINISHED"]
+ # # and req_content["precision"] == precision.split(".")[-1]
+ # ):
+ # request_file = tmp_request_file
+ # return request_file


  def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:

  for model_result_filepath in model_result_filepaths:
  # Creation of result
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
+ # eval_result.update_with_request_file(requests_path)

  # Store results of same eval together
  eval_name = eval_result.eval_name
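
With the reworked parsing, init_from_json_file matches each Task.benchmark against the lower-cased keys of data["results"] and coerces the "EM" value to float, so a per-model results file is expected to look roughly like this (contents and scores are hypothetical; only the schema is implied by the code):

    # Hypothetical results JSON, written as a Python dict for readability.
    example_result_file = {
        "config": {
            "model_name": "ExampleOrg/ExampleModel",  # split on "/" into org and model
            "model_sha": "main",
        },
        "results": {
            # Keys are matched case-insensitively against Task.benchmark:
            # acrostic, crossword, cryptogram, logic_puzzle, sudoku, drop_quote.
            "Sudoku": {"EM": 0.42},
            "Crossword": {"EM": 0.18},
        },
    }

Because float(v.get(task.metric, None)) is applied before the None check, a matching entry that lacks the "EM" field raises a TypeError rather than being skipped, so results files are expected to always carry the metric.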
src/populate.py CHANGED
@@ -34,7 +34,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
  data = json.load(fp)

  data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")

  all_evals.append(data)
  elif ".md" not in entry:
@@ -46,7 +46,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
  data = json.load(fp)

  data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
  all_evals.append(data)

  pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]

  data = json.load(fp)

  data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+ # data[EvalQueueColumn.revision.name] = data.get("revision", "main")

  all_evals.append(data)
  elif ".md" not in entry:

  data = json.load(fp)

  data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+ # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
  all_evals.append(data)

  pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]