Commit e38dcf1 · Parent(s): 5733774 · add

Changed files:
- app.py +127 -125
- src/about.py +8 -4
- src/display/formatting.py +7 -3
- src/display/utils.py +17 -11
- src/envs.py +8 -6
- src/leaderboard/read_evals.py +72 -67
- src/populate.py +2 -2
app.py
CHANGED

@@ -28,29 +28,31 @@ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REP
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval

+import pdb

def restart_space():
    API.restart_space(repo_id=REPO_ID)

### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
+# try:
+#     print(EVAL_REQUESTS_PATH)
+#     snapshot_download(
+#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
+# try:
+#     print(EVAL_RESULTS_PATH)
+#     snapshot_download(
+#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()


LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

+pdb.set_trace()
(
    finished_eval_queue_df,
    running_eval_queue_df,

@@ -63,28 +65,28 @@ def init_leaderboard(dataframe):
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
+        # select_columns=SelectColumns(
+        #     default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+        #     cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+        #     label="Select Columns to Display:",
+        # ),
+        # search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        # filter_columns=[
+        #     ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+        #     ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+        #     ColumnFilter(
+        #         AutoEvalColumn.params.name,
+        #         type="slider",
+        #         min=0.01,
+        #         max=150,
+        #         label="Select the number of parameters (B)",
+        #     ),
+        #     ColumnFilter(
+        #         AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+        #     ),
+        # ],
+        # bool_checkboxgroup_label="Hide models",
        interactive=False,
    )

@@ -98,95 +100,95 @@ with demo:
        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
+        # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+        #     with gr.Column():
+        #         with gr.Row():
+        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+        #         with gr.Column():
+        #             with gr.Accordion(
+        #                 f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     finished_eval_table = gr.components.Dataframe(
+        #                         value=finished_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #             with gr.Accordion(
+        #                 f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     running_eval_table = gr.components.Dataframe(
+        #                         value=running_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+
+        #             with gr.Accordion(
+        #                 f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     pending_eval_table = gr.components.Dataframe(
+        #                         value=pending_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #     with gr.Row():
+        #         gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+        #     with gr.Row():
+        #         with gr.Column():
+        #             model_name_textbox = gr.Textbox(label="Model name")
+        #             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+        #             model_type = gr.Dropdown(
+        #                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+        #                 label="Model type",
+        #                 multiselect=False,
+        #                 value=None,
+        #                 interactive=True,
+        #             )
+
+        #         with gr.Column():
+        #             precision = gr.Dropdown(
+        #                 choices=[i.value.name for i in Precision if i != Precision.Unknown],
+        #                 label="Precision",
+        #                 multiselect=False,
+        #                 value="float16",
+        #                 interactive=True,
+        #             )
+        #             weight_type = gr.Dropdown(
+        #                 choices=[i.value.name for i in WeightType],
+        #                 label="Weights type",
+        #                 multiselect=False,
+        #                 value="Original",
+        #                 interactive=True,
+        #             )
+        #             base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+        #     submit_button = gr.Button("Submit Eval")
+        #     submission_result = gr.Markdown()
+        #     submit_button.click(
+        #         add_new_eval,
+        #         [
+        #             model_name_textbox,
+        #             base_model_name_textbox,
+        #             revision_name_textbox,
+        #             precision,
+        #             weight_type,
+        #             model_type,
+        #         ],
+        #         submission_result,
+        #     )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
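With the Hub sync above commented out, the Space assumes the request and result files already sit at the local paths configured in src/envs.py. A minimal sketch (not part of this commit; it reuses the same src/envs.py constants and the same huggingface_hub arguments as the commented-out block) of how that sync could still be run by hand:

    # Hypothetical helper: mirrors the requests/results datasets into the local folders.
    from huggingface_hub import snapshot_download

    from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, RESULTS_REPO, TOKEN

    def sync_local_copies():
        snapshot_download(
            repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", etag_timeout=30, token=TOKEN
        )
        snapshot_download(
            repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", etag_timeout=30, token=TOKEN
        )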
src/about.py
CHANGED

@@ -12,8 +12,12 @@ class Task:
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("
+    task0 = Task("acrostic", "EM", "Acrostic")
+    task1 = Task("crossword", "EM", "Crossword")
+    task2 = Task("cryptogram", "EM", "Cryptogram")
+    task3 = Task("logic_puzzle", "EM", "Logic Puzzle")
+    task4 = Task("sudoku", "EM", "Sudoku")
+    task5 = Task("drop_quote", "EM", "Drop Quote")

NUM_FEWSHOT = 0 # Change with your few shot
# ---------------------------------------------------

@@ -21,11 +25,11 @@ NUM_FEWSHOT = 0 # Change with your few shot


# Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">LR<sup>2</sup>Bench: Evaluating Long-chain Reflective Reasoning Capabilities of Large Language Models via Constraint Satisfaction Problems</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
-
+<strong>LR<sup>2</sup>Bench</strong> is a novel benchmark designed to evaluate the <strong>L</strong>ong-chain <strong>R</strong>eflective <strong>R</strong>easoning capabilities of LLMs. LR<sup>2</sup>Bench comprises 850 samples across six Constraint Satisfaction Problems (CSPs) where reflective reasoning is crucial for deriving solutions that meet all given constraints. Each type of task focuses on distinct constraint patterns, such as knowledge-based, logical, and spatial constraints, providing a comprehensive evaluation of diverse problem-solving scenarios.
"""

# Which evaluations are you running? how can people reproduce what you have?
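For reference, a small sketch (not part of the commit) of how these entries are consumed elsewhere in the repo; the Task field names benchmark, metric and col_name are assumed from their use in src/display/utils.py and src/leaderboard/read_evals.py — the first argument is the key looked up in a results file, the second the metric name, the third the leaderboard column header:

    from src.about import Tasks

    for task in Tasks:
        t = task.value
        print(f"results key: {t.benchmark!r:<15} metric: {t.metric!r} column: {t.col_name!r}")
    # results key: 'acrostic'       metric: 'EM' column: 'Acrostic'
    # ...
    # results key: 'drop_quote'     metric: 'EM' column: 'Drop Quote'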
src/display/formatting.py
CHANGED

@@ -2,9 +2,13 @@ def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


-def make_clickable_model(model_name):
-    link = f"https://huggingface.co/{model_name}"
-    return model_hyperlink(link, model_name)
+def make_clickable_model(model_name, still_on_hub=False):
+    if still_on_hub:
+        link = f"https://huggingface.co/{model_name}"
+        return model_hyperlink(link, model_name)
+    else:
+        return f'<span>{model_name}</span>'
+


def styled_error(error):
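A short usage sketch (hypothetical model names, not part of the commit) of the updated helper: entries flagged as still on the Hub keep the dotted-underline hyperlink, everything else falls back to plain text.

    from src.display.formatting import make_clickable_model

    print(make_clickable_model("org/public-model", still_on_hub=True))
    # -> <a target="_blank" href="https://huggingface.co/org/public-model" ...>org/public-model</a>
    print(make_clickable_model("local-only-model"))  # still_on_hub defaults to False
    # -> <span>local-only-model</span>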
src/display/utils.py
CHANGED

@@ -1,4 +1,4 @@
-from dataclasses import dataclass, make_dataclass
+from dataclasses import dataclass, make_dataclass, field
from enum import Enum

import pandas as pd

@@ -23,22 +23,27 @@ class ColumnContent:
## Leaderboard columns
auto_eval_column_dict = []
# Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+# auto_eval_column_dict.append(["model", ColumnContent, field(default_factory=lambda: ColumnContent("Model", "markdown", True, never_hidden=True))])
#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("EM ⬆️", "number", True)])
+# auto_eval_column_dict.append(["average", ColumnContent, field(default_factory=lambda: ColumnContent("Average ⬆️", "number", True))])
for task in Tasks:
    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    # auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda task=task: ColumnContent(task.value.col_name, "number", True))])
# Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+# auto_eval_column_dict.append(["model_type", ColumnContent, field(default_factory=lambda: ColumnContent("Type", "str", False))])
+# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["params", ColumnContent, field(default_factory=lambda: ColumnContent("#Params (B)", "number", False))])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

# We use make dataclass to dynamically fill the scores from Tasks
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

@@ -101,6 +106,7 @@ class Precision(Enum):
        return Precision.Unknown

# Column selection
+# import pdb; pdb.set_trace()
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
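A self-contained sketch (not part of the commit) of the pattern used above, under two assumptions: ColumnContent carries the fields name / type / displayed_by_default / hidden / never_hidden, as their use here implies, and fields() is the template's own helper that reads class attributes rather than dataclasses.fields. The defaults handed to make_dataclass become class attributes, which is what lets COLS collect the display names:

    from dataclasses import dataclass, make_dataclass

    @dataclass(frozen=True)
    class ColumnContent:            # assumed field layout
        name: str
        type: str
        displayed_by_default: bool
        hidden: bool = False
        never_hidden: bool = False

    def fields(raw_class):
        # assumed helper: walk the class attributes set by make_dataclass
        return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]

    cols = [
        ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
        ["average", ColumnContent, ColumnContent("EM ⬆️", "number", True)],
        ["params", ColumnContent, ColumnContent("#Params (B)", "number", False)],
    ]
    AutoEvalColumn = make_dataclass("AutoEvalColumn", cols, frozen=True)

    print([c.name for c in fields(AutoEvalColumn) if not c.hidden])
    # ['Model', 'EM ⬆️', '#Params (B)']

The commented field(default_factory=...) variants in the diff are likely the workaround for Python versions that reject a shared default instance whose class is unhashable; a frozen (hashable) ColumnContent, as sketched here, avoids that.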
src/envs.py
CHANGED

@@ -6,20 +6,22 @@ from huggingface_hub import HfApi
# ----------------------------------
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org

-OWNER = "
+OWNER = "UltraRonin" # Change to your org - don't forget to create a results and request dataset, with the correct format!
# ----------------------------------

REPO_ID = f"{OWNER}/leaderboard"
QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/
+RESULTS_REPO = f"{OWNER}/LR2Bench"

# If you setup a cache later, just change HF_HOME
CACHE_PATH=os.getenv("HF_HOME", ".")

# Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+# EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+EVAL_REQUESTS_PATH = "/data_jhchen/Leaderboard/Requests"
+# EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_RESULTS_PATH = "/data_jhchen/Leaderboard/LR2Bench"
+# EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+# EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

API = HfApi(token=TOKEN)
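Because the snapshot_download sync in app.py is disabled in this commit, these hard-coded directories have to exist and be populated before the Space starts. A quick check (hypothetical, not part of the commit):

    import os

    from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH

    for path in (EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH):
        ok = os.path.isdir(path) and bool(os.listdir(path))
        print(f"{path}: {'ok' if ok else 'missing or empty'}")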
src/leaderboard/read_evals.py
CHANGED

@@ -20,16 +20,16 @@ class EvalResult:
    full_model: str # org/model (path on hub)
    org: str
    model: str
-    revision: str # commit hash, "" if main
+    # revision: str # commit hash, "" if main
    results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
+    # precision: Precision = Precision.Unknown
+    # model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
+    # weight_type: WeightType = WeightType.Original # Original or Adapter
+    # architecture: str = "Unknown"
+    # license: str = "?"
+    # likes: int = 0
    num_params: int = 0
-    date: str = "" # submission date of request file
+    # date: str = "" # submission date of request file
    still_on_hub: bool = False

    @classmethod

@@ -41,7 +41,7 @@
        config = data.get("config")

        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
+        # precision = Precision.from_str(config.get("model_dtype"))

        # Get model and org
        org_and_model = config.get("model_name", config.get("model_args", None))

@@ -50,21 +50,23 @@
        if len(org_and_model) == 1:
            org = None
            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
+            # result_key = f"{model}_{precision.value.name}"
+            result_key = f"{model}"
        else:
            org = org_and_model[0]
            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
+            # result_key = f"{org}_{model}_{precision.value.name}"
+            result_key = f"{org}_{model}"
        full_model = "/".join(org_and_model)

        still_on_hub, _, model_config = is_model_on_hub(
            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
+        # architecture = "?"
+        # if model_config is not None:
+        #     architectures = getattr(model_config, "architectures", None)
+        #     if architectures:
+        #         architecture = ";".join(architectures)

        # Extract results available in this file (some results are split in several files)
        results = {}

@@ -72,11 +74,11 @@
            task = task.value

            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            accs = np.array([float(v.get(task.metric, None)) for k, v in data["results"].items() if task.benchmark == k.lower()])
            if accs.size == 0 or any([acc is None for acc in accs]):
                continue

-            mean_acc = np.mean(accs)
+            mean_acc = np.mean(accs)
            results[task.benchmark] = mean_acc

        return self(

@@ -85,45 +87,47 @@
            org=org,
            model=model,
            results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
+            # precision=precision,
+            # revision= config.get("model_sha", ""),
            still_on_hub=still_on_hub,
-            architecture=architecture
+            # architecture=architecture
        )

-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+    # def update_with_request_file(self, requests_path):
+    #     """Finds the relevant request file for the current model and updates info with it"""
+    #     # request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+    #     request_file = get_request_file_for_model(requests_path, self.full_model)
+
+    #     try:
+    #         with open(request_file, "r") as f:
+    #             request = json.load(f)
+    #         # self.model_type = ModelType.from_str(request.get("model_type", ""))
+    #         # self.weight_type = WeightType[request.get("weight_type", "Original")]
+    #         # self.license = request.get("license", "?")
+    #         # self.likes = request.get("likes", 0)
+    #         self.num_params = request.get("params", 0)
+    #         # self.date = request.get("submitted_time", "")
+    #     except Exception:
+    #         # print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+    #         print(f"Could not find request file for {self.org}/{self.model}")

    def to_dict(self):
        """Converts the Eval Result to a dict compatible with our dataframe display"""
        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
        data_dict = {
            "eval_name": self.eval_name, # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
+            # AutoEvalColumn.precision.name: self.precision.value.name,
+            # AutoEvalColumn.model_type.name: self.model_type.value.name,
+            # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+            # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            # AutoEvalColumn.architecture.name: self.architecture,
+            AutoEvalColumn.model.name: make_clickable_model(self.full_model, self.still_on_hub),
+            # AutoEvalColumn.revision.name: self.revision,
            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
+            # AutoEvalColumn.license.name: self.license,
+            # AutoEvalColumn.likes.name: self.likes,
            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
        }

        for task in Tasks:

@@ -132,26 +136,27 @@
    return data_dict


-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
+# def get_request_file_for_model(requests_path, model_name, precision):
+# def get_request_file_for_model(requests_path, model_name):
+#     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
+#     request_files = os.path.join(
+#         requests_path,
+#         f"{model_name}_eval_request_*.json",
+#     )
+#     request_files = glob.glob(request_files)
+
+#     # Select correct request file (precision)
+#     request_file = ""
+#     request_files = sorted(request_files, reverse=True)
+#     for tmp_request_file in request_files:
+#         with open(tmp_request_file, "r") as f:
+#             req_content = json.load(f)
+#             if (
+#                 req_content["status"] in ["FINISHED"]
+#                 # and req_content["precision"] == precision.split(".")[-1]
+#             ):
+#                 request_file = tmp_request_file
+#     return request_file


def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:

@@ -176,7 +181,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
    for model_result_filepath in model_result_filepaths:
        # Creation of result
        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
+        # eval_result.update_with_request_file(requests_path)

        # Store results of same eval together
        eval_name = eval_result.eval_name
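A minimal, self-contained sketch (not part of the commit) of the new score extraction above; the shape of the results JSON is assumed here — task keys such as "Sudoku" matched case-insensitively against Tasks.*.benchmark, with metric values possibly stored as strings, which is what the added float(...) cast handles:

    import numpy as np

    # hypothetical result-file payload
    data = {"results": {"Sudoku": {"EM": "0.42"}, "Crossword": {"EM": "0.10"}}}

    def mean_metric(benchmark: str, metric: str = "EM"):
        accs = np.array([
            float(v.get(metric, None))
            for k, v in data["results"].items()
            if benchmark == k.lower()
        ])
        if accs.size == 0:
            return None
        return float(np.mean(accs))

    print(mean_metric("sudoku"))    # 0.42
    print(mean_metric("acrostic"))  # None: task absent from this file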
src/populate.py
CHANGED

@@ -34,7 +34,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                data = json.load(fp)

            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            # data[EvalQueueColumn.revision.name] = data.get("revision", "main")

            all_evals.append(data)
        elif ".md" not in entry:

@@ -46,7 +46,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                    data = json.load(fp)

                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+                # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                all_evals.append(data)

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]