Carol-gutianle committed
Commit 77f128a · Parent(s): c8d20dc

init

Files changed:
- .gitignore +5 -4
- app.py +124 -160
- eval-queue/.gitattributes +55 -0
- eval-queue/gpt-4v/results.json +1 -0
- eval-queue/internvl/results.json +1 -0
- eval-results/.gitattributes +55 -0
- eval-results/gpt-4v/results.json +24 -0
- eval-results/internvl/results.json +24 -0
- src/about.py +15 -4
- src/display/utils.py +16 -16
- src/leaderboard/read_evals.py +14 -14
- src/populate.py +9 -9
.gitignore CHANGED

```diff
@@ -5,9 +5,10 @@ __pycache__/
 .ipynb_checkpoints
 *ipynb
 .vscode/
+.huggingface
 
-eval-queue/
-eval-results/
-eval-queue-bk/
-eval-results-bk/
+# eval-queue/
+# eval-results/
+# eval-queue-bk/
+# eval-results-bk/
 logs/
```
app.py CHANGED

```diff
@@ -34,29 +34,13 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-
-
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
 
 (
     finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
+    # running_eval_queue_df,
+    # pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
@@ -64,13 +48,10 @@ leaderboard_df = original_df.copy()
 def update_table(
     hidden_df: pd.DataFrame,
     columns: list,
-    type_query: list,
-    precision_query: str,
-    size_query: list,
     show_deleted: bool,
     query: str,
 ):
-    filtered_df = filter_models(hidden_df
+    filtered_df = filter_models(hidden_df)
     filtered_df = filter_queries(query, filtered_df)
     df = select_columns(filtered_df, columns)
     return df
@@ -82,7 +63,7 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
 
 def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     always_here_cols = [
-        AutoEvalColumn.model_type_symbol.name,
+        # AutoEvalColumn.model_type_symbol.name,
         AutoEvalColumn.model.name,
     ]
     # We use COLS to maintain sorting
@@ -105,30 +86,18 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
     if len(final_df) > 0:
         filtered_df = pd.concat(final_df)
         filtered_df = filtered_df.drop_duplicates(
-            subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
+            # subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
+            subset=[AutoEvalColumn.model.name]
         )
 
     return filtered_df
 
 
 def filter_models(
-    df: pd.DataFrame
+    df: pd.DataFrame
 ) -> pd.DataFrame:
     # Show all models
-        filtered_df = df
-    else: # Show only still on the hub models
-        filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
-
-    type_emoji = [t[0] for t in type_query]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-
-    numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
-    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-    mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
-    filtered_df = filtered_df.loc[mask]
-
+    filtered_df = df
     return filtered_df
 
 
@@ -138,7 +107,7 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅
+        with gr.TabItem("🏅 MLLMGuard(ASD)", elem_id="llm-benchmark-tab-table", id=0):
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
@@ -167,29 +136,29 @@ with demo:
                         deleted_models_visibility = gr.Checkbox(
                             value=False, label="Show gated/private/deleted models", interactive=True
                         )
-                with gr.Column(min_width=320):
+                # with gr.Column(min_width=320):
+                #     #with gr.Box(elem_id="box-filter"):
+                #     filter_columns_type = gr.CheckboxGroup(
+                #         label="Model types",
+                #         choices=[t.to_str() for t in ModelType],
+                #         value=[t.to_str() for t in ModelType],
+                #         interactive=True,
+                #         elem_id="filter-columns-type",
+                #     )
+                #     filter_columns_precision = gr.CheckboxGroup(
+                #         label="Precision",
+                #         choices=[i.value.name for i in Precision],
+                #         value=[i.value.name for i in Precision],
+                #         interactive=True,
+                #         elem_id="filter-columns-precision",
+                #     )
+                #     filter_columns_size = gr.CheckboxGroup(
+                #         label="Model sizes (in billions of parameters)",
+                #         choices=list(NUMERIC_INTERVALS.keys()),
+                #         value=list(NUMERIC_INTERVALS.keys()),
+                #         interactive=True,
+                #         elem_id="filter-columns-size",
+                #     )
 
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df[
@@ -215,23 +184,17 @@ with demo:
             [
                 hidden_leaderboard_table_for_search,
                 shown_columns,
-                filter_columns_type,
-                filter_columns_precision,
-                filter_columns_size,
                 deleted_models_visibility,
                 search_bar,
             ],
             leaderboard_table,
         )
-        for selector in [shown_columns,
+        for selector in [shown_columns, deleted_models_visibility]:
             selector.change(
                 update_table,
                 [
                     hidden_leaderboard_table_for_search,
                     shown_columns,
-                    filter_columns_type,
-                    filter_columns_precision,
-                    filter_columns_size,
                     deleted_models_visibility,
                     search_bar,
                 ],
@@ -239,95 +202,95 @@ with demo:
                 queue=True,
             )
 
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-            with gr.Row():
-            with gr.Row():
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-            )
+        # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+        #     with gr.Column():
+        #         with gr.Row():
+        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+        #         with gr.Column():
+        #             with gr.Accordion(
+        #                 f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     finished_eval_table = gr.components.Dataframe(
+        #                         value=finished_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #             with gr.Accordion(
+        #                 f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     running_eval_table = gr.components.Dataframe(
+        #                         value=running_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+
+        #             with gr.Accordion(
+        #                 f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     pending_eval_table = gr.components.Dataframe(
+        #                         value=pending_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #     with gr.Row():
+        #         gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+        #     with gr.Row():
+        #         with gr.Column():
+        #             model_name_textbox = gr.Textbox(label="Model name")
+        #             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+        #             model_type = gr.Dropdown(
+        #                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+        #                 label="Model type",
+        #                 multiselect=False,
+        #                 value=None,
+        #                 interactive=True,
+        #             )
+
+        #         with gr.Column():
+        #             precision = gr.Dropdown(
+        #                 choices=[i.value.name for i in Precision if i != Precision.Unknown],
+        #                 label="Precision",
+        #                 multiselect=False,
+        #                 value="float16",
+        #                 interactive=True,
+        #             )
+        #             weight_type = gr.Dropdown(
+        #                 choices=[i.value.name for i in WeightType],
+        #                 label="Weights type",
+        #                 multiselect=False,
+        #                 value="Original",
+        #                 interactive=True,
+        #             )
+        #             base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+        #     submit_button = gr.Button("Submit Eval")
+        #     submission_result = gr.Markdown()
+        #     submit_button.click(
+        #         add_new_eval,
+        #         [
+        #             model_name_textbox,
+        #             base_model_name_textbox,
+        #             revision_name_textbox,
+        #             precision,
+        #             weight_type,
+        #             model_type,
+        #         ],
+        #         submission_result,
+        #     )
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
@@ -342,4 +305,5 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
+# demo.launch(server_name="127.0.0.1", server_port=7855, debug=True)
```
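Taken together, the app.py changes strip the Space down to a single leaderboard tab: the startup `snapshot_download` calls, the model-type/precision/size filters, the About tab and the submission form are removed or commented out, so `update_table` only applies the free-text search and the column checkboxes. A rough standalone sketch of that reduced pipeline, using plain pandas stand-ins rather than the Space's own helpers (the column names and the `filter_queries` body here are illustrative assumptions, not the repo's code):

```python
import pandas as pd


def filter_models(df: pd.DataFrame) -> pd.DataFrame:
    # Mirrors the new filter_models: show all models, no type/precision/size filtering.
    return df


def filter_queries(query: str, df: pd.DataFrame) -> pd.DataFrame:
    # Simplified stand-in for the search-bar filter: keep rows whose Model matches.
    if not query:
        return df
    return df[df["Model"].str.contains(query, case=False, na=False)]


def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    # "Model" is always shown; everything else follows the column checkboxes.
    always_here_cols = ["Model"]
    extra = [c for c in columns if c in df.columns and c not in always_here_cols]
    return df[always_here_cols + extra]


def update_table(hidden_df: pd.DataFrame, columns: list, show_deleted: bool, query: str) -> pd.DataFrame:
    filtered_df = filter_models(hidden_df)
    filtered_df = filter_queries(query, filtered_df)
    return select_columns(filtered_df, columns)


if __name__ == "__main__":
    demo_df = pd.DataFrame(
        {"Model": ["gpt-4v", "internvl"], "ASD ⬇️": [0.25, 0.31], "Privacy": [0.25, 0.37]}
    )
    print(update_table(demo_df, columns=["ASD ⬇️"], show_deleted=False, query="gpt"))
```

In the real app the hidden full DataFrame comes from `get_leaderboard_df`, and after this commit `show_deleted` is still accepted by `update_table` but no longer passed to `filter_models`.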
eval-queue/.gitattributes ADDED

```diff
@@ -0,0 +1,55 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.lz4 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Audio files - uncompressed
+*.pcm filter=lfs diff=lfs merge=lfs -text
+*.sam filter=lfs diff=lfs merge=lfs -text
+*.raw filter=lfs diff=lfs merge=lfs -text
+# Audio files - compressed
+*.aac filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+# Image files - uncompressed
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+# Image files - compressed
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
```
eval-queue/gpt-4v/results.json ADDED

```diff
@@ -0,0 +1 @@
+{"model": "gpt-4v"}
```
eval-queue/internvl/results.json ADDED

```diff
@@ -0,0 +1 @@
+{"model": "internvl", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-21T18:10:08Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 0, "params": 0.1, "license": "custom"}
```
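These queue entries are what `get_evaluation_queue_df` walks over (see the src/populate.py diff below): each `results.json` under `eval-queue/` is parsed with `json.load`, the `model` field is wrapped as a clickable Hub link, and the rows are collected into one DataFrame restricted to `EVAL_COLS`. A minimal standalone sketch of that flow, with a stand-in `make_clickable_model` and an assumed `EVAL_COLS = ["model"]` (the real helpers live in the Space's `src` package):

```python
import glob
import json
import os

import pandas as pd

# Assumption: after this commit only the "model" column survives in the queue view.
EVAL_COLS = ["model"]


def make_clickable_model(model_name: str) -> str:
    # Stand-in for the Space's make_clickable_model helper:
    # render the model name as a markdown link to its Hub page.
    return f"[{model_name}](https://huggingface.co/{model_name})"


def load_queue(save_path: str) -> pd.DataFrame:
    """Collect every eval-queue/*/results.json into one DataFrame."""
    all_evals = []
    for path in glob.glob(os.path.join(save_path, "*", "results.json")):
        with open(path) as fp:
            data = json.load(fp)
        data["model"] = make_clickable_model(data["model"])
        all_evals.append(data)
    return pd.DataFrame.from_records(all_evals, columns=EVAL_COLS)


if __name__ == "__main__":
    print(load_queue("eval-queue"))
```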
eval-results/.gitattributes ADDED

```diff
@@ -0,0 +1,55 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.lz4 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Audio files - uncompressed
+*.pcm filter=lfs diff=lfs merge=lfs -text
+*.sam filter=lfs diff=lfs merge=lfs -text
+*.raw filter=lfs diff=lfs merge=lfs -text
+# Audio files - compressed
+*.aac filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+# Image files - uncompressed
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+# Image files - compressed
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
```
eval-results/gpt-4v/results.json ADDED

```diff
@@ -0,0 +1,24 @@
+{
+    "config": {
+        "model_dtype": "torch.bfloat16",
+        "model_name": "gpt-4v",
+        "model_sha": "ac3299b02780836378b9e1e68c6eead546e89f90"
+    },
+    "results": {
+        "asd_privacy": {
+            "asd": 0.2500
+        },
+        "asd_bias": {
+            "asd": 0.1944
+        },
+        "asd_toxicity": {
+            "asd": 0.3247
+        },
+        "asd_truthfulness": {
+            "asd": 0.2115
+        },
+        "asd_legality": {
+            "asd": 0.2542
+        }
+    }
+}
```
eval-results/internvl/results.json ADDED

```diff
@@ -0,0 +1,24 @@
+{
+    "config": {
+        "model_dtype": "torch.bfloat16",
+        "model_name": "internvl",
+        "model_sha": "ac3299b02780836378b9e1e68c6eead546e89f90"
+    },
+    "results": {
+        "asd_privacy": {
+            "asd": 0.3657
+        },
+        "asd_bias": {
+            "asd": 0.3129
+        },
+        "asd_toxicity": {
+            "asd": 0.3285
+        },
+        "asd_truthfulness": {
+            "asd": 0.2050
+        },
+        "asd_legality": {
+            "asd": 0.3278
+        }
+    }
+}
```
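These per-dimension scores feed the single averaged column on the leaderboard: as the src/leaderboard/read_evals.py diff below shows, the five `asd` values are summed and divided by `len(Tasks)`. A small self-contained check of that arithmetic against the two files added here (paths relative to the Space root):

```python
import json

TASK_KEYS = ["asd_privacy", "asd_bias", "asd_toxicity", "asd_truthfulness", "asd_legality"]


def average_asd(path: str) -> float:
    """Mirror of the averaging in EvalResult.to_dict: mean of the per-task 'asd' scores."""
    with open(path) as fp:
        results = json.load(fp)["results"]
    scores = [results[key]["asd"] for key in TASK_KEYS]
    return sum(scores) / len(TASK_KEYS)


if __name__ == "__main__":
    # ≈ 0.247 for gpt-4v and ≈ 0.308 for internvl; lower is better,
    # which is why src/populate.py below sorts ascending on this average.
    for model in ["gpt-4v", "internvl"]:
        print(model, round(average_asd(f"eval-results/{model}/results.json"), 4))
```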
src/about.py CHANGED

```diff
@@ -12,8 +12,11 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("
+    task0 = Task("asd_privacy", "asd", "Privacy")
+    task1 = Task("asd_bias", "asd", "Bias")
+    task2 = Task("asd_toxicity", "asd", "Toxicity")
+    task3 = Task("asd_truthfulness", "asd", "Truthfulness")
+    task4 = Task("asd_legality", "asd", "Legality")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,11 +24,11 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">MLLMGuard Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
+MLLMGuard is a multi-dimensional safety evaluation suite for MLLMs, including a bilingual image-text evaluation dataset, inference utilities, and a set of lightweight evaluators.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
@@ -69,4 +72,12 @@ If everything is done, check you can launch the EleutherAIHarness on your model
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
+@misc{gu2024mllmguard,
+    title={MLLMGuard: A Multi-dimensional Safety Evaluation Suite for Multimodal Large Language Models},
+    author={Tianle Gu and Zeyang Zhou and Kexin Huang and Dandan Liang and Yixu Wang and Haiquan Zhao and Yuanqi Yao and Xingge Qiao and Keqing Wang and Yujiu Yang and Yan Teng and Yu Qiao and Yingchun Wang},
+    year={2024},
+    eprint={2406.07594},
+    archivePrefix={arXiv},
+    primaryClass={cs.CR}
+}
 """
```
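The five new `Tasks` entries are all the rest of the Space needs to know about MLLMGuard's dimensions: each one maps a key in `results.json` to a metric key and a display column. A standalone sketch of how they are consumed, assuming a `Task` dataclass with fields `benchmark`, `metric` and `col_name` (only `col_name` is visible in this commit, via `task.value.col_name` in src/display/utils.py; the other field names are assumptions):

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str   # key in results.json, e.g. "asd_privacy" (field name assumed)
    metric: str      # metric key inside that entry, e.g. "asd" (field name assumed)
    col_name: str    # column title shown on the leaderboard


class Tasks(Enum):
    task0 = Task("asd_privacy", "asd", "Privacy")
    task1 = Task("asd_bias", "asd", "Bias")
    task2 = Task("asd_toxicity", "asd", "Toxicity")
    task3 = Task("asd_truthfulness", "asd", "Truthfulness")
    task4 = Task("asd_legality", "asd", "Legality")


# The display layer iterates the enum to build one leaderboard column per task.
BENCHMARK_COLS = [task.value.col_name for task in Tasks]
print(BENCHMARK_COLS)  # ['Privacy', 'Bias', 'Toxicity', 'Truthfulness', 'Legality']
```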
src/display/utils.py CHANGED

```diff
@@ -23,22 +23,22 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=
+# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=False)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("ASD ⬇️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -47,11 +47,11 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
     model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
-    status = ColumnContent("status", "str", True)
+    # revision = ColumnContent("revision", "str", True)
+    # private = ColumnContent("private", "bool", True)
+    # precision = ColumnContent("precision", "str", True)
+    # weight_type = ColumnContent("weight_type", "str", "Original")
+    # status = ColumnContent("status", "str", True)
 
 ## All the model information that we might need
 @dataclass
```
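With the extra metadata columns commented out, the dynamically built `AutoEvalColumn` now carries only the model link, the averaged `ASD ⬇️` score and one numeric field per task. A minimal sketch of the `make_dataclass` pattern used above, with a simplified `ColumnContent` (the real one has more flags such as `hidden`; it is marked frozen here only so its instances can serve as field defaults on any Python version):

```python
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    # Simplified stand-in for the ColumnContent dataclass in src/display/utils.py.
    name: str
    type: str
    displayed_by_default: bool
    never_hidden: bool = False


# Same shape as the file above: a list of [attribute_name, type, default value].
auto_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["average", ColumnContent, ColumnContent("ASD ⬇️", "number", True)],
    ["task0", ColumnContent, ColumnContent("Privacy", "number", True)],
]

# make_dataclass turns each entry into a class attribute whose default is the
# populated ColumnContent, so column metadata is reachable as AutoEvalColumn.<attr>.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.model.name)    # Model
print(AutoEvalColumn.average.name)  # ASD ⬇️
```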
src/leaderboard/read_evals.py CHANGED

```diff
@@ -112,18 +112,18 @@ class EvalResult:
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
+            # AutoEvalColumn.precision.name: self.precision.value.name,
+            # AutoEvalColumn.model_type.name: self.model_type.value.name,
+            # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+            # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            # AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
+            # AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            # AutoEvalColumn.license.name: self.license,
+            # AutoEvalColumn.likes.name: self.likes,
+            # AutoEvalColumn.params.name: self.num_params,
+            # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
         for task in Tasks:
@@ -164,10 +164,10 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
             continue
 
         # Sort the files by date
-        try:
-        except dateutil.parser._parser.ParserError:
+        # try:
+        #     files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        # except dateutil.parser._parser.ParserError:
+        #     files = [files[-1]]
 
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
```
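After this change an `EvalResult.to_dict` row keeps just the save name, the clickable model and the averaged score, plus the per-task columns filled in by the loop below the hunk. A hedged standalone approximation of the resulting record for the gpt-4v results file above (the link helper is a stand-in, and the column titles come from the new entries in src/about.py):

```python
import json


def make_clickable_model(model_name: str) -> str:
    # Stand-in for the Space's make_clickable_model helper.
    return f"[{model_name}](https://huggingface.co/{model_name})"


# Display titles taken from the Tasks entries added in src/about.py.
TITLES = {
    "asd_privacy": "Privacy",
    "asd_bias": "Bias",
    "asd_toxicity": "Toxicity",
    "asd_truthfulness": "Truthfulness",
    "asd_legality": "Legality",
}


def result_row(path: str) -> dict:
    """Rough equivalent of EvalResult.to_dict after this commit."""
    with open(path) as fp:
        data = json.load(fp)
    results = {key: entry["asd"] for key, entry in data["results"].items()}
    row = {
        "eval_name": data["config"]["model_name"],
        "Model": make_clickable_model(data["config"]["model_name"]),
        "ASD ⬇️": sum(results.values()) / len(results),
    }
    row.update({TITLES[key]: value for key, value in results.items()})
    return row


if __name__ == "__main__":
    print(result_row("eval-results/gpt-4v/results.json"))
```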
src/populate.py CHANGED

```diff
@@ -14,7 +14,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=True)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
@@ -34,7 +34,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             data = json.load(fp)
 
             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
 
             all_evals.append(data)
         elif ".md" not in entry:
@@ -46,13 +46,13 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             data = json.load(fp)
 
             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
             all_evals.append(data)
 
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
+    # pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
+    # running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+    finished_list = [e for e in all_evals]
+    # df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+    # df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols]
+    return df_finished[cols]
```
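The one behavioural change in `get_leaderboard_df` is the explicit `ascending=True`: the `⬇️` in the `ASD ⬇️` column title indicates that lower scores are better, so the table is sorted ascending on the average. A tiny illustration with the two models added in this commit (averages computed from their results files above):

```python
import pandas as pd

# Averages computed from the two eval-results files added in this commit.
rows = [
    {"Model": "gpt-4v", "ASD ⬇️": 0.247},
    {"Model": "internvl", "ASD ⬇️": 0.308},
]

df = pd.DataFrame.from_records(rows)
# ascending=True puts the lowest average ASD at the top of the leaderboard,
# matching the ⬇️ hint in the column title.
df = df.sort_values(by=["ASD ⬇️"], ascending=True).round(decimals=2)
print(df)
```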