Spaces: Sleeping
update
Browse files
app.py
CHANGED
@@ -38,12 +38,13 @@ def load_data(files, model_type):
|
|
38 |
|
39 |
# Load and label all data
|
40 |
data = load_data(noncot_results, "Text Only")
|
|
|
41 |
vision_data = load_data(vision_results, "Vision")
|
42 |
cot_text_data = load_data(cot_text_results, "CoT Text Only")
|
43 |
# cot_vision_data = load_data(cot_vision_results, "CoT Vision")
|
44 |
|
45 |
# Combine all data into a single DataFrame
|
46 |
-
all_data = pd.concat([
|
47 |
|
48 |
all_model_names = all_data["Model Name"].unique()
|
49 |
all_text_only_model_names = list(
|
@@ -414,7 +415,7 @@ def generate_heatmap_for_intersection_model(model_name):
|
|
414 |
|
415 |
sns.despine(ax=ax, top=True, right=True, left=True, bottom=True)
|
416 |
|
417 |
-
plt.close(fig)
|
418 |
return fig
|
419 |
|
420 |
|
@@ -443,7 +444,7 @@ with gr.Blocks() as demo:
|
|
443 |
heatmap_image_qwen = gr.Image(label="", show_label=False)
|
444 |
leader_board.select(fn=load_heatmap_qwen, outputs=[heatmap_image_qwen])
|
445 |
|
446 |
-
with gr.Tab("Vision Benchmark"):
|
447 |
gr.Markdown("# Vision Benchmark Leaderboard")
|
448 |
leader_board_vision = gr.Dataframe(
|
449 |
vision_accuracy_df, headers=headers_with_icons
|
@@ -454,7 +455,7 @@ with gr.Blocks() as demo:
|
|
454 |
fn=load_vision_heatmap, outputs=[heatmap_image_vision]
|
455 |
)
|
456 |
|
457 |
-
with gr.Tab("Text-only Benchmark (CoT)"):
|
458 |
gr.Markdown("# Text-only Leaderboard (CoT)")
|
459 |
cot_leader_board_text = gr.Dataframe(
|
460 |
cot_text_accuracy_df, headers=headers_with_icons
|
@@ -499,7 +500,7 @@ with gr.Blocks() as demo:
|
|
499 |
queue=True,
|
500 |
)
|
501 |
|
502 |
-
with gr.Tab("Constraint Text-only Results (CoT)"):
|
503 |
gr.Markdown("## Constraint Text-only Leaderboard by first substrin (CoT)")
|
504 |
included_models_cot = gr.CheckboxGroup(
|
505 |
label="Models to include",
|
@@ -514,14 +515,14 @@ with gr.Blocks() as demo:
|
|
514 |
constrained_leader_board_text_cot = gr.Dataframe()
|
515 |
constrained_leader_board_plot_cot = gr.Plot()
|
516 |
|
517 |
-
with gr.Tab("Majority Vote (Subset 1)"):
|
518 |
gr.Markdown("## Majority Vote (Subset 1)")
|
519 |
intersection_leader_board = gr.Dataframe(
|
520 |
intersection_df_acc, headers=headers_with_icons
|
521 |
)
|
522 |
heatmap_image = gr.Plot(label="Model Heatmap")
|
523 |
|
524 |
-
with gr.Tab("Text-only Benchmark (deprecated)"):
|
525 |
gr.Markdown("# Text-only Leaderboard")
|
526 |
leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
|
527 |
gr.Markdown("## Heatmap")
|
|
|
38 |
|
39 |
# Load and label all data
|
40 |
data = load_data(noncot_results, "Text Only")
|
41 |
+
data_qwen = load_data(noncot_results_qwen, "Text Only")
|
42 |
vision_data = load_data(vision_results, "Vision")
|
43 |
cot_text_data = load_data(cot_text_results, "CoT Text Only")
|
44 |
# cot_vision_data = load_data(cot_vision_results, "CoT Vision")
|
45 |
|
46 |
# Combine all data into a single DataFrame
|
47 |
+
all_data = pd.concat([data_qwen, vision_data, cot_text_data], ignore_index=True)
|
48 |
|
49 |
all_model_names = all_data["Model Name"].unique()
|
50 |
all_text_only_model_names = list(
|
|
|
415 |
|
416 |
sns.despine(ax=ax, top=True, right=True, left=True, bottom=True)
|
417 |
|
418 |
+
plt.close(fig)
|
419 |
return fig
|
420 |
|
421 |
|
|
|
444 |
heatmap_image_qwen = gr.Image(label="", show_label=False)
|
445 |
leader_board.select(fn=load_heatmap_qwen, outputs=[heatmap_image_qwen])
|
446 |
|
447 |
+
with gr.Tab("Vision Benchmark", visible=False):
|
448 |
gr.Markdown("# Vision Benchmark Leaderboard")
|
449 |
leader_board_vision = gr.Dataframe(
|
450 |
vision_accuracy_df, headers=headers_with_icons
|
|
|
455 |
fn=load_vision_heatmap, outputs=[heatmap_image_vision]
|
456 |
)
|
457 |
|
458 |
+
with gr.Tab("Text-only Benchmark (CoT)", visible=False):
|
459 |
gr.Markdown("# Text-only Leaderboard (CoT)")
|
460 |
cot_leader_board_text = gr.Dataframe(
|
461 |
cot_text_accuracy_df, headers=headers_with_icons
|
|
|
500 |
queue=True,
|
501 |
)
|
502 |
|
503 |
+
with gr.Tab("Constraint Text-only Results (CoT)", visible=False):
|
504 |
gr.Markdown("## Constraint Text-only Leaderboard by first substrin (CoT)")
|
505 |
included_models_cot = gr.CheckboxGroup(
|
506 |
label="Models to include",
|
|
|
515 |
constrained_leader_board_text_cot = gr.Dataframe()
|
516 |
constrained_leader_board_plot_cot = gr.Plot()
|
517 |
|
518 |
+
with gr.Tab("Majority Vote (Subset 1)", visible=False):
|
519 |
gr.Markdown("## Majority Vote (Subset 1)")
|
520 |
intersection_leader_board = gr.Dataframe(
|
521 |
intersection_df_acc, headers=headers_with_icons
|
522 |
)
|
523 |
heatmap_image = gr.Plot(label="Model Heatmap")
|
524 |
|
525 |
+
with gr.Tab("Text-only Benchmark (deprecated)", visible=False):
|
526 |
gr.Markdown("# Text-only Leaderboard")
|
527 |
leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
|
528 |
gr.Markdown("## Heatmap")
|