File size: 4,072 Bytes
a3c4484
 
 
 
e124052
a101f39
e124052
 
a3c4484
e124052
 
 
 
a3c4484
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e124052
a3c4484
 
 
 
 
 
e124052
a3c4484
 
 
 
 
 
 
 
 
 
 
e124052
 
 
 
 
a3c4484
e124052
 
a3c4484
 
 
 
e124052
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a101f39
 
 
 
e124052
 
 
a101f39
a3c4484
a101f39
856df6f
e124052
856df6f
 
e124052
 
a101f39
856df6f
e124052
 
856df6f
e124052
 
a101f39
e124052
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import gradio as gr
import pandas as pd
from glob import glob

# Paths of the pickled result files for the text benchmark. The variable is
# called csv_results, but the glob pattern only matches .pkl files.
csv_results = glob("results/*.pkl")
# Paths of the pickled result files for the vision benchmark.
vision_results = glob("results-vision/*.pkl")

# Unpickle every text result file, keyed by the path it was loaded from.
data = dict(zip(csv_results, map(pd.read_pickle, csv_results)))
# Same mapping for the vision result files.
vision_data = dict(zip(vision_results, map(pd.read_pickle, vision_results)))

def calculate_accuracy(df):
    """Return the mean of the ``parsed_judge_response`` column times 100.

    NOTE(review): presumably the column holds 0/1 (or boolean) judge
    verdicts, which would make the result a percentage -- confirm.
    """
    judged = df["parsed_judge_response"]
    return judged.mean() * 100

def accuracy_breakdown(df):
    """Return the per-``difficulty_level`` mean of ``parsed_judge_response`` times 100.

    The result is the array of group means in the order ``groupby`` yields
    the levels; the level labels themselves are not returned.
    NOTE(review): the original comment expects four levels -- confirm that
    every result file contains all of them.
    """
    per_level = df.groupby("difficulty_level")["parsed_judge_response"].mean()
    return (per_level * 100).values

# Leaderboard column headers shown in the UI, each prefixed with an emoji.
# The emoji were previously stored as mis-decoded UTF-8 byte sequences, which
# rendered as unreadable Greek/punctuation characters instead of the icons.
# "⭐ Overall" is also used as the sort key for the leaderboard tables, so
# that entry is kept exactly as it was.
headers_with_icons = [
    "🤖 Model Name",
    "⭐ Overall",
    "📈 Level 1",
    "🔍 Level 2",
    "📘 Level 3",
    "🔬 Level 4",
]

# Overall accuracy of every text result file, keyed by file path.
accuracy = {path: calculate_accuracy(frame) for path, frame in data.items()}

# One leaderboard row per text result file:
# [model name, overall accuracy, per-difficulty-level accuracies...].
data_for_df = []
for file, df in data.items():
    row = [round(calculate_accuracy(df), 2)]
    row.extend(round(acc, 2) for acc in accuracy_breakdown(df))
    # The model name is the file name with the ".pkl" text removed.
    model_name = file.split("/")[-1].replace(".pkl", "")
    data_for_df.append([model_name] + row)

# Plain column names used when the leaderboard DataFrames are first built:
# the model name, the overall accuracy, then one column per level 1-4.
column_names = ["Model Name", "Overall Accuracy"] + [
    f"Level {level} Accuracy" for level in range(1, 5)
]

# Build the text leaderboard table from the per-model rows and switch to the
# emoji headers used in the UI.
accuracy_df = pd.DataFrame(data_for_df, columns=column_names)
accuracy_df.columns = headers_with_icons
# Rank the models while the scores are still numbers. Sorting after the
# scores have been formatted as strings compares them lexicographically,
# which would place "9.5" above "85.0" in a descending sort.
accuracy_df = accuracy_df.sort_values(by="⭐ Overall", ascending=False)
accuracy_df = accuracy_df.round(1)  # Round to one decimal place
# Render every numeric cell with exactly one decimal. DataFrame.applymap was
# renamed to DataFrame.map in pandas 2.1, so use whichever one exists.
_format_cells = accuracy_df.map if hasattr(accuracy_df, "map") else accuracy_df.applymap
accuracy_df = _format_cells(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)


# One leaderboard row per vision result file:
# [model name, overall accuracy, per-difficulty-level accuracies...].
vision_data_for_df = []
for file, df in vision_data.items():
    row = [round(calculate_accuracy(df), 2)]
    row.extend(round(acc, 2) for acc in accuracy_breakdown(df))
    # The model name is the file name with the ".pkl" text removed.
    model_name = file.split("/")[-1].replace(".pkl", "")
    vision_data_for_df.append([model_name] + row)

# Build the vision leaderboard table from the per-model rows and switch to
# the emoji headers used in the UI.
vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
vision_accuracy_df.columns = headers_with_icons
# Rank the models while the scores are still numbers. Sorting after the
# scores have been formatted as strings compares them lexicographically,
# which would place "9.5" above "85.0" in a descending sort.
vision_accuracy_df = vision_accuracy_df.sort_values(by="⭐ Overall", ascending=False)
vision_accuracy_df = vision_accuracy_df.round(1)  # Round to one decimal place
# Render every numeric cell with exactly one decimal. DataFrame.applymap was
# renamed to DataFrame.map in pandas 2.1, so use whichever one exists.
_format_cells = (
    vision_accuracy_df.map
    if hasattr(vision_accuracy_df, "map")
    else vision_accuracy_df.applymap
)
vision_accuracy_df = _format_cells(
    lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x
)

def load_heatmap(evt: gr.SelectData):
    """Return an image component pointing at ``results/<evt.value>.jpg``.

    Registered as the select handler of the text leaderboard table.
    NOTE(review): presumably ``evt.value`` is the content of the clicked
    cell, so only a click on a model-name cell maps to an existing heatmap
    file; a click on a score cell would build a path such as
    ``results/85.0.jpg`` -- confirm and guard if needed.
    """
    return gr.Image(f"results/{evt.value}.jpg")

def load_vision_heatmap(evt: gr.SelectData):
    """Return an image component pointing at ``results-vision/<evt.value>.jpg``.

    Registered as the select handler of the vision leaderboard table.
    NOTE(review): presumably ``evt.value`` is the content of the clicked
    cell, so only a click on a model-name cell maps to an existing heatmap
    file -- confirm and guard if needed.
    """
    return gr.Image(f"results-vision/{evt.value}.jpg")

# Assemble the two-tab leaderboard UI. Each tab shows a leaderboard table and
# an image slot; selecting a cell in the table calls the matching heatmap
# loader, whose return value is sent to that tab's image component.
with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")

    with gr.Tab("Text-only Benchmark"):
        gr.Markdown("# Text-only Leaderboard")
        leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image = gr.Image(label="", show_label=False)
        leader_board.select(fn=load_heatmap, outputs=[heatmap_image])

    with gr.Tab("Vision Benchmark"):
        gr.Markdown("# Vision Benchmark Leaderboard")
        leader_board_vision = gr.Dataframe(vision_accuracy_df, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image_vision = gr.Image(label="", show_label=False)
        leader_board_vision.select(fn=load_vision_heatmap, outputs=[heatmap_image_vision])

# Launch only after the Blocks context has closed, so the layout is complete
# before the server starts (previously launch() ran inside the open context).
demo.launch()