import gradio as gr
import pandas as pd
from glob import glob


# Load text benchmark results
text_results = glob("results/*.pkl")
# Load vision benchmark results
vision_results = glob("results-vision/*.pkl")
# Load CoT text benchmark results
cot_text_results = glob("results-cot/*.pkl")
# Load CoT vision benchmark results
cot_vision_results = glob("results-vision-CoT/*.pkl")
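
# Assumed layout: each results directory holds one pickle per model, named
# "<Model Name>.pkl", plus a matching "<Model Name>.jpg" heatmap used by the
# leaderboard click handlers below.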

# Function to load data, add model type and name
def load_data(files, model_type):
    data = []
    for file in files:
        df = pd.read_pickle(file)
        df["Model Type"] = model_type
        df["Model Name"] = file.split("/")[-1].replace(".pkl", "")
        data.append(df)
    return pd.concat(data, ignore_index=True)
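
# Each pickle is assumed to contain a per-question DataFrame with at least
# the columns used below: "parsed_judge_response" (0/1 judge verdict),
# "difficulty_level" (1-4), "query_id", "fsm_id", and "substring_index".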


# Load and label all data
data = load_data(text_results, "Text Only")
vision_data = load_data(vision_results, "Vision")
cot_text_data = load_data(cot_text_results, "CoT Text Only")
cot_vision_data = load_data(cot_vision_results, "CoT Vision")

# Combine all data into a single DataFrame
all_data = pd.concat(
    [data, vision_data, cot_text_data, cot_vision_data], ignore_index=True
)
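
# all_data powers the constrained leaderboard tab below; the per-file dicts
# loaded next drive the four main leaderboard tabs.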

all_model_names = all_data["Model Name"].unique()
all_text_only_model_names = list(
    all_data[all_data["Model Type"] == "Text Only"]["Model Name"].unique()
)
print(all_text_only_model_names)  # Sanity check: the text-only models found on disk

## Continue with the old code --
# TODO: update this section to read from all_data instead of reloading the files

# Load the text benchmark pickles into a dict keyed by file path
data = {file: pd.read_pickle(file) for file in text_results}
# Load the vision benchmark pickles into a dict
vision_data = {file: pd.read_pickle(file) for file in vision_results}
# Load the CoT text benchmark pickles into a dict
cot_text_data = {file: pd.read_pickle(file) for file in cot_text_results}
# Load the CoT vision benchmark pickles into a dict
cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results}


def calculate_accuracy(df):
    # Overall accuracy: mean of the 0/1 judge verdicts, as a percentage
    return df["parsed_judge_response"].mean() * 100


def accuracy_breakdown(df):
    # Mean judge score per difficulty level, scaled to a percentage.
    # groupby sorts its keys, so the values line up with Levels 1-4 below.
    return (df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values


# Define the column names with icons
headers_with_icons = [
    "πŸ€– Model Name",
    "⭐ Overall",
    "πŸ“ˆ Level 1",
    "πŸ” Level 2",
    "πŸ“˜ Level 3",
    "πŸ”¬ Level 4",
]

column_names = [
    "Model Name",
    "Overall Accuracy",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
    "Level 4 Accuracy",
]

# Build one leaderboard row per model: [name, overall, level 1-4 accuracies]
def process_data(data):
    data_for_df = []
    for file, df in data.items():
        overall_accuracy = round(calculate_accuracy(df), 2)
        breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
        model_name = file.split("/")[-1].replace(".pkl", "")
        data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
    return data_for_df


# Process all data
text_data_for_df = process_data(data)
vision_data_for_df = process_data(vision_data)
cot_text_data_for_df = process_data(cot_text_data)
cot_vision_data_for_df = process_data(cot_vision_data)

# Create DataFrames
accuracy_df = pd.DataFrame(text_data_for_df, columns=column_names)
vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
cot_text_accuracy_df = pd.DataFrame(cot_text_data_for_df, columns=column_names)
cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names)

# Function to finalize a DataFrame for display: sort, format, and relabel
def finalize_df(df):
    # Sort numerically on the overall score *before* formatting to strings;
    # sorting the formatted values would compare "100.0" < "95.0" lexically.
    df = df.sort_values(by="Overall Accuracy", ascending=False)
    df = df.round(1)  # Round to one decimal place
    df = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
    df.columns = headers_with_icons
    return df


# Finalize all DataFrames
accuracy_df = finalize_df(accuracy_df)
vision_accuracy_df = finalize_df(vision_accuracy_df)
cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)


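# Heatmap loaders for each leaderboard. Assumption: every results directory
# contains a per-model heatmap named "<Model Name>.jpg"; evt.value carries the
# text of the clicked leaderboard cell (the model name in the first column).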
def load_heatmap(evt: gr.SelectData):
    heatmap_image = gr.Image(f"results/{evt.value}.jpg")
    return heatmap_image


def load_vision_heatmap(evt: gr.SelectData):
    heatmap_image = gr.Image(f"results-vision/{evt.value}.jpg")
    return heatmap_image


def load_cot_heatmap(evt: gr.SelectData):
    heatmap_image = gr.Image(f"results-cot/{evt.value}.jpg")
    return heatmap_image


def load_cot_vision_heatmap(evt: gr.SelectData):
    heatmap_image = gr.Image(f"results-vision-CoT/{evt.value}.jpg")
    return heatmap_image


def calculate_order_by_first_substring(selected_models):
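    """
    Recompute the text-only leaderboard on the subset of FSMs whose first
    substring query (substring_index == 1) was answered correctly by every
    selected model, and report how many queries and FSMs survive the filter.
    """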

    first_columns = all_data[all_data["substring_index"] == 1]
    query_ids_df = first_columns[first_columns["Model Type"] == "Text Only"]
    query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)]

    query_ids_df = query_ids_df.groupby("query_id").filter(
        lambda x: x["parsed_judge_response"].eq(1).all()
    )

    fsm_ids = query_ids_df.fsm_id.unique()

    text_only = all_data[all_data["Model Type"] == "Text Only"]
    text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)]

    query_ids = text_only_filtered.query_id.unique()
    text_only_filtered = (
        text_only_filtered.groupby(["Model Name"])["parsed_judge_response"]
        .mean()
        .reset_index()
    )

    text_only_filtered["Accuracy"] = text_only_filtered["parsed_judge_response"] * 100
    text_only_filtered.drop("parsed_judge_response", axis=1, inplace=True)

    text_only_filtered["Accuracy"] = text_only_filtered["Accuracy"].round(2)
    text_only_filtered.sort_values("Accuracy", ascending=False, inplace=True)

    number_of_queries = len(query_ids)
    number_of_fsms = len(fsm_ids)

    return text_only_filtered, number_of_queries, number_of_fsms


with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")
    with gr.Tab("Text-only Benchmark"):
        gr.Markdown("# Text-only Leaderboard")
        leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image = gr.Image(label="", show_label=False)
        leader_board.select(fn=load_heatmap, outputs=[heatmap_image])

    with gr.Tab("Vision Benchmark"):
        gr.Markdown("# Vision Benchmark Leaderboard")
        leader_board_vision = gr.Dataframe(
            vision_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_vision = gr.Image(label="", show_label=False)
        leader_board_vision.select(
            fn=load_vision_heatmap, outputs=[heatmap_image_vision]
        )

    with gr.Tab("CoT Text-only Benchmark"):
        gr.Markdown("# CoT Text-only Leaderboard")
        cot_leader_board_text = gr.Dataframe(
            cot_text_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        cot_heatmap_image_text = gr.Image(label="", show_label=False)
        cot_leader_board_text.select(
            fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
        )

    with gr.Tab("CoT Vision Benchmark"):
        gr.Markdown("# CoT Vision Benchmark Leaderboard")
        cot_leader_board_vision = gr.Dataframe(
            cot_vision_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        cot_heatmap_image_vision = gr.Image(label="", show_label=False)
        cot_leader_board_vision.select(
            fn=load_cot_vision_heatmap, outputs=[cot_heatmap_image_vision]
        )

    with gr.Tab("Constraint Text-only Results"):
        gr.Markdown("## Constraint Text-only Leaderboard by first substring")
        included_models = gr.CheckboxGroup(
            label="Models to include",
            choices=all_text_only_model_names,
            value=all_text_only_model_names,
            interactive=True,
        )
        with gr.Row():
            number_of_queries = gr.Textbox(label="Number of included queries")
            number_of_fsms = gr.Textbox(label="Number of included FSMs")
        constrained_leader_board_text = gr.Dataframe()

    included_models.select(
        fn=calculate_order_by_first_substring,
        inputs=[included_models],
        outputs=[constrained_leader_board_text, number_of_queries, number_of_fsms],
        queue=True,
    )

demo.launch()