Spaces:
Sleeping
Sleeping
File size: 4,072 Bytes
a3c4484 e124052 a101f39 e124052 a3c4484 e124052 a3c4484 e124052 a3c4484 e124052 a3c4484 e124052 a3c4484 e124052 a3c4484 e124052 a101f39 e124052 a101f39 a3c4484 a101f39 856df6f e124052 856df6f e124052 a101f39 856df6f e124052 856df6f e124052 a101f39 e124052 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import gradio as gr
import pandas as pd
from glob import glob
def _read_pickles(paths):
    """Return a dict mapping each path in *paths* to its unpickled contents."""
    return {path: pd.read_pickle(path) for path in paths}


# Pickled text-benchmark result files (the name says "csv" but the pattern
# matches ``.pkl`` files).
csv_results = glob("results/*.pkl")
# Pickled vision-benchmark result files.
vision_results = glob("results-vision/*.pkl")

# Both dicts are keyed by the file path exactly as glob returned it.
data = _read_pickles(csv_results)
vision_data = _read_pickles(vision_results)
def calculate_accuracy(df):
    """Return the mean of the ``parsed_judge_response`` column as a percentage.

    Args:
        df: DataFrame with a ``parsed_judge_response`` column.

    Returns:
        The column mean multiplied by 100.
    """
    judged = df["parsed_judge_response"]
    return judged.mean() * 100
def accuracy_breakdown(df):
    """Return per-difficulty accuracy percentages.

    Rows are grouped by ``difficulty_level`` and the mean of
    ``parsed_judge_response`` in each group is scaled to a percentage.

    Args:
        df: DataFrame with ``difficulty_level`` and ``parsed_judge_response``
            columns.

    Returns:
        The percentages as an array, one entry per distinct level present in
        *df*, ordered by the group key (so not necessarily four entries).
    """
    per_level_mean = df.groupby("difficulty_level")["parsed_judge_response"].mean()
    per_level_pct = per_level_mean * 100
    return per_level_pct.values
# Display headers for the leaderboard tables, in column order: model name,
# overall score, then one entry per difficulty level (1-4).
# NOTE(review): the leading glyphs look like mis-decoded emoji (mojibake).
# They are left untouched here because other code may look columns up by
# these exact strings -- restore the intended icons everywhere at once.
headers_with_icons = [
    "π€ Model Name",
    "β Overall",
    "π Level 1",
    "π Level 2",
    "π Level 3",
    "π¬ Level 4",
]
# Process text benchmark data.
# Overall accuracy (unrounded) per result file.
accuracy = {file: calculate_accuracy(df) for file, df in data.items()}

# One row per result file: [model name, overall, level accuracies...].
data_for_df = []
for file, df in data.items():
    overall_accuracy = round(calculate_accuracy(df), 2)
    breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
    # Model name is the file's base name with the ".pkl" suffix removed.
    model_name = file.split("/")[-1].replace(".pkl", "")
    data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)

# Plain column names used while the table is still numeric.
# NOTE(review): a result with other than four difficulty levels yields a row
# of the wrong length and pd.DataFrame will raise -- confirm inputs.
column_names = [
    "Model Name",
    "Overall Accuracy",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
    "Level 4 Accuracy",
]

accuracy_df = pd.DataFrame(data_for_df, columns=column_names)
accuracy_df = accuracy_df.round(1)  # Round to one decimal place
# Rank while the scores are still numbers. Sorting after they are formatted
# as strings is lexicographic, which would place "9.5" above "85.0".
accuracy_df = accuracy_df.sort_values(by=column_names[1], ascending=False)
# Format scores as fixed one-decimal strings for display. Series.map is used
# per column because DataFrame.applymap is deprecated in recent pandas.
for column in column_names[1:]:
    accuracy_df[column] = accuracy_df[column].map(
        lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x
    )
accuracy_df.columns = headers_with_icons
# Process vision benchmark data.
# One row per result file: [model name, overall, level accuracies...].
vision_data_for_df = []
for file, df in vision_data.items():
    overall_accuracy = round(calculate_accuracy(df), 2)
    breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
    # Model name is the file's base name with the ".pkl" suffix removed.
    model_name = file.split("/")[-1].replace(".pkl", "")
    vision_data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)

# Built with the plain `column_names` defined earlier in this file; the
# display headers are applied last.
vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
vision_accuracy_df = vision_accuracy_df.round(1)  # Round to one decimal place
# Rank while the scores are still numbers. Sorting after they are formatted
# as strings is lexicographic, which would place "9.5" above "85.0".
vision_accuracy_df = vision_accuracy_df.sort_values(
    by=column_names[1], ascending=False
)
# Format scores as fixed one-decimal strings for display. Series.map is used
# per column because DataFrame.applymap is deprecated in recent pandas.
for column in column_names[1:]:
    vision_accuracy_df[column] = vision_accuracy_df[column].map(
        lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x
    )
vision_accuracy_df.columns = headers_with_icons
def load_heatmap(evt: gr.SelectData):
    """Show the text-benchmark heatmap for the selected leaderboard cell.

    Args:
        evt: Gradio selection event. ``evt.value`` (per the Gradio docs, the
            value of the selected cell) is used as the heatmap's base name.

    Returns:
        A ``gr.Image`` for ``results/<value>.jpg`` when that file exists,
        otherwise a no-op ``gr.update()`` that leaves the current image as is.
    """
    import os  # local import keeps this block self-contained

    heatmap_path = f"results/{evt.value}.jpg"
    # Only a model-name cell maps to a heatmap file. Selecting a score cell
    # would otherwise point gr.Image at a file that does not exist.
    if not os.path.isfile(heatmap_path):
        return gr.update()
    return gr.Image(heatmap_path)
def load_vision_heatmap(evt: gr.SelectData):
    """Show the vision-benchmark heatmap for the selected leaderboard cell.

    Args:
        evt: Gradio selection event. ``evt.value`` (per the Gradio docs, the
            value of the selected cell) is used as the heatmap's base name.

    Returns:
        A ``gr.Image`` for ``results-vision/<value>.jpg`` when that file
        exists, otherwise a no-op ``gr.update()`` that leaves the current
        image as is.
    """
    import os  # local import keeps this block self-contained

    heatmap_path = f"results-vision/{evt.value}.jpg"
    # Only a model-name cell maps to a heatmap file. Selecting a score cell
    # would otherwise point gr.Image at a file that does not exist.
    if not os.path.isfile(heatmap_path):
        return gr.update()
    return gr.Image(heatmap_path)
def _leaderboard_tab(tab_label, heading, table, on_select):
    """Lay out one leaderboard tab and wire its cell-select handler.

    Creates, in order: a heading, the leaderboard table, a "Heatmap"
    sub-heading and an unlabeled image slot, then registers *on_select* as
    the table's select handler with the image as its output.

    Returns:
        The ``(dataframe, image)`` components that were created.
    """
    with gr.Tab(tab_label):
        gr.Markdown(heading)
        board = gr.Dataframe(table, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        image = gr.Image(label="", show_label=False)
        board.select(fn=on_select, outputs=[image])
    return board, image


with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")
    leader_board, heatmap_image = _leaderboard_tab(
        "Text-only Benchmark",
        "# Text-only Leaderboard",
        accuracy_df,
        load_heatmap,
    )
    leader_board_vision, heatmap_image_vision = _leaderboard_tab(
        "Vision Benchmark",
        "# Vision Benchmark Leaderboard",
        vision_accuracy_df,
        load_vision_heatmap,
    )

demo.launch()