import gradio as gr
import pandas as pd
from glob import glob


# Load text benchmark results
text_results = glob("results/*.pkl")
# Load vision benchmark results
vision_results = glob("results-vision/*.pkl")
# Load CoT text benchmark results
cot_text_results = glob("results-cot/*.pkl")
# Load CoT vision benchmark results
cot_vision_results = glob("results-vision-CoT/*.pkl")
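
# Assumed layout: each results directory holds one pickle per model, named
# "<Model Name>.pkl", plus a matching "<Model Name>.jpg" heatmap used by the
# leaderboard click handlers below.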

# Function to load data, add model type and name
def load_data(files, model_type):
    data = []
    for file in files:
        df = pd.read_pickle(file)
        df["Model Type"] = model_type
        df["Model Name"] = file.split("/")[-1].replace(".pkl", "")
        data.append(df)
    return pd.concat(data, ignore_index=True)
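
# Each pickle is assumed to contain a per-question DataFrame with at least
# the columns used below: "parsed_judge_response" (0/1 judge verdict),
# "difficulty_level" (1-4), "query_id", "fsm_id", and "substring_index".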


# Load and label all data
data = load_data(text_results, "Text Only")
vision_data = load_data(vision_results, "Vision")
cot_text_data = load_data(cot_text_results, "CoT Text Only")
cot_vision_data = load_data(cot_vision_results, "CoT Vision")

# Combine all data into a single DataFrame
all_data = pd.concat(
    [data, vision_data, cot_text_data, cot_vision_data], ignore_index=True
)
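
# all_data powers the constrained leaderboard tab below; the per-file dicts
# loaded next drive the four main leaderboard tabs.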

all_model_names = all_data["Model Name"].unique()
all_text_only_model_names = list(
    all_data[all_data["Model Type"] == "Text Only"]["Model Name"].unique()
)
print(all_text_only_model_names)  # Sanity check: the text-only models found on disk

## Continue with the old code --
# TODO: update this section to read from all_data instead of reloading the files

# Load the text benchmark pickles into a dict keyed by file path
data = {file: pd.read_pickle(file) for file in text_results}
# Load the vision benchmark pickles into a dict
vision_data = {file: pd.read_pickle(file) for file in vision_results}
# Load the CoT text benchmark pickles into a dict
cot_text_data = {file: pd.read_pickle(file) for file in cot_text_results}
# Load the CoT vision benchmark pickles into a dict
cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results}


def calculate_accuracy(df):
    # Overall accuracy: mean of the 0/1 judge verdicts, as a percentage
    return df["parsed_judge_response"].mean() * 100


def accuracy_breakdown(df):
    # Mean judge score per difficulty level, scaled to a percentage.
    # groupby sorts its keys, so the values line up with Levels 1-4 below.
    return (df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values


# Define the column names with icons
headers_with_icons = [
    "πŸ€– Model Name",
    "⭐ Overall",
    "πŸ“ˆ Level 1",
    "πŸ” Level 2",
    "πŸ“˜ Level 3",
    "πŸ”¬ Level 4",
]

column_names = [
    "Model Name",
    "Overall Accuracy",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
    "Level 4 Accuracy",
]

# Build one leaderboard row per model: [name, overall, level 1-4 accuracies]
def process_data(data):
    data_for_df = []
    for file, df in data.items():
        overall_accuracy = round(calculate_accuracy(df), 2)
        breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
        model_name = file.split("/")[-1].replace(".pkl", "")
        data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
    return data_for_df


# Process all data
text_data_for_df = process_data(data)
vision_data_for_df = process_data(vision_data)
cot_text_data_for_df = process_data(cot_text_data)
cot_vision_data_for_df = process_data(cot_vision_data)

# Create DataFrames
accuracy_df = pd.DataFrame(text_data_for_df, columns=column_names)
vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
cot_text_accuracy_df = pd.DataFrame(cot_text_data_for_df, columns=column_names)
cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names)

# Function to finalize a DataFrame for display: sort, format, and relabel
def finalize_df(df):
    # Sort numerically on the overall score *before* formatting to strings;
    # sorting the formatted values would compare "100.0" < "95.0" lexically.
    df = df.sort_values(by="Overall Accuracy", ascending=False)
    df = df.round(1)  # Round to one decimal place
    df = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
    df.columns = headers_with_icons
    return df


# Finalize all DataFrames
accuracy_df = finalize_df(accuracy_df)
vision_accuracy_df = finalize_df(vision_accuracy_df)
cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)


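# Heatmap loaders for each leaderboard. Assumption: every results directory
# contains a per-model heatmap named "<Model Name>.jpg"; evt.value carries the
# text of the clicked leaderboard cell (the model name in the first column).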
def load_heatmap(evt: gr.SelectData):
    heatmap_image = gr.Image(f"results/{evt.value}.jpg")
    return heatmap_image


def load_vision_heatmap(evt: gr.SelectData):
    heatmap_image = gr.Image(f"results-vision/{evt.value}.jpg")
    return heatmap_image


def load_cot_heatmap(evt: gr.SelectData):
    heatmap_image = gr.Image(f"results-cot/{evt.value}.jpg")
    return heatmap_image


def load_cot_vision_heatmap(evt: gr.SelectData):
    heatmap_image = gr.Image(f"results-vision-CoT/{evt.value}.jpg")
    return heatmap_image


def calculate_order_by_first_substring(selected_models):
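    """
    Recompute the text-only leaderboard on the subset of FSMs whose first
    substring query (substring_index == 1) was answered correctly by every
    selected model, and report how many queries and FSMs survive the filter.
    """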

    first_columns = all_data[all_data["substring_index"] == 1]
    query_ids_df = first_columns[first_columns["Model Type"] == "Text Only"]
    query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)]

    query_ids_df = query_ids_df.groupby("query_id").filter(
        lambda x: x["parsed_judge_response"].eq(1).all()
    )

    fsm_ids = query_ids_df.fsm_id.unique()

    text_only = all_data[all_data["Model Type"] == "Text Only"]
    text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)]

    query_ids = text_only_filtered.query_id.unique()
    text_only_filtered = (
        text_only_filtered.groupby(["Model Name"])["parsed_judge_response"]
        .mean()
        .reset_index()
    )

    text_only_filtered["Accuracy"] = text_only_filtered["parsed_judge_response"] * 100
    text_only_filtered.drop("parsed_judge_response", axis=1, inplace=True)

    text_only_filtered["Accuracy"] = text_only_filtered["Accuracy"].round(2)
    text_only_filtered.sort_values("Accuracy", ascending=False, inplace=True)

    number_of_queries = len(query_ids)
    number_of_fsms = len(fsm_ids)

    return text_only_filtered, number_of_queries, number_of_fsms


with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")
    with gr.Tab("Text-only Benchmark"):
        gr.Markdown("# Text-only Leaderboard")
        leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image = gr.Image(label="", show_label=False)
        leader_board.select(fn=load_heatmap, outputs=[heatmap_image])

    with gr.Tab("Vision Benchmark"):
        gr.Markdown("# Vision Benchmark Leaderboard")
        leader_board_vision = gr.Dataframe(
            vision_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_vision = gr.Image(label="", show_label=False)
        leader_board_vision.select(
            fn=load_vision_heatmap, outputs=[heatmap_image_vision]
        )

    with gr.Tab("CoT Text-only Benchmark"):
        gr.Markdown("# CoT Text-only Leaderboard")
        cot_leader_board_text = gr.Dataframe(
            cot_text_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        cot_heatmap_image_text = gr.Image(label="", show_label=False)
        cot_leader_board_text.select(
            fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
        )

    with gr.Tab("CoT Vision Benchmark"):
        gr.Markdown("# CoT Vision Benchmark Leaderboard")
        cot_leader_board_vision = gr.Dataframe(
            cot_vision_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        cot_heatmap_image_vision = gr.Image(label="", show_label=False)
        cot_leader_board_vision.select(
            fn=load_cot_vision_heatmap, outputs=[cot_heatmap_image_vision]
        )

    with gr.Tab("Constraint Text-only Results"):
        gr.Markdown("## Constraint Text-only Leaderboard by first substring")
        included_models = gr.CheckboxGroup(
            label="Models to include",
            choices=all_text_only_model_names,
            value=all_text_only_model_names,
            interactive=True,
        )
        with gr.Row():
            number_of_queries = gr.Textbox(label="Number of included queries")
            number_of_fsms = gr.Textbox(label="Number of included FSMs")
        constrained_leader_board_text = gr.Dataframe()

    included_models.select(
        fn=calculate_order_by_first_substring,
        inputs=[included_models],
        outputs=[constrained_leader_board_text, number_of_queries, number_of_fsms],
        queue=True,
    )

demo.launch()