import os
from glob import glob

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.colors import BoundaryNorm, ListedColormap

# NOTE(security): benchmark results are stored as pickles and ``pd.read_pickle``
# can execute arbitrary code while loading. Only trusted files may be placed in
# the result directories below.

# Result files are sorted so that the model order (and therefore the order of
# the check boxes in the UI) does not depend on the file-system listing order.

# Load text benchmark results
csv_results = sorted(glob("results/*.pkl"))
# Load vision benchmark results
vision_results = sorted(glob("results-vision/*.pkl"))
# Load CoT text benchmark results
cot_text_results = sorted(glob("results-cot/*.pkl"))
# Load CoT vision benchmark results
# cot_vision_results = glob("results-vision-CoT/*.pkl")


def load_data(files, model_type):
    """Load result pickles into one DataFrame labelled with model type and name.

    Args:
        files: Paths of the pickled result DataFrames to read.
        model_type: Value stored in the ``"Model Type"`` column of every row.

    Returns:
        The concatenation of all loaded frames with two extra columns:
        ``"Model Type"`` and ``"Model Name"`` (the file name without its
        directory and extension). When ``files`` is empty, an empty DataFrame
        that only has those two columns is returned instead of raising.
    """
    frames = []
    for file in files:
        df = pd.read_pickle(file)
        df["Model Type"] = model_type
        # os.path handles both "/" and the platform separator returned by glob.
        df["Model Name"] = os.path.splitext(os.path.basename(file))[0]
        frames.append(df)
    if not frames:
        # pd.concat([]) raises ValueError; keep the label columns so that the
        # module-level filters on "Model Type" / "Model Name" still work.
        return pd.DataFrame(columns=["Model Type", "Model Name"])
    return pd.concat(frames, ignore_index=True)


# Load and label all data
text_frame = load_data(csv_results, "Text Only")
vision_frame = load_data(vision_results, "Vision")
cot_text_frame = load_data(cot_text_results, "CoT Text Only")
# cot_vision_data = load_data(cot_vision_results, "CoT Vision")

# Combine all data into a single DataFrame
all_data = pd.concat([text_frame, vision_frame, cot_text_frame], ignore_index=True)

all_model_names = all_data["Model Name"].unique()
all_text_only_model_names = list(
    all_data[all_data["Model Type"] == "Text Only"]["Model Name"].unique()
)
all_cot_text_only_models = list(
    all_data[all_data["Model Type"] == "CoT Text Only"]["Model Name"].unique()
)

# Caches of the most recently filtered rows; they are filled in by the
# constrained-leaderboard callbacks and read by the constraint heatmaps.
text_only_filtered_raw = None
text_only_filtered_raw_cot = None

## Continue with the old code --
# TODO: Update me to read from all_data for later

# Load the csv files into a dict with keys being name of the file and values being the data
data = {file: pd.read_pickle(file) for file in csv_results}
# Load the vision files into a dict
vision_data = {file: pd.read_pickle(file) for file in vision_results}
# Load the CoT text files into a dict
cot_text_data = {file: pd.read_pickle(file) for file in cot_text_results}
# Load the CoT vision files into a dict (disabled with the CoT vision benchmark)
# cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results}


def calculate_accuracy(df):
    """Return the mean of ``parsed_judge_response`` as a percentage."""
    return df["parsed_judge_response"].mean() * 100


def accuracy_breakdown(df):
    """Return the accuracy percentage of every ``difficulty_level`` group.

    The result is a plain array ordered by the sorted group keys.

    NOTE(review): callers map the values positionally onto the four "Level N"
    columns, which presumably relies on every results file containing all four
    difficulty levels -- confirm against the data.
    """
    # 4 level accuracy
    return (df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values


# Define the column names with icons
headers_with_icons = [
    "🤖 Model Name",
    "⭐ Overall",
    "📈 Level 1",
    "🔍 Level 2",
    "📘 Level 3",
    "🔬 Level 4",
]

column_names = [
    "Model Name",
    "Overall Accuracy",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
    "Level 4 Accuracy",
]


def process_data(data):
    """Build one leaderboard row per results file.

    Args:
        data: Mapping of results file path to its loaded DataFrame.

    Returns:
        A list of rows ``[model_name, overall_accuracy, *per_level_accuracy]``
        with every accuracy rounded to two decimals.
    """
    rows = []
    for file, df in data.items():
        overall_accuracy = round(calculate_accuracy(df), 2)
        breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
        # os.path handles both "/" and the platform separator returned by glob.
        model_name = os.path.splitext(os.path.basename(file))[0]
        rows.append([model_name, overall_accuracy] + breakdown_accuracy)
    return rows


# Process all data
text_data_for_df = process_data(data)
vision_data_for_df = process_data(vision_data)
cot_text_data_for_df = process_data(cot_text_data)
# cot_vision_data_for_df = process_data(cot_vision_data)

# Create DataFrames
accuracy_df = pd.DataFrame(text_data_for_df, columns=column_names)
vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
cot_text_accuracy_df = pd.DataFrame(cot_text_data_for_df, columns=column_names)
# cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names)


def _format_cell(value):
    """Render numbers with one decimal place and leave other values untouched."""
    return f"{value:.1f}" if isinstance(value, (int, float)) else value


def finalize_df(df):
    """Return a display-ready leaderboard sorted by overall accuracy.

    The frame is rounded to one decimal, renamed to ``headers_with_icons``,
    sorted by the overall score (best first) and only then converted to
    strings. Sorting must happen before the string conversion: sorting the
    formatted strings would order them lexicographically ("9.5" > "85.3").
    """
    df = df.round(1)  # Round to one decimal place
    df.columns = headers_with_icons
    df = df.sort_values(by="⭐ Overall", ascending=False)
    for column in df.columns:
        df[column] = df[column].map(_format_cell)
    return df


# Finalize all DataFrames
accuracy_df = finalize_df(accuracy_df)
vision_accuracy_df = finalize_df(vision_accuracy_df)
cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
# cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)


def _heatmap_image_from(directory, evt):
    """Return the pre-rendered ``<directory>/<cell value>.jpg`` image, if any.

    The select event fires for every cell, not only for model names; cells
    without a matching image file clear the image instead of raising.
    """
    path = f"{directory}/{evt.value}.jpg"
    if not os.path.isfile(path):
        return None
    return gr.Image(path)


def load_heatmap(evt: gr.SelectData):
    """Show the text-only heatmap of the selected leaderboard cell."""
    return _heatmap_image_from("results", evt)


def load_vision_heatmap(evt: gr.SelectData):
    """Show the vision heatmap of the selected leaderboard cell."""
    return _heatmap_image_from("results-vision", evt)


def load_cot_heatmap(evt: gr.SelectData):
    """Show the CoT text-only heatmap of the selected leaderboard cell."""
    return _heatmap_image_from("results-cot", evt)


def load_cot_vision_heatmap(evt: gr.SelectData):
    """Show the CoT vision heatmap of the selected leaderboard cell."""
    return _heatmap_image_from("results-vision-CoT", evt)


def _constrained_leaderboard(selected_models, model_type):
    """Rank the models of ``model_type`` on the FSMs the selected models solved.

    An FSM is kept when it has at least one query whose first substring
    (``substring_index == 1``) was judged correct for every selected model.

    Returns:
        ``(leaderboard, number_of_queries, number_of_fsms, raw_rows)`` where
        ``leaderboard`` has the columns ``"Model Name"`` and ``"Accuracy"``
        (percentage, two decimals, best first) and ``raw_rows`` is a copy of
        all rows of ``model_type`` that belong to the kept FSMs.
    """
    of_type = all_data[all_data["Model Type"] == model_type]
    first_substrings = of_type[
        (of_type["substring_index"] == 1)
        & of_type["Model Name"].isin(selected_models)
    ]
    solved = first_substrings.groupby("query_id").filter(
        lambda group: group["parsed_judge_response"].eq(1).all()
    )
    fsm_ids = solved["fsm_id"].unique()

    raw_rows = of_type[of_type["fsm_id"].isin(fsm_ids)].copy()
    query_ids = raw_rows["query_id"].unique()

    leaderboard = (
        raw_rows.groupby("Model Name")["parsed_judge_response"]
        .mean()
        .mul(100)
        .round(2)
        .rename("Accuracy")
        .reset_index()
        .sort_values("Accuracy", ascending=False)
    )
    return leaderboard, len(query_ids), len(fsm_ids), raw_rows


def calculate_order_by_first_substring(selected_models):
    """Compute the constrained text-only leaderboard.

    Side effect: stores the filtered rows in ``text_only_filtered_raw`` so the
    constraint heatmap can be drawn for a model afterwards.

    Returns:
        ``(leaderboard, number_of_queries, number_of_fsms)``.
    """
    global text_only_filtered_raw
    leaderboard, number_of_queries, number_of_fsms, text_only_filtered_raw = (
        _constrained_leaderboard(selected_models, "Text Only")
    )
    return leaderboard, number_of_queries, number_of_fsms


def calculate_order_by_first_substring_cot(selected_models):
    """Compute the constrained CoT text-only leaderboard.

    Side effect: stores the filtered rows in ``text_only_filtered_raw_cot`` so
    the CoT constraint heatmap can be drawn for a model afterwards.

    Returns:
        ``(leaderboard, number_of_queries, number_of_fsms)``.
    """
    global text_only_filtered_raw_cot
    leaderboard, number_of_queries, number_of_fsms, text_only_filtered_raw_cot = (
        _constrained_leaderboard(selected_models, "CoT Text Only")
    )
    return leaderboard, number_of_queries, number_of_fsms


def _render_constraint_heatmap(raw_df, model_name):
    """Draw the per-FSM / per-substring result grid of one model.

    Cells are green for a judge response of 1, red for 0 and light blue where
    no response exists (missing values are filled with -1).

    Returns:
        The matplotlib figure, or ``None`` when nothing can be drawn (no
        leaderboard computed yet, or ``model_name`` matches no rows, e.g.
        because an accuracy cell was clicked instead of a model name).
    """
    if raw_df is None:
        return None
    model_df = raw_df[raw_df["Model Name"] == model_name]
    if model_df.empty:
        return None

    cmap = ListedColormap(["lightblue", "red", "green"])
    norm = BoundaryNorm([-1.5, -0.5, 0.5, 1.5], cmap.N)

    # Pivot on the numeric columns so the rows are ordered numerically; a
    # pivot on the label string would sort "10 states" before "2 states".
    pivot_df = (
        model_df.pivot_table(
            index=["num_states", "num_alphabet"],
            columns="substring_index",
            values="parsed_judge_response",
            aggfunc="first",
        )
        .fillna(-1)
        .astype(float)
    )
    if pivot_df.empty:
        return None
    pivot_df.index = [
        f"{num_states} states, {num_alphabet} alphabet"
        for num_states, num_alphabet in pivot_df.index
    ]

    # Dynamically adjust figure size
    num_rows, num_cols = pivot_df.shape
    fig_width = max(12, num_cols * 0.5)  # Adjust width per column
    fig_height = max(8, num_rows * 0.4)  # Adjust height per row

    fig, ax = plt.subplots(figsize=(fig_width, fig_height))
    sns.heatmap(
        pivot_df,
        cmap=cmap,
        linewidths=1,
        linecolor="black",
        norm=norm,
        cbar=False,
        square=True,
        ax=ax,
    )
    # Use the Axes API rather than pyplot's implicit "current figure" state.
    ax.set_title(f"Heatmap for Model: {model_name}", fontsize=12)
    ax.set_xlabel("Substring Index")
    ax.set_ylabel("FSM (States, Alphabet)")
    ax.tick_params(axis="x", labelrotation=45)
    sns.despine(ax=ax, top=True, right=True, left=True, bottom=True)

    # Unregister the figure from pyplot so a long-running app does not
    # accumulate open figures; the Figure object itself can still be rendered.
    plt.close(fig)
    return fig


def generate_heatmap_for_specific_model(model_name):
    """Draw the constraint heatmap of ``model_name`` from the text-only cache."""
    return _render_constraint_heatmap(text_only_filtered_raw, model_name)


def generate_heatmap_for_specific_model_cot(model_name):
    """Draw the constraint heatmap of ``model_name`` from the CoT cache."""
    return _render_constraint_heatmap(text_only_filtered_raw_cot, model_name)


def show_constraint_heatmap(evt: gr.SelectData):
    """Select handler: plot the heatmap for the clicked cell's value."""
    model_name = evt.value
    return generate_heatmap_for_specific_model(model_name)


def show_constraint_heatmap_cot(evt: gr.SelectData):
    """Select handler (CoT): plot the heatmap for the clicked cell's value."""
    model_name = evt.value
    return generate_heatmap_for_specific_model_cot(model_name)


with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")
    with gr.Tab("Text-only Benchmark"):
        gr.Markdown("# Text-only Leaderboard")
        leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image = gr.Image(label="", show_label=False)
        leader_board.select(fn=load_heatmap, outputs=[heatmap_image])

    with gr.Tab("Vision Benchmark"):
        gr.Markdown("# Vision Benchmark Leaderboard")
        leader_board_vision = gr.Dataframe(
            vision_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_vision = gr.Image(label="", show_label=False)
        leader_board_vision.select(
            fn=load_vision_heatmap, outputs=[heatmap_image_vision]
        )

    with gr.Tab("Text-only Benchmark (CoT)"):
        gr.Markdown("# Text-only Leaderboard (CoT)")
        cot_leader_board_text = gr.Dataframe(
            cot_text_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        cot_heatmap_image_text = gr.Image(label="", show_label=False)
        cot_leader_board_text.select(
            fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
        )

    # with gr.Tab("Vision Benchmark (CoT)"):
    #     gr.Markdown("# Vision Benchmark Leaderboard (CoT)")
    #     cot_leader_board_vision = gr.Dataframe(
    #         cot_vision_accuracy_df, headers=headers_with_icons
    #     )
    #     gr.Markdown("## Heatmap")
    #     cot_heatmap_image_vision = gr.Image(label="", show_label=False)
    #     cot_leader_board_vision.select(
    #         fn=load_cot_vision_heatmap, outputs=[cot_heatmap_image_vision]
    #     )

    with gr.Tab("Constraint Text-only Results"):
        gr.Markdown("## Constraint Text-only Leaderboard by first substring")
        included_models = gr.CheckboxGroup(
            label="Models to include",
            choices=all_text_only_model_names,
            value=all_text_only_model_names,
            interactive=True,
        )
        with gr.Row():
            number_of_queries = gr.Textbox(label="Number of included queries")
            number_of_fsms = gr.Textbox(label="Number of included FSMs")

        constrained_leader_board_text = gr.Dataframe()
        constrained_leader_board_plot = gr.Plot()

        included_models.select(
            fn=calculate_order_by_first_substring,
            inputs=[included_models],
            outputs=[constrained_leader_board_text, number_of_queries, number_of_fsms],
            queue=True,
        )

    with gr.Tab("Constraint Text-only Results (CoT)"):
        gr.Markdown("## Constraint Text-only Leaderboard by first substring (CoT)")
        included_models_cot = gr.CheckboxGroup(
            label="Models to include",
            choices=all_cot_text_only_models,
            value=all_cot_text_only_models,
            interactive=True,
        )
        with gr.Row():
            number_of_queries_cot = gr.Textbox(label="Number of included queries")
            number_of_fsms_cot = gr.Textbox(label="Number of included FSMs")

        constrained_leader_board_text_cot = gr.Dataframe()
        constrained_leader_board_plot_cot = gr.Plot()

        included_models_cot.select(
            fn=calculate_order_by_first_substring_cot,
            inputs=[included_models_cot],
            outputs=[
                constrained_leader_board_text_cot,
                number_of_queries_cot,
                number_of_fsms_cot,
            ],
            queue=True,
        )

    # Clicking a row of a constrained leaderboard plots that model's heatmap.
    constrained_leader_board_text.select(
        fn=show_constraint_heatmap, outputs=[constrained_leader_board_plot]
    )
    constrained_leader_board_text_cot.select(
        fn=show_constraint_heatmap_cot, outputs=[constrained_leader_board_plot_cot]
    )

demo.launch()