import gradio as gr
import pandas as pd
from glob import glob

# Load text benchmark results
csv_results = glob("results/*.pkl")

# Load vision benchmark results
vision_results = glob("results-vision/*.pkl")

# Load CoT text benchmark results
cot_text_results = glob("results-cot/*.pkl")

# Load CoT vision benchmark results
cot_vision_results = glob("results-vision-CoT/*.pkl")


# Function to load data, add model type and name
def load_data(files, model_type):
    data = []
    for file in files:
        df = pd.read_pickle(file)
        df["Model Type"] = model_type
        df["Model Name"] = file.split("/")[-1].replace(".pkl", "")
        data.append(df)
    # Note: pd.concat raises ValueError on an empty list, so each results
    # directory is assumed to contain at least one .pkl file
    return pd.concat(data, ignore_index=True)


# Load and label all data
data = load_data(csv_results, "Text Only")
vision_data = load_data(vision_results, "Vision")
cot_text_data = load_data(cot_text_results, "CoT Text Only")
cot_vision_data = load_data(cot_vision_results, "CoT Vision")

# Combine all data into a single DataFrame
all_data = pd.concat(
    [data, vision_data, cot_text_data, cot_vision_data], ignore_index=True
)

all_model_names = all_data["Model Name"].unique()
all_text_only_model_names = list(
    all_data[all_data["Model Type"] == "Text Only"]["Model Name"].unique()
)
print(all_text_only_model_names)

## Continue with the old code --
# TODO: update this section to read from all_data instead

# Load the text-benchmark pickle files into a dict keyed by file path
data = {file: pd.read_pickle(file) for file in csv_results}

# Load the vision files into a dict
vision_data = {file: pd.read_pickle(file) for file in vision_results}

# Load the CoT text files into a dict
cot_text_data = {file: pd.read_pickle(file) for file in cot_text_results}

# Load the CoT vision files into a dict
cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results}


def calculate_accuracy(df):
    # parsed_judge_response is 1 for a correct answer, 0 otherwise
    return df["parsed_judge_response"].mean() * 100


def accuracy_breakdown(df):
    # Per-difficulty accuracy across the four levels
    return (
        df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100
    ).values


# Define the column names with icons
headers_with_icons = [
    "🤖 Model Name",
    "⭐ Overall",
    "📈 Level 1",
    "🔍 Level 2",
    "📘 Level 3",
    "🔬 Level 4",
]

column_names = [
    "Model Name",
    "Overall Accuracy",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
    "Level 4 Accuracy",
]


# Build one leaderboard row per model: overall accuracy plus per-level breakdown
def process_data(data):
    data_for_df = []
    for file, df in data.items():
        overall_accuracy = round(calculate_accuracy(df), 2)
        breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
        model_name = file.split("/")[-1].replace(".pkl", "")
        data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
    return data_for_df


# Process all data
text_data_for_df = process_data(data)
vision_data_for_df = process_data(vision_data)
cot_text_data_for_df = process_data(cot_text_data)
cot_vision_data_for_df = process_data(cot_vision_data)

# Create DataFrames
accuracy_df = pd.DataFrame(text_data_for_df, columns=column_names)
vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
cot_text_accuracy_df = pd.DataFrame(cot_text_data_for_df, columns=column_names)
cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names)


# Sort by overall accuracy, then render every numeric cell as a one-decimal string
def finalize_df(df):
    df = df.round(1)
    # Sort while the values are still numeric; sorting after the string
    # formatting below would order lexicographically (e.g. "9.5" above "10.0")
    df.sort_values(by="Overall Accuracy", ascending=False, inplace=True)
    df = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
    df.columns = headers_with_icons
    return df


# Finalize all DataFrames
accuracy_df = finalize_df(accuracy_df)
vision_accuracy_df = finalize_df(vision_accuracy_df)
cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)
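
# The loaders below assume each results directory also holds one heatmap
# image per model, named after the same stem as the .pkl file
# (e.g. results/<Model Name>.jpg); selecting a model-name cell in a
# leaderboard loads the matching image.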
def load_heatmap(evt: gr.SelectData):
    # evt.value is the clicked cell's content; the image lookup only succeeds
    # when the user clicks the model-name cell
    heatmap_image = gr.Image(f"results/{evt.value}.jpg")
    return heatmap_image


def load_vision_heatmap(evt: gr.SelectData):
    heatmap_image = gr.Image(f"results-vision/{evt.value}.jpg")
    return heatmap_image


def load_cot_heatmap(evt: gr.SelectData):
    heatmap_image = gr.Image(f"results-cot/{evt.value}.jpg")
    return heatmap_image


def load_cot_vision_heatmap(evt: gr.SelectData):
    heatmap_image = gr.Image(f"results-vision-CoT/{evt.value}.jpg")
    return heatmap_image


def calculate_order_by_first_substring(selected_models):
    """Score text-only models on the subset of FSMs whose first substring
    every selected model answered correctly."""
    # Restrict to the first substring of each query, text-only results only
    first_columns = all_data[all_data["substring_index"] == 1]
    query_ids_df = first_columns[first_columns["Model Type"] == "Text Only"]
    query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)]
    # Keep only queries that every selected model got right
    query_ids_df = query_ids_df.groupby("query_id").filter(
        lambda x: x["parsed_judge_response"].eq(1).all()
    )
    fsm_ids = query_ids_df.fsm_id.unique()

    # Re-score all text-only models on the retained FSMs
    text_only = all_data[all_data["Model Type"] == "Text Only"]
    text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)]
    query_ids = text_only_filtered.query_id.unique()

    text_only_filtered = (
        text_only_filtered.groupby(["Model Name"])["parsed_judge_response"]
        .mean()
        .reset_index()
    )
    text_only_filtered["Accuracy"] = (
        text_only_filtered["parsed_judge_response"] * 100
    ).round(2)
    text_only_filtered.drop("parsed_judge_response", axis=1, inplace=True)
    text_only_filtered.sort_values("Accuracy", ascending=False, inplace=True)

    number_of_queries = len(query_ids)
    number_of_fsms = len(fsm_ids)

    return text_only_filtered, number_of_queries, number_of_fsms
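
# A minimal sketch of the groupby-filter idiom above, using hypothetical data:
#
#   df = pd.DataFrame({"query_id": [1, 1, 2],
#                      "parsed_judge_response": [1, 1, 0]})
#   df.groupby("query_id").filter(lambda x: x["parsed_judge_response"].eq(1).all())
#
# keeps only the rows for query_id 1, the sole group whose responses are all 1.
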
with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")

    with gr.Tab("Text-only Benchmark"):
        gr.Markdown("# Text-only Leaderboard")
        leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image = gr.Image(label="", show_label=False)
        leader_board.select(fn=load_heatmap, outputs=[heatmap_image])

    with gr.Tab("Vision Benchmark"):
        gr.Markdown("# Vision Benchmark Leaderboard")
        leader_board_vision = gr.Dataframe(
            vision_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_vision = gr.Image(label="", show_label=False)
        leader_board_vision.select(
            fn=load_vision_heatmap, outputs=[heatmap_image_vision]
        )

    with gr.Tab("CoT Text-only Benchmark"):
        gr.Markdown("# CoT Text-only Leaderboard")
        cot_leader_board_text = gr.Dataframe(
            cot_text_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        cot_heatmap_image_text = gr.Image(label="", show_label=False)
        cot_leader_board_text.select(
            fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
        )

    with gr.Tab("CoT Vision Benchmark"):
        gr.Markdown("# CoT Vision Benchmark Leaderboard")
        cot_leader_board_vision = gr.Dataframe(
            cot_vision_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        cot_heatmap_image_vision = gr.Image(label="", show_label=False)
        cot_leader_board_vision.select(
            fn=load_cot_vision_heatmap, outputs=[cot_heatmap_image_vision]
        )

    with gr.Tab("Constraint Text-only Results"):
        gr.Markdown("## Constraint Text-only Leaderboard by first substring")
        included_models = gr.CheckboxGroup(
            label="Models to include",
            choices=all_text_only_model_names,
            value=all_text_only_model_names,
            interactive=True,
        )
        with gr.Row():
            number_of_queries = gr.Textbox(label="Number of included queries")
            number_of_fsms = gr.Textbox(label="Number of included FSMs")
        constrained_leader_board_text = gr.Dataframe()
        included_models.select(
            fn=calculate_order_by_first_substring,
            inputs=[included_models],
            outputs=[constrained_leader_board_text, number_of_queries, number_of_fsms],
            queue=True,
        )

demo.launch()
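
# Expected on-disk layout, inferred from the globs and image paths above:
#   results/<model>.pkl             results/<model>.jpg
#   results-vision/<model>.pkl      results-vision/<model>.jpg
#   results-cot/<model>.pkl         results-cot/<model>.jpg
#   results-vision-CoT/<model>.pkl  results-vision-CoT/<model>.jpg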