import gradio as gr
import pandas as pd
from glob import glob

# Collect the pickled benchmark results (one file per model).
text_results = glob("results/*.pkl")
vision_results = glob("results-vision/*.pkl")

# Load each pickle into a dict keyed by its file path.
data = {file: pd.read_pickle(file) for file in text_results}
vision_data = {file: pd.read_pickle(file) for file in vision_results}


def calculate_accuracy(df):
    # Overall accuracy: share of correctly judged responses, as a percentage.
    return df["parsed_judge_response"].mean() * 100


def accuracy_breakdown(df):
    # Accuracy per difficulty level (levels 1-4), as percentages.
    return (df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values


# Display column names with icons.
headers_with_icons = [
    "🤖 Model Name",
    "⭐ Overall",
    "📈 Level 1",
    "🔍 Level 2",
    "📘 Level 3",
    "🔬 Level 4",
]

column_names = [
    "Model Name",
    "Overall Accuracy",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
    "Level 4 Accuracy",
]


def build_leaderboard(results):
    # Turn a {path: DataFrame} dict into a sorted leaderboard DataFrame.
    rows = []
    for file, df in results.items():
        model_name = file.split("/")[-1].replace(".pkl", "")
        rows.append([model_name, calculate_accuracy(df)] + list(accuracy_breakdown(df)))
    board = pd.DataFrame(rows, columns=column_names)
    # Sort numerically *before* formatting: sorting the formatted strings
    # would be lexicographic and could rank "9.5" above "85.0".
    board.sort_values(by="Overall Accuracy", ascending=False, inplace=True)
    # Format scores to one decimal place for display.
    for col in column_names[1:]:
        board[col] = board[col].map(lambda x: f"{x:.1f}")
    board.columns = headers_with_icons
    return board


# Process text and vision benchmark data.
accuracy_df = build_leaderboard(data)
vision_accuracy_df = build_leaderboard(vision_data)


def load_heatmap(evt: gr.SelectData):
    # Assumes the selected cell holds the model name, which matches a
    # pre-rendered heatmap image stored next to the pickle.
    return gr.Image(f"results/{evt.value}.jpg")


def load_vision_heatmap(evt: gr.SelectData):
    return gr.Image(f"results-vision/{evt.value}.jpg")


with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")

    with gr.Tab("Text-only Benchmark"):
        gr.Markdown("# Text-only Leaderboard")
        leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image = gr.Image(label="", show_label=False)
        leader_board.select(fn=load_heatmap, outputs=[heatmap_image])

    with gr.Tab("Vision Benchmark"):
        gr.Markdown("# Vision Benchmark Leaderboard")
        leader_board_vision = gr.Dataframe(vision_accuracy_df, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image_vision = gr.Image(label="", show_label=False)
        leader_board_vision.select(fn=load_vision_heatmap, outputs=[heatmap_image_vision])

demo.launch()