update
- .gitattributes +4 -0
- app.py +47 -28
- results-vision/gpt-4v-vision-preview.jpg +3 -0
- results-vision/gpt-4v-vision-preview.pkl +3 -0
- results-vision/gpt-4v-vision-preview.png +3 -0
.gitattributes
CHANGED
@@ -81,3 +81,7 @@ results/Qwen1.5-72B-Chat.pkl filter=lfs diff=lfs merge=lfs -text
 results/Claude-3-Opus.pkl filter=lfs diff=lfs merge=lfs -text
 results/Claude-3-Opus.png filter=lfs diff=lfs merge=lfs -text
 results/GPT-4-0125-preview.png filter=lfs diff=lfs merge=lfs -text
+results/GPT-4-0125-preview.pkl filter=lfs diff=lfs merge=lfs -text
+results-vision/gpt-4v-vision-preview.png filter=lfs diff=lfs merge=lfs -text
+results-vision/gpt-4v-vision-preview.jpg filter=lfs diff=lfs merge=lfs -text
+results-vision/gpt-4v-vision-preview.pkl filter=lfs diff=lfs merge=lfs -text
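These four attribute lines route the new vision artifacts through Git LFS, so a fresh clone holds small text pointers until `git lfs pull` fetches the real binaries. As a hedged aside (nothing below is part of the commit), a pointer file is easy to recognize from Python by its first line:

# Minimal sketch: detect whether a file is still an un-fetched Git LFS pointer.
# The spec URL is the standard LFS pointer header; the path is just one of the
# files added in this commit.
from pathlib import Path

def is_lfs_pointer(path: str) -> bool:
    head = Path(path).read_bytes()[:120]
    return head.startswith(b"version https://git-lfs.github.com/spec/")

print(is_lfs_pointer("results-vision/gpt-4v-vision-preview.pkl"))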
app.py
CHANGED
@@ -2,21 +2,23 @@ import gradio as gr
 import pandas as pd
 from glob import glob
 
-
+# Load text benchmark results
 csv_results = glob("results/*.pkl")
-#
-
+# Load vision benchmark results
+vision_results = glob("results-vision/*.pkl")
 
+# Load the csv files into a dict with keys being name of the file and values being the data
+data = {file: pd.read_pickle(file) for file in csv_results}
+# Load the vision files into a dict
+vision_data = {file: pd.read_pickle(file) for file in vision_results}
 
 def calculate_accuracy(df):
     return df["parsed_judge_response"].mean() * 100
 
-
 def accuracy_breakdown(df):
     # 4 level accuracy
     return (df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values
 
-
 # Define the column names with icons
 headers_with_icons = [
     "🤖 Model Name",
@@ -27,25 +29,16 @@ headers_with_icons = [
     "💬 Level 4",
 ]
 
-
+# Process text benchmark data
 accuracy = {file: calculate_accuracy(data[file]) for file in data}
-
-# Create a list to hold the data
 data_for_df = []
-# Define the column names with icons
 
-# Iterate over each file and its corresponding DataFrame in the data dictionary
 for file, df in data.items():
-    # Get the overall accuracy and round it
    overall_accuracy = round(calculate_accuracy(df), 2)
-    # Get the breakdown accuracy and round each value
    breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
-
-    model_name = file.split("/")[-1].replace(".pkl", "")  # Corrected the file extension
-    # Append the data to the list
+    model_name = file.split("/")[-1].replace(".pkl", "")
    data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
 
-# Define the column names, adjust based on the number of difficulty levels you have
 column_names = [
     "Model Name",
     "Overall Accuracy",
@@ -55,34 +48,60 @@ column_names = [
     "Level 4 Accuracy",
 ]
 
-#
+# accuracy_df = pd.DataFrame(data_for_df, columns=column_names)
+# accuracy_df.columns = headers_with_icons
+# accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+
+# After creating the DataFrame and before sorting
 accuracy_df = pd.DataFrame(data_for_df, columns=column_names)
+accuracy_df = accuracy_df.round(1)  # Round to one decimal place
+accuracy_df = accuracy_df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
 accuracy_df.columns = headers_with_icons
 accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
 
 
+# Process vision benchmark data
+vision_data_for_df = []
+
+for file, df in vision_data.items():
+    overall_accuracy = round(calculate_accuracy(df), 2)
+    breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
+    model_name = file.split("/")[-1].replace(".pkl", "")
+    vision_data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
+
+# vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
+# vision_accuracy_df.columns = headers_with_icons
+# vision_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+
+# Do the same for vision_accuracy_df
+vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
+vision_accuracy_df = vision_accuracy_df.round(1)  # Round to one decimal place
+vision_accuracy_df = vision_accuracy_df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
+vision_accuracy_df.columns = headers_with_icons
+vision_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+
 def load_heatmap(evt: gr.SelectData):
     heatmap_image = gr.Image(f"results/{evt.value}.jpg")
     return heatmap_image
 
+def load_vision_heatmap(evt: gr.SelectData):
+    heatmap_image = gr.Image(f"results-vision/{evt.value}.jpg")
+    return heatmap_image
 
 with gr.Blocks() as demo:
     gr.Markdown("# FSM Benchmark Leaderboard")
-    # add link to home page and dataset
     with gr.Tab("Text-only Benchmark"):
-
+        gr.Markdown("# Text-only Leaderboard")
         leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
-
         gr.Markdown("## Heatmap")
-
-
-
-        leader_board.select(fn=load_heatmap, outputs=[heatamp_image])
+        heatmap_image = gr.Image(label="", show_label=False)
+        leader_board.select(fn=load_heatmap, outputs=[heatmap_image])
 
     with gr.Tab("Vision Benchmark"):
-        gr.Markdown("#
-        leader_board_vision = gr.Dataframe()
+        gr.Markdown("# Vision Benchmark Leaderboard")
+        leader_board_vision = gr.Dataframe(vision_accuracy_df, headers=headers_with_icons)
         gr.Markdown("## Heatmap")
-
+        heatmap_image_vision = gr.Image(label="", show_label=False)
+        leader_board_vision.select(fn=load_vision_heatmap, outputs=[heatmap_image_vision])
 
-demo.launch()
+demo.launch()
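Two hedged notes on the new rounding block, offered as observations rather than changes to the commit: pandas deprecated DataFrame.applymap in favor of DataFrame.map as of 2.1, so the elementwise formatting may warn on newer installs; and because the formatting runs before sort_values, the "⭐ Overall" column is compared as strings. A small sketch with toy data (not from the Space) showing why sort-then-format is the safer order:

import pandas as pd

df = pd.DataFrame({"⭐ Overall": [9.9, 85.0, 42.5]})

# Format-then-sort (the order in the diff) ranks "9.9" above "85.0",
# because the cells are now strings and compare lexicographically.
bad = df.applymap(lambda x: f"{x:.1f}").sort_values(by="⭐ Overall", ascending=False)
print(list(bad["⭐ Overall"]))   # ['9.9', '85.0', '42.5']

# Sort while the column is still numeric, then format for display.
good = df.sort_values(by="⭐ Overall", ascending=False).applymap(lambda x: f"{x:.1f}")
print(list(good["⭐ Overall"]))  # ['85.0', '42.5', '9.9']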
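The heatmap wiring is the user-visible fix: the old tab referenced an undefined heatamp_image, while the new code creates the gr.Image component first and routes the Dataframe's select event into it. A minimal, self-contained sketch of that pattern, with toy data and hypothetical image paths, relying on Gradio's documented behavior that gr.SelectData.value carries the clicked cell's contents:

import gradio as gr
import pandas as pd

# Toy leaderboard; the model names double as image-file stems below.
scores = pd.DataFrame({"Model": ["model-a", "model-b"], "Overall": [61.0, 48.5]})

def load_heatmap(evt: gr.SelectData):
    # evt.value is the content of the clicked cell, e.g. "model-a".
    return gr.Image(f"results/{evt.value}.jpg")  # hypothetical path

with gr.Blocks() as demo:
    board = gr.Dataframe(scores)
    heatmap = gr.Image(show_label=False)
    # Clicking any cell fires .select and refreshes the image component.
    board.select(fn=load_heatmap, outputs=[heatmap])

demo.launch()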
results-vision/gpt-4v-vision-preview.jpg
ADDED
[binary image, stored as a Git LFS object]
results-vision/gpt-4v-vision-preview.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e98609b7b262836c49cdaf5d3dfc02b7037dc9fcc1f75a41d58a984015318759
+size 6363780
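Only this three-line pointer (spec version, content hash, byte size) lives in git; git lfs pull swaps in the actual ~6.4 MB pickle. A hedged sketch of reading it the way app.py does, assuming the DataFrame carries the parsed_judge_response and difficulty_level columns used there:

import pandas as pd

# Assumes `git lfs pull` has already materialized the real pickle.
df = pd.read_pickle("results-vision/gpt-4v-vision-preview.pkl")

# The same aggregation app.py applies to every results file.
overall = df["parsed_judge_response"].mean() * 100
per_level = df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100
print(f"overall accuracy: {overall:.1f}")
print(per_level.round(1))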
results-vision/gpt-4v-vision-preview.png
ADDED
[binary image, stored as a Git LFS object]