update
- .gitattributes +4 -0
- app.py +47 -28
- results-vision/gpt-4v-vision-preview.jpg +3 -0
- results-vision/gpt-4v-vision-preview.pkl +3 -0
- results-vision/gpt-4v-vision-preview.png +3 -0
.gitattributes
CHANGED
@@ -81,3 +81,7 @@ results/Qwen1.5-72B-Chat.pkl filter=lfs diff=lfs merge=lfs -text
 results/Claude-3-Opus.pkl filter=lfs diff=lfs merge=lfs -text
 results/Claude-3-Opus.png filter=lfs diff=lfs merge=lfs -text
 results/GPT-4-0125-preview.png filter=lfs diff=lfs merge=lfs -text
+results/GPT-4-0125-preview.pkl filter=lfs diff=lfs merge=lfs -text
+results-vision/gpt-4v-vision-preview.png filter=lfs diff=lfs merge=lfs -text
+results-vision/gpt-4v-vision-preview.jpg filter=lfs diff=lfs merge=lfs -text
+results-vision/gpt-4v-vision-preview.pkl filter=lfs diff=lfs merge=lfs -text
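These four attribute lines route the new vision artifacts through Git LFS, so a fresh clone holds small text pointers until `git lfs pull` fetches the real binaries. As a hedged aside (nothing below is part of the commit), a pointer file is easy to recognize from Python by its first line:

# Minimal sketch: detect whether a file is still an un-fetched Git LFS pointer.
# The spec URL is the standard LFS pointer header; the path is just one of the
# files added in this commit.
from pathlib import Path

def is_lfs_pointer(path: str) -> bool:
    head = Path(path).read_bytes()[:120]
    return head.startswith(b"version https://git-lfs.github.com/spec/")

print(is_lfs_pointer("results-vision/gpt-4v-vision-preview.pkl"))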
app.py
CHANGED
@@ -2,21 +2,23 @@ import gradio as gr
 import pandas as pd
 from glob import glob
 
-
+# Load text benchmark results
 csv_results = glob("results/*.pkl")
-#
-
+# Load vision benchmark results
+vision_results = glob("results-vision/*.pkl")
 
+# Load the csv files into a dict with keys being name of the file and values being the data
+data = {file: pd.read_pickle(file) for file in csv_results}
+# Load the vision files into a dict
+vision_data = {file: pd.read_pickle(file) for file in vision_results}
 
 def calculate_accuracy(df):
     return df["parsed_judge_response"].mean() * 100
 
-
 def accuracy_breakdown(df):
     # 4 level accuracy
     return (df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values
 
-
 # Define the column names with icons
 headers_with_icons = [
     "🤖 Model Name",
@@ -27,25 +29,16 @@ headers_with_icons = [
     "💬 Level 4",
 ]
 
-
+# Process text benchmark data
 accuracy = {file: calculate_accuracy(data[file]) for file in data}
-
-# Create a list to hold the data
 data_for_df = []
-# Define the column names with icons
 
-# Iterate over each file and its corresponding DataFrame in the data dictionary
 for file, df in data.items():
-    # Get the overall accuracy and round it
    overall_accuracy = round(calculate_accuracy(df), 2)
-    # Get the breakdown accuracy and round each value
    breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
-
-    model_name = file.split("/")[-1].replace(".pkl", "")  # Corrected the file extension
-    # Append the data to the list
+    model_name = file.split("/")[-1].replace(".pkl", "")
    data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
 
-# Define the column names, adjust based on the number of difficulty levels you have
 column_names = [
     "Model Name",
     "Overall Accuracy",
@@ -55,34 +48,60 @@ column_names = [
     "Level 4 Accuracy",
 ]
 
-#
+# accuracy_df = pd.DataFrame(data_for_df, columns=column_names)
+# accuracy_df.columns = headers_with_icons
+# accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+
+# After creating the DataFrame and before sorting
 accuracy_df = pd.DataFrame(data_for_df, columns=column_names)
+accuracy_df = accuracy_df.round(1)  # Round to one decimal place
+accuracy_df = accuracy_df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
 accuracy_df.columns = headers_with_icons
 accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
 
 
+# Process vision benchmark data
+vision_data_for_df = []
+
+for file, df in vision_data.items():
+    overall_accuracy = round(calculate_accuracy(df), 2)
+    breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
+    model_name = file.split("/")[-1].replace(".pkl", "")
+    vision_data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
+
+# vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
+# vision_accuracy_df.columns = headers_with_icons
+# vision_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+
+# Do the same for vision_accuracy_df
+vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
+vision_accuracy_df = vision_accuracy_df.round(1)  # Round to one decimal place
+vision_accuracy_df = vision_accuracy_df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
+vision_accuracy_df.columns = headers_with_icons
+vision_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+
 def load_heatmap(evt: gr.SelectData):
     heatmap_image = gr.Image(f"results/{evt.value}.jpg")
     return heatmap_image
 
+def load_vision_heatmap(evt: gr.SelectData):
+    heatmap_image = gr.Image(f"results-vision/{evt.value}.jpg")
+    return heatmap_image
 
 with gr.Blocks() as demo:
     gr.Markdown("# FSM Benchmark Leaderboard")
-    # add link to home page and dataset
     with gr.Tab("Text-only Benchmark"):
-
+        gr.Markdown("# Text-only Leaderboard")
         leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
-
         gr.Markdown("## Heatmap")
-
-
-
-        leader_board.select(fn=load_heatmap, outputs=[heatamp_image])
+        heatmap_image = gr.Image(label="", show_label=False)
+        leader_board.select(fn=load_heatmap, outputs=[heatmap_image])
 
     with gr.Tab("Vision Benchmark"):
-        gr.Markdown("#
-        leader_board_vision = gr.Dataframe()
+        gr.Markdown("# Vision Benchmark Leaderboard")
+        leader_board_vision = gr.Dataframe(vision_accuracy_df, headers=headers_with_icons)
         gr.Markdown("## Heatmap")
-
+        heatmap_image_vision = gr.Image(label="", show_label=False)
+        leader_board_vision.select(fn=load_vision_heatmap, outputs=[heatmap_image_vision])
 
-demo.launch()
+demo.launch()
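Two hedged notes on the new rounding block, offered as observations rather than changes to the commit: pandas deprecated DataFrame.applymap in favor of DataFrame.map as of 2.1, so the elementwise formatting may warn on newer installs; and because the formatting runs before sort_values, the "⭐ Overall" column is compared as strings. A small sketch with toy data (not from the Space) showing why sort-then-format is the safer order:

import pandas as pd

df = pd.DataFrame({"⭐ Overall": [9.9, 85.0, 42.5]})

# Format-then-sort (the order in the diff) ranks "9.9" above "85.0",
# because the cells are now strings and compare lexicographically.
bad = df.applymap(lambda x: f"{x:.1f}").sort_values(by="⭐ Overall", ascending=False)
print(list(bad["⭐ Overall"]))   # ['9.9', '85.0', '42.5']

# Sort while the column is still numeric, then format for display.
good = df.sort_values(by="⭐ Overall", ascending=False).applymap(lambda x: f"{x:.1f}")
print(list(good["⭐ Overall"]))  # ['85.0', '42.5', '9.9']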
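The heatmap wiring is the user-visible fix: the old tab referenced an undefined heatamp_image, while the new code creates the gr.Image component first and routes the Dataframe's select event into it. A minimal, self-contained sketch of that pattern, with toy data and hypothetical image paths, relying on Gradio's documented behavior that gr.SelectData.value carries the clicked cell's contents:

import gradio as gr
import pandas as pd

# Toy leaderboard; the model names double as image-file stems below.
scores = pd.DataFrame({"Model": ["model-a", "model-b"], "Overall": [61.0, 48.5]})

def load_heatmap(evt: gr.SelectData):
    # evt.value is the content of the clicked cell, e.g. "model-a".
    return gr.Image(f"results/{evt.value}.jpg")  # hypothetical path

with gr.Blocks() as demo:
    board = gr.Dataframe(scores)
    heatmap = gr.Image(show_label=False)
    # Clicking any cell fires .select and refreshes the image component.
    board.select(fn=load_heatmap, outputs=[heatmap])

demo.launch()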
results-vision/gpt-4v-vision-preview.jpg
ADDED
[binary image, stored as a Git LFS object]
results-vision/gpt-4v-vision-preview.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e98609b7b262836c49cdaf5d3dfc02b7037dc9fcc1f75a41d58a984015318759
+size 6363780
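Only this three-line pointer (spec version, content hash, byte size) lives in git; git lfs pull swaps in the actual ~6.4 MB pickle. A hedged sketch of reading it the way app.py does, assuming the DataFrame carries the parsed_judge_response and difficulty_level columns used there:

import pandas as pd

# Assumes `git lfs pull` has already materialized the real pickle.
df = pd.read_pickle("results-vision/gpt-4v-vision-preview.pkl")

# The same aggregation app.py applies to every results file.
overall = df["parsed_judge_response"].mean() * 100
per_level = df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100
print(f"overall accuracy: {overall:.1f}")
print(per_level.round(1))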
results-vision/gpt-4v-vision-preview.png
ADDED
[binary image, stored as a Git LFS object]