taesiri committed
Commit e124052 · 1 Parent(s): 856df6f
.gitattributes CHANGED
@@ -81,3 +81,7 @@ results/Qwen1.5-72B-Chat.pkl filter=lfs diff=lfs merge=lfs -text
 results/Claude-3-Opus.pkl filter=lfs diff=lfs merge=lfs -text
 results/Claude-3-Opus.png filter=lfs diff=lfs merge=lfs -text
 results/GPT-4-0125-preview.png filter=lfs diff=lfs merge=lfs -text
+results/GPT-4-0125-preview.pkl filter=lfs diff=lfs merge=lfs -text
+results-vision/gpt-4v-vision-preview.png filter=lfs diff=lfs merge=lfs -text
+results-vision/gpt-4v-vision-preview.jpg filter=lfs diff=lfs merge=lfs -text
+results-vision/gpt-4v-vision-preview.pkl filter=lfs diff=lfs merge=lfs -text
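These new patterns route the vision-benchmark artifacts through Git LFS, so a fresh clone holds only pointer files until the objects are fetched. A minimal sanity-check sketch (not part of this commit), assuming the LFS objects have been pulled (e.g. `git lfs pull`); the column names are the ones `app.py` below relies on:

```python
from glob import glob

import pandas as pd

# Sketch: verify every LFS-tracked results pickle deserializes and has
# the columns app.py uses (parsed_judge_response, difficulty_level).
for path in glob("results/*.pkl") + glob("results-vision/*.pkl"):
    df = pd.read_pickle(path)
    missing = {"parsed_judge_response", "difficulty_level"} - set(df.columns)
    assert not missing, f"{path} is missing columns: {missing}"
    print(path, len(df), "rows")
```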
app.py CHANGED
@@ -2,21 +2,23 @@ import gradio as gr
 import pandas as pd
 from glob import glob
 
-
+# Load text benchmark results
 csv_results = glob("results/*.pkl")
-# load the csv files into a dict with keys being name of the file and values being the data
-data = {file: pd.read_pickle(file) for file in csv_results}
+# Load vision benchmark results
+vision_results = glob("results-vision/*.pkl")
 
+# Load the csv files into a dict with keys being name of the file and values being the data
+data = {file: pd.read_pickle(file) for file in csv_results}
+# Load the vision files into a dict
+vision_data = {file: pd.read_pickle(file) for file in vision_results}
 
 def calculate_accuracy(df):
     return df["parsed_judge_response"].mean() * 100
 
-
 def accuracy_breakdown(df):
     # 4 level accuracy
     return (df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values
 
-
 # Define the column names with icons
 headers_with_icons = [
     "🤖 Model Name",
@@ -27,25 +29,16 @@ headers_with_icons = [
     "🔬 Level 4",
 ]
 
-
+# Process text benchmark data
 accuracy = {file: calculate_accuracy(data[file]) for file in data}
-
-# Create a list to hold the data
 data_for_df = []
-# Define the column names with icons
 
-# Iterate over each file and its corresponding DataFrame in the data dictionary
 for file, df in data.items():
-    # Get the overall accuracy and round it
     overall_accuracy = round(calculate_accuracy(df), 2)
-    # Get the breakdown accuracy and round each value
     breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
-    # Prepare the model name from the file name
-    model_name = file.split("/")[-1].replace(".pkl", "")  # Corrected the file extension
-    # Append the data to the list
+    model_name = file.split("/")[-1].replace(".pkl", "")
    data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
 
-# Define the column names, adjust based on the number of difficulty levels you have
 column_names = [
     "Model Name",
     "Overall Accuracy",
@@ -55,34 +48,60 @@ column_names = [
     "Level 4 Accuracy",
 ]
 
-# Create the DataFrame
+# accuracy_df = pd.DataFrame(data_for_df, columns=column_names)
+# accuracy_df.columns = headers_with_icons
+# accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+
+# After creating the DataFrame and before sorting
 accuracy_df = pd.DataFrame(data_for_df, columns=column_names)
+accuracy_df = accuracy_df.round(1)  # Round to one decimal place
+accuracy_df = accuracy_df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
 accuracy_df.columns = headers_with_icons
 accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
 
 
+# Process vision benchmark data
+vision_data_for_df = []
+
+for file, df in vision_data.items():
+    overall_accuracy = round(calculate_accuracy(df), 2)
+    breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
+    model_name = file.split("/")[-1].replace(".pkl", "")
+    vision_data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
+
+# vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
+# vision_accuracy_df.columns = headers_with_icons
+# vision_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+
+# Do the same for vision_accuracy_df
+vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
+vision_accuracy_df = vision_accuracy_df.round(1)  # Round to one decimal place
+vision_accuracy_df = vision_accuracy_df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
+vision_accuracy_df.columns = headers_with_icons
+vision_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+
 def load_heatmap(evt: gr.SelectData):
     heatmap_image = gr.Image(f"results/{evt.value}.jpg")
     return heatmap_image
 
+def load_vision_heatmap(evt: gr.SelectData):
+    heatmap_image = gr.Image(f"results-vision/{evt.value}.jpg")
+    return heatmap_image
 
 with gr.Blocks() as demo:
     gr.Markdown("# FSM Benchmark Leaderboard")
-    # add link to home page and dataset
     with gr.Tab("Text-only Benchmark"):
-
+        gr.Markdown("# Text-only Leaderboard")
         leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
-
         gr.Markdown("## Heatmap")
-
-        heatamp_image = gr.Image(label="", show_label=False)
-
-        leader_board.select(fn=load_heatmap, outputs=[heatamp_image])
+        heatmap_image = gr.Image(label="", show_label=False)
+        leader_board.select(fn=load_heatmap, outputs=[heatmap_image])
 
     with gr.Tab("Vision Benchmark"):
-        gr.Markdown("# TBA")
-        leader_board_vision = gr.Dataframe()
+        gr.Markdown("# Vision Benchmark Leaderboard")
+        leader_board_vision = gr.Dataframe(vision_accuracy_df, headers=headers_with_icons)
         gr.Markdown("## Heatmap")
-        heatamp_image_vision = gr.Image(label="", show_label=False)
+        heatmap_image_vision = gr.Image(label="", show_label=False)
+        leader_board_vision.select(fn=load_vision_heatmap, outputs=[heatmap_image_vision])
 
-demo.launch()
+demo.launch()
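One caveat worth flagging in the new display formatting: `applymap` converts every numeric cell to a string before `sort_values` runs on "⭐ Overall", and string comparison is lexicographic, so a hypothetical score of 9.0 would rank above 85.0. A minimal self-contained repro and the safer ordering (sort while the column is still numeric, format afterwards), with made-up scores; on pandas ≥ 2.1, `DataFrame.map` is the non-deprecated spelling of `applymap`:

```python
import pandas as pd

# Minimal repro of the ordering pitfall. Column names mirror app.py;
# the model names and scores are invented for illustration.
column_names = ["Model Name", "Overall Accuracy"]
data_for_df = [["model-a", 85.0], ["model-b", 9.0]]
df = pd.DataFrame(data_for_df, columns=column_names)

# Formatting first turns the column into strings, so "9.0" > "85.0":
bad = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
print(bad.sort_values(by="Overall Accuracy", ascending=False).iloc[0, 0])  # model-b

# Sort on the numeric values, then format for display:
good = df.sort_values(by="Overall Accuracy", ascending=False)
good = good.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
print(good.iloc[0, 0])  # model-a
```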
results-vision/gpt-4v-vision-preview.jpg ADDED
Git LFS Details
  • SHA256: aed42e95b34a548d133fbb0b557b27a1633f66620feb20e971816571591f2659
  • Pointer size: 132 Bytes
  • Size of remote file: 1.33 MB
results-vision/gpt-4v-vision-preview.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e98609b7b262836c49cdaf5d3dfc02b7037dc9fcc1f75a41d58a984015318759
+size 6363780
results-vision/gpt-4v-vision-preview.png ADDED
Git LFS Details
  • SHA256: 6cf5130388a71fa198ec6094e3709dbfeedb242fb5eb04e823697ad3e4636246
  • Pointer size: 132 Bytes
  • Size of remote file: 1.01 MB
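The leaderboard's row labels come straight from the pickle file names, and `load_vision_heatmap` turns a selected label into `results-vision/<model>.jpg`, which is why this commit ships `gpt-4v-vision-preview.jpg` alongside the pickle. A small illustrative check of that naming contract (assumption: run from the repo root, not part of this commit):

```python
import os
from glob import glob

# Sketch: every vision results pickle should have a same-named heatmap,
# since load_vision_heatmap builds f"results-vision/{model}.jpg" from the
# selected model name.
for path in glob("results-vision/*.pkl"):
    model = path.split("/")[-1].replace(".pkl", "")  # mirrors app.py
    heatmap = f"results-vision/{model}.jpg"
    assert os.path.exists(heatmap), f"missing heatmap for {model}"
```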