taesiri committed
Commit da19c93 · 1 Parent(s): c1f8e27
Files changed (1):
  1. app.py (+43 -35)

app.py CHANGED
```diff
@@ -34,8 +34,10 @@ all_data = pd.concat(
     [data, vision_data, cot_text_data, cot_vision_data], ignore_index=True
 )
 
-all_model_names = all_data['Model Name'].unique()
-all_text_only_model_names = list(all_data[all_data['Model Type'] == 'Text Only']['Model Name'].unique())
+all_model_names = all_data["Model Name"].unique()
+all_text_only_model_names = list(
+    all_data[all_data["Model Type"] == "Text Only"]["Model Name"].unique()
+)
 print(all_text_only_model_names)
 
 ## Continue with the cold code --
```
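This hunk is a pure reformat (Black-style wrapping plus a switch to double quotes); behavior is unchanged. A minimal sketch with made-up rows, showing what the two lookups produce:

```python
# Toy illustration of the two lookups above; the model names here are
# invented, not from the real benchmark data.
import pandas as pd

all_data = pd.DataFrame(
    {
        "Model Name": ["model-a", "model-a", "model-b", "model-c"],
        "Model Type": ["Text Only", "Text Only", "Vision", "Text Only"],
    }
)

all_model_names = all_data["Model Name"].unique()
# -> array(['model-a', 'model-b', 'model-c'], dtype=object)

all_text_only_model_names = list(
    all_data[all_data["Model Type"] == "Text Only"]["Model Name"].unique()
)
# -> ['model-a', 'model-c']
```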
```diff
@@ -118,8 +120,6 @@ cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
 cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)
 
 
-
-
 def load_heatmap(evt: gr.SelectData):
     heatmap_image = gr.Image(f"results/{evt.value}.jpg")
     return heatmap_image
```
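This hunk only drops stray blank lines, but the surrounding `load_heatmap` context shows the event-handler pattern used throughout the app. A self-contained sketch of the likely wiring; the `.select` hookup itself is outside this diff, so treat it as an assumption:

```python
# Sketch of the gr.SelectData pattern: clicking a Dataframe cell fires .select,
# Gradio injects the event through the `evt: gr.SelectData` annotation, and the
# handler maps the clicked value to an image path. The .select wiring below is
# assumed; this diff only shows the handler.
import gradio as gr

def load_heatmap(evt: gr.SelectData):
    return gr.Image(f"results/{evt.value}.jpg")

with gr.Blocks() as demo:
    leaderboard = gr.Dataframe()
    heatmap = gr.Image()
    leaderboard.select(fn=load_heatmap, outputs=[heatmap])

demo.launch()
```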
```diff
@@ -142,46 +142,49 @@ def load_cot_vision_heatmap(evt: gr.SelectData):
 
 def calculate_order_by_first_substring(selected_models):
 
-    first_columns = all_data[all_data['substring_index'] == 1]
-    query_ids_df = first_columns[first_columns['Model Type'] == 'Text Only']
-
-
-    # Filter to include only the selected models
-    query_ids_df = query_ids_df[query_ids_df['Model Name'].isin(selected_models)]
+    first_columns = all_data[all_data["substring_index"] == 1]
+    query_ids_df = first_columns[first_columns["Model Type"] == "Text Only"]
+    query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)]
 
     print(len(query_ids_df))
 
-    query_ids_df = query_ids_df.groupby('query_id').filter(lambda x: x['parsed_judge_response'].eq(1).all())
+    query_ids_df = query_ids_df.groupby("query_id").filter(
+        lambda x: x["parsed_judge_response"].eq(1).all()
+    )
 
     print(len(query_ids_df))
 
     query_ids = query_ids_df.query_id.unique()
-    # print('query_ids', len(query_ids))
-
-    # filter out fsm_ids and
     fsm_ids = query_ids_df.fsm_id.unique()
-    print('fsm_ids', len(fsm_ids), "Total of 25 FSM is solvable by everything on the first substring")
-
-
-    # now filter all_data for query_ids and text only, then calcaulte the accuracy based on the parsed_judge_response for each model
-
-    text_only = all_data[all_data['Model Type'] == 'Text Only']
-    text_only_filtered = text_only[text_only['fsm_id'].isin(fsm_ids)]
-    # print # of query_ids from text_only_filtered
-    print(f"Number of query_ids from text_only_filtered: {len(text_only_filtered.query_id.unique())}")
-
-    text_only_filtered = text_only_filtered.groupby(['Model Name'])['parsed_judge_response'].mean().reset_index()
-    text_only_filtered['Accuracy'] = text_only_filtered['parsed_judge_response'] * 100
-    text_only_filtered.drop('parsed_judge_response', axis=1, inplace=True)
-    text_only_filtered.sort_values('Accuracy', ascending=False)
-
-    # round to two decimal places
-    text_only_filtered['Accuracy'] = text_only_filtered['Accuracy'].apply(lambda x: round(x, 2))
+    print(
+        "fsm_ids",
+        len(fsm_ids),
+        "Total of 25 FSM is solvable by everything on the first substring",
+    )
+
+    text_only = all_data[all_data["Model Type"] == "Text Only"]
+    text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)]
+
+    print(
+        f"Number of query_ids from text_only_filtered: {len(text_only_filtered.query_id.unique())}"
+    )
+
+    text_only_filtered = (
+        text_only_filtered.groupby(["Model Name"])["parsed_judge_response"]
+        .mean()
+        .reset_index()
+    )
+    text_only_filtered["Accuracy"] = text_only_filtered["parsed_judge_response"] * 100
+    text_only_filtered.drop("parsed_judge_response", axis=1, inplace=True)
+
+    text_only_filtered["Accuracy"] = text_only_filtered["Accuracy"].apply(
+        lambda x: round(x, 2)
+    )
+    text_only_filtered.sort_values("Accuracy", ascending=False, inplace=True)
 
     return text_only_filtered
 
 
-
 with gr.Blocks() as demo:
     gr.Markdown("# FSM Benchmark Leaderboard")
     with gr.Tab("Text-only Benchmark"):
```
```diff
@@ -227,11 +230,16 @@ with gr.Blocks() as demo:
     with gr.Tab("Constraint Text-only Results"):
         gr.Markdown("## Constraint Text-only Leaderboard by first substring")
         included_models = gr.CheckboxGroup(
-            label="Models to include", choices=all_text_only_model_names, value=all_text_only_model_names
+            label="Models to include",
+            choices=all_text_only_model_names,
+            value=all_text_only_model_names,
         )
         constrained_leader_board_text = gr.Dataframe()
 
-
-    included_models.input(fn=calculate_order_by_first_substring, inputs=[included_models], outputs=[constrained_leader_board_text])
+    included_models.input(
+        fn=calculate_order_by_first_substring,
+        inputs=[included_models],
+        outputs=[constrained_leader_board_text],
+    )
 
 demo.launch()
```
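The same change, shown runnable in isolation: a `CheckboxGroup` whose `.input` event recomputes the leaderboard `Dataframe`. Component names mirror the app; the stub callback and model list are invented:

```python
# Minimal sketch of the CheckboxGroup -> Dataframe wiring added above.
# The recompute stub stands in for calculate_order_by_first_substring.
import gradio as gr
import pandas as pd

all_text_only_model_names = ["model-a", "model-b"]  # invented

def recompute(selected_models):
    return pd.DataFrame({"Model Name": selected_models})

with gr.Blocks() as demo:
    included_models = gr.CheckboxGroup(
        label="Models to include",
        choices=all_text_only_model_names,
        value=all_text_only_model_names,
    )
    constrained_leader_board_text = gr.Dataframe()
    included_models.input(
        fn=recompute,
        inputs=[included_models],
        outputs=[constrained_leader_board_text],
    )

demo.launch()
```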
 