update
app.py
CHANGED
@@ -34,8 +34,10 @@ all_data = pd.concat(
     [data, vision_data, cot_text_data, cot_vision_data], ignore_index=True
 )

-all_model_names = all_data[
-all_text_only_model_names = list(
+all_model_names = all_data["Model Name"].unique()
+all_text_only_model_names = list(
+    all_data[all_data["Model Type"] == "Text Only"]["Model Name"].unique()
+)
 print(all_text_only_model_names)

 ## Continue with the cold code --
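For context on this first hunk: the added lines build the list of text-only model names that later seeds the checkbox group in the constrained-results tab. A minimal, self-contained sketch of the same pandas pattern (the sample rows are invented; only the column names come from the diff):

```python
import pandas as pd

# Invented stand-in for all_data; only the two columns the diff touches.
all_data = pd.DataFrame(
    {
        "Model Name": ["gpt-4", "gpt-4v", "llama-2"],
        "Model Type": ["Text Only", "Vision", "Text Only"],
    }
)

all_model_names = all_data["Model Name"].unique()
all_text_only_model_names = list(
    all_data[all_data["Model Type"] == "Text Only"]["Model Name"].unique()
)
print(all_text_only_model_names)  # ['gpt-4', 'llama-2']
```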
@@ -118,8 +120,6 @@ cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
 cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)


-
-
 def load_heatmap(evt: gr.SelectData):
     heatmap_image = gr.Image(f"results/{evt.value}.jpg")
     return heatmap_image
@@ -142,46 +142,49 @@ def load_cot_vision_heatmap(evt: gr.SelectData):

 def calculate_order_by_first_substring(selected_models):

-    first_columns = all_data[all_data[
-    query_ids_df = first_columns[first_columns[
-
-
-    # Filter to include only the selected models
-    query_ids_df = query_ids_df[query_ids_df['Model Name'].isin(selected_models)]
+    first_columns = all_data[all_data["substring_index"] == 1]
+    query_ids_df = first_columns[first_columns["Model Type"] == "Text Only"]
+    query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)]

     print(len(query_ids_df))

-    query_ids_df = query_ids_df.groupby(
+    query_ids_df = query_ids_df.groupby("query_id").filter(
+        lambda x: x["parsed_judge_response"].eq(1).all()
+    )

     print(len(query_ids_df))

     query_ids = query_ids_df.query_id.unique()
-    # print('query_ids', len(query_ids))
-
-    # filter out fsm_ids and
     fsm_ids = query_ids_df.fsm_id.unique()
-    print(
-
-
-
-
-
-
-
-
-
-
-
-
-    text_only_filtered
-
-
-
+    print(
+        "fsm_ids",
+        len(fsm_ids),
+        "Total of 25 FSM is solvable by everything on the first substring",
+    )
+
+    text_only = all_data[all_data["Model Type"] == "Text Only"]
+    text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)]
+
+    print(
+        f"Number of query_ids from text_only_filtered: {len(text_only_filtered.query_id.unique())}"
+    )
+
+    text_only_filtered = (
+        text_only_filtered.groupby(["Model Name"])["parsed_judge_response"]
+        .mean()
+        .reset_index()
+    )
+    text_only_filtered["Accuracy"] = text_only_filtered["parsed_judge_response"] * 100
+    text_only_filtered.drop("parsed_judge_response", axis=1, inplace=True)
+
+    text_only_filtered["Accuracy"] = text_only_filtered["Accuracy"].apply(
+        lambda x: round(x, 2)
+    )
+    text_only_filtered.sort_values("Accuracy", ascending=False, inplace=True)

     return text_only_filtered


-
 with gr.Blocks() as demo:
     gr.Markdown("# FSM Benchmark Leaderboard")
     with gr.Tab("Text-only Benchmark"):
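The core of the rewritten calculate_order_by_first_substring is the groupby/filter step: it keeps only the query_ids for which every row in the group (i.e. every model) has parsed_judge_response == 1, and the surviving fsm_ids then define the constrained leaderboard. A small sketch of that filtering step on made-up rows, assuming the column layout shown in the diff:

```python
import pandas as pd

# Made-up rows mirroring the columns the function relies on.
df = pd.DataFrame(
    {
        "query_id": [1, 1, 2, 2],
        "Model Name": ["model-a", "model-b", "model-a", "model-b"],
        "parsed_judge_response": [1, 1, 1, 0],
        "fsm_id": [10, 10, 20, 20],
    }
)

# Keep only query_ids answered correctly by every model in the group.
solved_by_all = df.groupby("query_id").filter(
    lambda g: g["parsed_judge_response"].eq(1).all()
)
print(solved_by_all.query_id.unique())  # [1]
print(solved_by_all.fsm_id.unique())    # [10]
```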
@@ -227,11 +230,16 @@ with gr.Blocks() as demo:
     with gr.Tab("Constraint Text-only Results"):
         gr.Markdown("## Constraint Text-only Leaderboard by first substring")
         included_models = gr.CheckboxGroup(
-            label="Models to include",
+            label="Models to include",
+            choices=all_text_only_model_names,
+            value=all_text_only_model_names,
         )
         constrained_leader_board_text = gr.Dataframe()

-
-
+    included_models.input(
+        fn=calculate_order_by_first_substring,
+        inputs=[included_models],
+        outputs=[constrained_leader_board_text],
+    )

 demo.launch()
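The last hunk wires the checkbox group to the leaderboard: whenever the user changes the selection, the callback recomputes the dataframe shown in the tab. A stripped-down sketch of that wiring pattern, with a stub callback standing in for calculate_order_by_first_substring and invented model names:

```python
import gradio as gr
import pandas as pd

model_names = ["model-a", "model-b"]  # stand-in for all_text_only_model_names

def recompute(selected_models):
    # Stub: the real app calls calculate_order_by_first_substring here.
    return pd.DataFrame(
        {"Model Name": selected_models, "Accuracy": [0.0] * len(selected_models)}
    )

with gr.Blocks() as demo:
    included = gr.CheckboxGroup(
        label="Models to include",
        choices=model_names,
        value=model_names,
    )
    board = gr.Dataframe()
    # .input fires when the user changes the selection in the browser.
    included.input(fn=recompute, inputs=[included], outputs=[board])

demo.launch()
```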