update
app.py
CHANGED
@@ -34,8 +34,10 @@ all_data = pd.concat(
     [data, vision_data, cot_text_data, cot_vision_data], ignore_index=True
 )

-all_model_names = all_data[
-all_text_only_model_names = list(
+all_model_names = all_data["Model Name"].unique()
+all_text_only_model_names = list(
+    all_data[all_data["Model Type"] == "Text Only"]["Model Name"].unique()
+)
 print(all_text_only_model_names)

 ## Continue with the cold code --
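For context on this first hunk: the added lines build the list of text-only model names that later seeds the checkbox group in the constrained-results tab. A minimal, self-contained sketch of the same pandas pattern (the sample rows are invented; only the column names come from the diff):

```python
import pandas as pd

# Invented stand-in for all_data; only the two columns the diff touches.
all_data = pd.DataFrame(
    {
        "Model Name": ["gpt-4", "gpt-4v", "llama-2"],
        "Model Type": ["Text Only", "Vision", "Text Only"],
    }
)

all_model_names = all_data["Model Name"].unique()
all_text_only_model_names = list(
    all_data[all_data["Model Type"] == "Text Only"]["Model Name"].unique()
)
print(all_text_only_model_names)  # ['gpt-4', 'llama-2']
```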
@@ -118,8 +120,6 @@ cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
 cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)


-
-
 def load_heatmap(evt: gr.SelectData):
     heatmap_image = gr.Image(f"results/{evt.value}.jpg")
     return heatmap_image
@@ -142,46 +142,49 @@ def load_cot_vision_heatmap(evt: gr.SelectData):

 def calculate_order_by_first_substring(selected_models):

-    first_columns = all_data[all_data[
-    query_ids_df = first_columns[first_columns[
-
-
-    # Filter to include only the selected models
-    query_ids_df = query_ids_df[query_ids_df['Model Name'].isin(selected_models)]
+    first_columns = all_data[all_data["substring_index"] == 1]
+    query_ids_df = first_columns[first_columns["Model Type"] == "Text Only"]
+    query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)]

     print(len(query_ids_df))

-    query_ids_df = query_ids_df.groupby(
+    query_ids_df = query_ids_df.groupby("query_id").filter(
+        lambda x: x["parsed_judge_response"].eq(1).all()
+    )

     print(len(query_ids_df))

     query_ids = query_ids_df.query_id.unique()
-    # print('query_ids', len(query_ids))
-
-    # filter out fsm_ids and
     fsm_ids = query_ids_df.fsm_id.unique()
-    print(
-
-
-
-
-
-
-
-
-
-
-
-
-    text_only_filtered
-
-
-
+    print(
+        "fsm_ids",
+        len(fsm_ids),
+        "Total of 25 FSM is solvable by everything on the first substring",
+    )
+
+    text_only = all_data[all_data["Model Type"] == "Text Only"]
+    text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)]
+
+    print(
+        f"Number of query_ids from text_only_filtered: {len(text_only_filtered.query_id.unique())}"
+    )
+
+    text_only_filtered = (
+        text_only_filtered.groupby(["Model Name"])["parsed_judge_response"]
+        .mean()
+        .reset_index()
+    )
+    text_only_filtered["Accuracy"] = text_only_filtered["parsed_judge_response"] * 100
+    text_only_filtered.drop("parsed_judge_response", axis=1, inplace=True)
+
+    text_only_filtered["Accuracy"] = text_only_filtered["Accuracy"].apply(
+        lambda x: round(x, 2)
+    )
+    text_only_filtered.sort_values("Accuracy", ascending=False, inplace=True)

     return text_only_filtered


-
 with gr.Blocks() as demo:
     gr.Markdown("# FSM Benchmark Leaderboard")
     with gr.Tab("Text-only Benchmark"):
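The core of the rewritten calculate_order_by_first_substring is the groupby/filter step: it keeps only the query_ids for which every row in the group (i.e. every model) has parsed_judge_response == 1, and the surviving fsm_ids then define the constrained leaderboard. A small sketch of that filtering step on made-up rows, assuming the column layout shown in the diff:

```python
import pandas as pd

# Made-up rows mirroring the columns the function relies on.
df = pd.DataFrame(
    {
        "query_id": [1, 1, 2, 2],
        "Model Name": ["model-a", "model-b", "model-a", "model-b"],
        "parsed_judge_response": [1, 1, 1, 0],
        "fsm_id": [10, 10, 20, 20],
    }
)

# Keep only query_ids answered correctly by every model in the group.
solved_by_all = df.groupby("query_id").filter(
    lambda g: g["parsed_judge_response"].eq(1).all()
)
print(solved_by_all.query_id.unique())  # [1]
print(solved_by_all.fsm_id.unique())    # [10]
```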
@@ -227,11 +230,16 @@ with gr.Blocks() as demo:
     with gr.Tab("Constraint Text-only Results"):
         gr.Markdown("## Constraint Text-only Leaderboard by first substring")
         included_models = gr.CheckboxGroup(
-            label="Models to include",
+            label="Models to include",
+            choices=all_text_only_model_names,
+            value=all_text_only_model_names,
         )
         constrained_leader_board_text = gr.Dataframe()

-
-
+    included_models.input(
+        fn=calculate_order_by_first_substring,
+        inputs=[included_models],
+        outputs=[constrained_leader_board_text],
+    )

 demo.launch()
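The last hunk wires the checkbox group to the leaderboard: whenever the user changes the selection, the callback recomputes the dataframe shown in the tab. A stripped-down sketch of that wiring pattern, with a stub callback standing in for calculate_order_by_first_substring and invented model names:

```python
import gradio as gr
import pandas as pd

model_names = ["model-a", "model-b"]  # stand-in for all_text_only_model_names

def recompute(selected_models):
    # Stub: the real app calls calculate_order_by_first_substring here.
    return pd.DataFrame(
        {"Model Name": selected_models, "Accuracy": [0.0] * len(selected_models)}
    )

with gr.Blocks() as demo:
    included = gr.CheckboxGroup(
        label="Models to include",
        choices=model_names,
        value=model_names,
    )
    board = gr.Dataframe()
    # .input fires when the user changes the selection in the browser.
    included.input(fn=recompute, inputs=[included], outputs=[board])

demo.launch()
```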