update
app.py ADDED
@@ -0,0 +1,70 @@
+import gradio as gr
+import pandas as pd
+from glob import glob
+
+
+csv_results = glob("results/*.csv")
+# Load the CSV files into a dict keyed by file path, with the parsed data as values
+data = {file: pd.read_csv(file) for file in csv_results}
+
+
+def calculate_accuracy(df):
+    return df["parsed_judge_response"].mean() * 100
+
+
+def accuracy_breakdown(df):
+    # Per-level accuracy across the four difficulty levels
+    return (df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values
+
+
+# Define the column names with icons
+headers_with_icons = [
+    "🤖 Model Name",
+    "⭐ Overall",
+    "📗 Level 1",
+    "📘 Level 2",
+    "📙 Level 3",
+    "🔬 Level 4",
+]
+
+
+accuracy = {file: calculate_accuracy(data[file]) for file in data}
+
+# Create a list to hold one leaderboard row per results file
+data_for_df = []
+
+# Iterate over each file and its corresponding DataFrame in the data dictionary
+for file, df in data.items():
+    # Get the overall accuracy and round it
+    overall_accuracy = round(calculate_accuracy(df), 2)
+    # Get the per-level breakdown and round each value
+    breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
+    # Derive the model name from the file name
+    model_name = file.split("/")[-1].replace(".csv", "")
+    # Append the row: model name, overall accuracy, then the four level accuracies
+    data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
+
+# Define the plain column names; adjust these if the number of difficulty levels changes
+column_names = [
+    "Model Name",
+    "Overall Accuracy",
+    "Level 1 Accuracy",
+    "Level 2 Accuracy",
+    "Level 3 Accuracy",
+    "Level 4 Accuracy",
+]
+
+# Create the DataFrame, swap in the icon headers, and sort by overall accuracy
+accuracy_df = pd.DataFrame(data_for_df, columns=column_names)
+accuracy_df.columns = headers_with_icons
+accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+
+
+with gr.Blocks() as demo:
+    gr.Markdown("# FSMBench Leaderboard")
+    # TODO: add link to home page and dataset
+
+    leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
+
+demo.launch()
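
For context, the app assumes every CSV under results/ contains at least a parsed_judge_response column of 0/1 judge scores and a difficulty_level column covering all four levels; accuracy_breakdown returns one value per level present in a file, so a file missing a level would yield a row that no longer matches the six column names. Below is a minimal sketch of a compatible results file and a sanity check of the two helpers — the file name and sample scores are made up for illustration:

import pandas as pd

# Hypothetical sample: 0/1 judge scores across the four difficulty levels
sample = pd.DataFrame({
    "difficulty_level": [1, 1, 2, 2, 3, 3, 4, 4],
    "parsed_judge_response": [1, 0, 1, 1, 0, 1, 1, 0],
})
sample.to_csv("results/example-model.csv", index=False)

# Overall accuracy: mean of the 0/1 scores as a percentage
print(sample["parsed_judge_response"].mean() * 100)  # 62.5

# Per-level accuracy, one value per difficulty level in sorted order
print(sample.groupby("difficulty_level")["parsed_judge_response"].mean() * 100)

With such files in place, running python app.py serves the leaderboard locally on Gradio's default port (7860).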