Create app.py

app.py
ADDED
@@ -0,0 +1,228 @@
MODEL_INFO = ["Model"]
AVGACC = "Overall Acc."
TASK_INFO = [AVGACC, "Dynamic Perception", "State Transitions Perception", "Camera Movement Perception", "Explanatory Reasoning", "Counterfactual Reasoning", "Predictive Reasoning", "Comparison Reasoning", "Reasoning with External Knowledge", "Description"]

DATA_TITLE_TYPE = ["markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
CSV_DIR = "./file/result.csv"

COLUMN_NAMES = MODEL_INFO + TASK_INFO
GT_PATH = "./file/AUTO-EVAL-VIDEO.json"
JSON_DATASET_PATH = "./file/userdata.json"
LEADERBOARD_INTRODUCTION = """# AutoEval-Video Leaderboard

Welcome to the leaderboard of AutoEval-Video!
AutoEval-Video comprises 327 complex open-ended video-question instances that span nine skill dimensions, covering video-specific perception, comprehension, and generation skills. Please refer to our [paper]() for more details.
"""

SUBMIT_INTRODUCTION = """# Submit Introduction
For example, if you want to upload GPT-4V's results to the leaderboard, you need to:
1. Fill in 'GPT-4V' in 'Model Name' if this is your first submission. If you wish to update your model's results, add a version suffix to the model name, e.g. 'GPT-4V_v2'.
2. Upload results.json.
3. Click the 'Evaluate' button.
4. Click 'Refresh' to see the updated leaderboard.
5. The overall accuracy of your model will appear in the "Overall Acc." box. For results on each evaluation dimension, refer back to the leaderboard.
"""
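# Illustrative note (not part of the original commit): prediction_analyse() below
# expects each line of the uploaded results.json to be a standalone JSON object, e.g.
#     {"ID": 1, "prediction": "The man pours water into the cup.", "judge": "1"}
# The "judge" field ("0" or "1") is optional; when absent, alternate_judge() is called.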

TABLE_INTRODUCTION = """The table below shows the performance of various models across the evaluation dimensions of AutoEval-Video.
We use accuracy (%) as the primary evaluation metric for each dimension.
"""

CITATION_BUTTON_LABEL = "If you find AutoEval-Video useful for your research and applications, please copy the following snippet to cite these results:"
CITATION_BUTTON_TEXT = """"""
# Raw CSS passed to gr.Blocks(css=...) below; <style> tags are not needed there.
style = """
.dataframe-container {
    overflow-x: auto;
}
"""
import gradio as gr
import pandas as pd
import json
import os
from huggingface_hub import CommitScheduler, login
from tool import *
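# `tool` is not included in this commit; count_lines() and alternate_judge() used
# below are assumed to come from it. A hypothetical minimal count_lines:
#     def count_lines(path):
#         with open(path) as f:
#             return sum(1 for _ in f)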

login(token=os.environ.get("HF_TOKEN"), write_permission=True)


def get_result_df():
    df = pd.read_csv(CSV_DIR)[COLUMN_NAMES]
    df = df.sort_values(by=AVGACC, ascending=False)
    return df


def prediction_analyse(prediction_content, questiontype_list):
    predictions = prediction_content.split("\n")

    # Load the ground-truth annotations (one JSON object per line).
    ground_truth_data = []
    with open(GT_PATH, "r") as f:
        for line in f:
            ground_truth_data.append(json.loads(line.strip()))

    id2item = {str(item["ID"]): item for item in ground_truth_data}

    results = {i: {"correct": 0, "total": 0} for i in questiontype_list}

    for prediction in predictions:
        prediction = prediction.strip()
        if not prediction:
            continue
        try:
            prediction = json.loads(prediction)
        except json.JSONDecodeError:
            print(f"Warning: Skipping invalid JSON data in line: {prediction}")
            continue
        question_id = str(prediction["ID"])
        item_gt = id2item[question_id]
        rule = item_gt["Rule"]
        question_type = item_gt["Dimension"]

        pre_output = prediction["prediction"]
        if "judge" in prediction:
            judge_result_bit = prediction["judge"]
        else:
            # Fall back to the LLM judge when the submission carries no verdict.
            _, judge_result_bit = alternate_judge(rule, pre_output, os.environ.get("yuan_api"))
        assert judge_result_bit in ["0", "1"], "Invalid judge result bit!"
        if judge_result_bit == "1":
            results[question_type]["correct"] += 1

        results[question_type]["total"] += 1

    return results
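# For example, if two "Description" lines are submitted and one of them is judged
# correct, results["Description"] ends up as {"correct": 1, "total": 2}.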


scheduler = CommitScheduler(
    repo_id="AUTOEVAL-Video-Backup",
    private=True,
    repo_type="dataset",
    folder_path="./file",
    path_in_repo="data",
    every=1,
)
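# CommitScheduler pushes the contents of ./file to the private dataset repo about
# once a minute (`every` is in minutes); the writes below hold scheduler.lock so a
# scheduled commit never snapshots a half-written file.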

def save_json(modelname, user_dict_list):
    with open(JSON_DATASET_PATH, "a") as f:
        json.dump({modelname: user_dict_list}, f)
        f.write("\n")
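# Each call appends one record keyed by model name, holding the raw submitted
# lines, e.g. (illustrative) {"GPT-4V": ["{\"ID\": 1, \"prediction\": ...}", ...]}.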

def add_new_eval(
    input_file,
    model_name_textbox: str,
):
    if len(model_name_textbox) == 0:
        return "Error! Empty model name!", get_result_df()

    if input_file is None:
        return "Error! Empty file!", get_result_df()

    csv_data = pd.read_csv(CSV_DIR, dtype={"Model": str})
    model_name_list = list(csv_data["Model"])
    if model_name_textbox in model_name_list:
        return "A model with this name already exists on the leaderboard; duplicate submissions are not allowed.", get_result_df()

    questiontype = ["Dynamic Perception", "State Transitions Perception", "Camera Movement Perception", "Explanatory Reasoning", "Counterfactual Reasoning", "Predictive Reasoning", "Comparison Reasoning", "Reasoning with External Knowledge", "Description"]
    id2questiontype = dict(zip(range(1, 10), questiontype))
    content = input_file.decode("utf-8").strip()
    userdata = content.split("\n")
    if len(userdata) != count_lines(GT_PATH):
        return f"Error! The number of lines in the submitted file ({len(userdata)}) does not match the number of lines in AUTO-EVAL-VIDEO.json ({count_lines(GT_PATH)}).", get_result_df()

    prediction = prediction_analyse(content, questiontype)

    each_task_accuracy = {i: round(prediction[i]["correct"] / prediction[i]["total"] * 100, 1) for i in questiontype}

    total_correct_video = sum(prediction[i]["correct"] for i in questiontype)
    total_video = sum(prediction[i]["total"] for i in questiontype)

    average_accuracy_video = round(total_correct_video / total_video * 100, 1)
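    # Illustrative arithmetic: 218 correct out of the 327 instances gives
    # round(218 / 327 * 100, 1) == 66.7.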

    col = csv_data.shape[0]
    new_data = [
        model_name_textbox,
        average_accuracy_video,
        each_task_accuracy[id2questiontype[1]],
        each_task_accuracy[id2questiontype[2]],
        each_task_accuracy[id2questiontype[3]],
        each_task_accuracy[id2questiontype[4]],
        each_task_accuracy[id2questiontype[5]],
        each_task_accuracy[id2questiontype[6]],
        each_task_accuracy[id2questiontype[7]],
        each_task_accuracy[id2questiontype[8]],
        each_task_accuracy[id2questiontype[9]],
    ]
    csv_data.loc[col] = new_data
    with scheduler.lock:
        # DataFrame.to_csv returns None, so do not reassign csv_data to its result.
        csv_data.to_csv(CSV_DIR, index=False)
        save_json(model_name_textbox, userdata)
    return str(average_accuracy_video) + "%", get_result_df()
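# add_new_eval always returns a (message-or-accuracy, refreshed dataframe) pair,
# matching the two outputs wired to submit_button.click below.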


block = gr.Blocks(css=style)  # Blocks-level CSS; gr.Dataframe has no css parameter.


with block:
    gr.Markdown(LEADERBOARD_INTRODUCTION)
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 AutoEval-Video Benchmark", elem_id="AutoEval-Video-tab-table", id=0):
            with gr.Row():
                with gr.Accordion("Citation", open=False):
                    citation_button = gr.Textbox(
                        value=CITATION_BUTTON_TEXT,
                        label=CITATION_BUTTON_LABEL,
                        interactive=False,
                        show_copy_button=True,  # constructor kwarg; Textbox.style() is removed in newer Gradio
                        elem_id="citation-button",
                    )

            gr.Markdown(TABLE_INTRODUCTION)

            # Passing the function itself lets Gradio call it to (re)populate the table.
            data_component = gr.components.Dataframe(
                value=get_result_df,
                headers=COLUMN_NAMES,
                type="pandas",
                datatype=DATA_TITLE_TYPE,
                interactive=False,
                visible=True,
            )
            with gr.Row():
                data_run = gr.Button("Refresh")
                data_run.click(get_result_df, outputs=data_component)

        with gr.TabItem("✨ Submit your model result here!", elem_id="AutoEval-Video-tab-table", id=1):
            with gr.Row():
                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

            with gr.Column():
                model_name_textbox = gr.Textbox(label="Model name")

            with gr.Column():
                # gr.inputs.File is deprecated; gr.File takes the same arguments.
                input_file = gr.File(label="Click to Upload a json File", file_count="single", type="binary")
                submit_button = gr.Button("Evaluate")
                overall_acc = gr.Textbox(label="Overall Acc.")

                submit_button.click(
                    add_new_eval,
                    inputs=[input_file, model_name_textbox],
                    outputs=[overall_acc, data_component],
                )

block.launch()
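# To run locally (assumptions: an HF_TOKEN with write access and the yuan_api key
# set in the environment, plus the tool module and ./file assets in place):
#     HF_TOKEN=... yuan_api=... python app.py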