khhuiyh committed
Commit a80de29 · 1 Parent(s): 669c288

Create app.py

Files changed (1)
  1. app.py +228 -0
app.py ADDED
@@ -0,0 +1,228 @@
+ import json
+ import os
+ import random
+ import time
+
+ import gradio as gr
+ import pandas as pd
+ from huggingface_hub import CommitScheduler, login
+ from openai import OpenAI
+ from tool import *  # provides alternate_judge() and count_lines()
+
+ MODEL_INFO = ["Model"]
+ AVGACC = "Overall Acc."
+ TASK_INFO = [AVGACC, "Dynamic Perception", "State Transitions Perception", "Camera Movement Perception", "Explanatory Reasoning", "Counterfactual Reasoning", "Predictive Reasoning", "Comparison Reasoning", "Reasoning with External Knowledge", "Description"]
+
+ DATA_TITLE_TYPE = ["markdown"] + ["number"] * 10
+ CSV_DIR = "./file/result.csv"
+
+ COLUMN_NAMES = MODEL_INFO + TASK_INFO
+ GT_PATH = "./file/AUTO-EVAL-VIDEO.json"
+ JSON_DATASET_PATH = "./file/userdata.json"
+ LEADERBOARD_INTRODUCTION = """# AutoEval-Video Leaderboard
+
+ Welcome to the leaderboard of AutoEval-Video!
+ AutoEval-Video comprises 327 complex open-ended video-question instances spanning nine skill dimensions that address video-specific perception, comprehension, and generation skills. Please refer to our [paper]() for more details.
+ """
+
+ SUBMIT_INTRODUCTION = """# Submit Introduction
+ For example, if you want to upload GPT-4V's results to the leaderboard, you need to:
+ 1. Fill in 'GPT-4V' in 'Model Name' if this is your first submission. If you wish to revise your model's results instead, add a version suffix to the model name, e.g. 'GPT-4V_v2'.
+ 2. Upload results.json.
+ 3. Click the 'Evaluate' button.
+ 4. Click 'Refresh' to see the updated leaderboard.
+ 5. Your model's overall score appears in the "Overall Acc." box. For results on each evaluation dimension, refer back to the leaderboard.
+ """
+
+ TABLE_INTRODUCTION = """The table below shows the performance of various models on the evaluation dimensions of AutoEval-Video.
+ We use accuracy (%) as the primary evaluation metric for each dimension.
+ """
+
+ CITATION_BUTTON_LABEL = "If you find AutoEval-Video useful for your research and applications, please copy the following snippet to cite these results: "
+ CITATION_BUTTON_TEXT = """"""
+ # Plain CSS rules; passed to gr.Blocks(css=...) below, which expects CSS without <style> tags.
+ style = """
+ .dataframe-container {
+     overflow-x: auto;
+ }
+ """
+
+ login(token=os.environ.get("HF_TOKEN"), write_permission=True)
+
+
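+ # Leaderboard helper: read the results CSV and sort models by overall accuracy,
+ # best first.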
+ def get_result_df():
+     df = pd.read_csv(CSV_DIR)[COLUMN_NAMES]
+     df = df.sort_values(by=AVGACC, ascending=False)
+     return df
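+
+
+ # Score one submission against the ground truth. Submissions are JSON Lines:
+ # one object per instance with an "ID", a "prediction" string, and optionally a
+ # precomputed "judge" bit ("1" = correct, "0" = incorrect). A line might look
+ # like the following (illustrative values only, not taken from the dataset):
+ # {"ID": 1, "prediction": "The man pours water into the cup.", "judge": "1"}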
+ def prediction_analyse(prediction_content, questiontype_list):
+     predictions = prediction_content.split("\n")
+
+     # Ground truth is stored as JSON Lines: one instance per line.
+     ground_truth_data = []
+     with open(GT_PATH, "r") as f:
+         for line in f:
+             data = json.loads(line.strip())
+             ground_truth_data.append(data)
+
+     id2item = {str(item["ID"]): item for item in ground_truth_data}
+
+     results = {i: {"correct": 0, "total": 0} for i in questiontype_list}
+
+     for prediction in predictions:
+         prediction = prediction.strip()
+         if not prediction:
+             continue
+         try:
+             prediction = json.loads(prediction)
+         except json.JSONDecodeError:
+             print(f"Warning: Skipping invalid JSON data in line: {prediction}")
+             continue
+         question_id = str(prediction["ID"])
+         item_gt = id2item[question_id]
+         rule = item_gt["Rule"]
+         question_type = item_gt["Dimension"]
+
+         pre_output = prediction["prediction"]
+         if "judge" in prediction:
+             judge_result_bit = prediction["judge"]
+         else:
+             # No precomputed verdict in the submission: fall back to the
+             # rule-based LLM judge from tool.py.
+             _, judge_result_bit = alternate_judge(rule, pre_output, os.environ.get("yuan_api"))
+         assert judge_result_bit in ["0", "1"], "Invalid judge result bit!"
+         if judge_result_bit == "1":
+             results[question_type]["correct"] += 1
+
+         results[question_type]["total"] += 1
+
+     return results
+
+
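+ # Everything under ./file (leaderboard CSV and raw submissions) is periodically
+ # committed to a private backup dataset repo; every=1 syncs at most once per
+ # minute. scheduler.lock is held while writing below, so a background commit
+ # never captures a half-written file.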
+ scheduler = CommitScheduler(
+     repo_id="AUTOEVAL-Video-Backup",
+     private=True,
+     repo_type="dataset",
+     folder_path="./file",
+     path_in_repo="data",
+     every=1,
+ )
+
+
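+ # Persist the raw submission as one JSON Lines record:
+ # {model name: list of prediction lines}.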
+ def save_json(modelname, user_dict_list):
+     with open(JSON_DATASET_PATH, "a") as f:
+         json.dump({modelname: user_dict_list}, f)
+         f.write("\n")
+
+
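+ # Gradio callback for the 'Evaluate' button: validate the upload, score it with
+ # prediction_analyse, append a row to the leaderboard CSV, and return the
+ # overall accuracy plus the refreshed table.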
+ def add_new_eval(
+     input_file,
+     model_name_textbox: str,
+ ):
+     if len(model_name_textbox) == 0:
+         return "Error! Empty model name!", get_result_df()
+
+     if input_file is None:
+         return "Error! Empty file!", get_result_df()
+
+     csv_data = pd.read_csv(CSV_DIR, dtype={"Model": str})
+     model_name_list = list(csv_data["Model"])
+     if model_name_textbox in model_name_list:
+         return "A model with this name already exists in the leaderboard; duplicate submissions are not allowed.", get_result_df()
+
+     # The nine skill dimensions, i.e. TASK_INFO without "Overall Acc.".
+     questiontype = TASK_INFO[1:]
+     content = input_file.decode("utf-8").strip()
+     userdata = content.split("\n")
+     if len(userdata) != count_lines(GT_PATH):
+         return f"Error! The number of lines in the submitted file ({len(userdata)}) does not match the number of lines in AUTO-EVAL-VIDEO.json ({count_lines(GT_PATH)}).", get_result_df()
+
+     prediction = prediction_analyse(content, questiontype)
+
+     each_task_accuracy = {
+         i: round(prediction[i]["correct"] / prediction[i]["total"] * 100, 1)
+         for i in questiontype
+     }
+     total_correct_video = sum(prediction[i]["correct"] for i in questiontype)
+     total_video = sum(prediction[i]["total"] for i in questiontype)
+     average_accuracy_video = round(total_correct_video / total_video * 100, 1)
+
+     # Append the new entry as the last row of the leaderboard CSV.
+     row = csv_data.shape[0]
+     csv_data.loc[row] = [model_name_textbox, average_accuracy_video] + [
+         each_task_accuracy[t] for t in questiontype
+     ]
+     with scheduler.lock:
+         csv_data.to_csv(CSV_DIR, index=False)
+         save_json(model_name_textbox, userdata)
+     return str(average_accuracy_video) + "%", get_result_df()
+
+
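+ # UI: tab 1 renders the leaderboard table, tab 2 accepts new submissions.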
+ block = gr.Blocks(css=style)
+
+ with block:
+     gr.Markdown(LEADERBOARD_INTRODUCTION)
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem(" 🏆 AutoEval-Video Benchmark", elem_id="AutoEval-Video-tab-table", id=0):
+             with gr.Row():
+                 with gr.Accordion("Citation", open=False):
+                     citation_button = gr.Textbox(
+                         value=CITATION_BUTTON_TEXT,
+                         label=CITATION_BUTTON_LABEL,
+                         interactive=False,
+                         elem_id="citation-button",
+                         show_copy_button=True,
+                     )
+
+             gr.Markdown(TABLE_INTRODUCTION)
+
+             data_component = gr.Dataframe(
+                 value=get_result_df,
+                 headers=COLUMN_NAMES,
+                 type="pandas",
+                 datatype=DATA_TITLE_TYPE,
+                 interactive=False,
+                 visible=True,
+             )
+             with gr.Row():
+                 data_run = gr.Button("Refresh")
+                 data_run.click(get_result_df, outputs=data_component)
+
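+         # Submission tab: model name + results.json upload wired to add_new_eval.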
+         with gr.TabItem("✨ Submit your model result here!", elem_id="AutoEval-Video-tab-table", id=1):
+             with gr.Row():
+                 gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
+
+             with gr.Column():
+                 model_name_textbox = gr.Textbox(label="Model name")
+
+             with gr.Column():
+                 input_file = gr.File(label="Click to Upload a JSON File", file_count="single", type="binary")
+                 submit_button = gr.Button("Evaluate")
+                 overall_acc = gr.Textbox(label="Overall Acc.")
+
+             submit_button.click(
+                 add_new_eval,
+                 inputs=[input_file, model_name_textbox],
+                 outputs=[overall_acc, data_component],
+             )
+
+ block.launch()