MODEL_INFO = ["Model"] | |
AVGACC = "Overall Acc." | |
TASK_INFO = [AVGACC, "Dynamic Perception","State Transitions Perception","Comparison Reasoning","Reasoning with External Knowledge","Explanatory Reasoning","Predictive Reasoning","Description","Counterfactual Reasoning","Camera Movement Perception"] | |
DATA_TITILE_TYPE = ["markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"] | |
CSV_DIR = "./file/result.csv" | |
COLUMN_NAMES = MODEL_INFO + TASK_INFO | |
GT_PATH = "./file/AUTO-EVAL-VIDEO.json" | |
JSON_DATASET_PATH = "./file/userdata.json" | |

LEADERBOARD_INTRODUCTION = """# AutoEval-Video Leaderboard

Welcome to the leaderboard of AutoEval-Video!

AutoEval-Video comprises 327 complex open-ended video-question instances spanning nine skill dimensions, which address video-specific perception, comprehension, and generation skills. Please refer to our [paper](https://arxiv.org/abs/2311.14906) for more details.
"""

SUBMIT_INTRODUCTION = """# Submit Introduction

1. Format your model output as a JSON file, following the example provided in our [GitHub repository](https://github.com/Xiuyuan-Chen/AutoEval-Video/blob/main/prediction_sample.json).
2. Assign a unique "model name" to your results. If you are resubmitting updated results for the same model, append a version suffix to the name, e.g. "GPT-4V_v2".
3. Include the link to your model's repository with each submission.
4. After clicking "Evaluate", allow approximately one hour for your model's results to be processed. To view the most recent results in the leaderboard, click "Refresh".
"""

TABLE_INTRODUCTION = """The table below shows the performance of various models on the different evaluation dimensions of AutoEval-Video.
We use accuracy (%) as the primary evaluation metric for each dimension.
"""

CITATION_BUTTON_LABEL = "If you find AutoEval-Video useful for your research and applications, please copy the following snippet to cite these results:"
CITATION_BUTTON_TEXT = """@article{chen2023autoevalvideo,
    title={AutoEval-Video: An Automatic Benchmark for Assessing Large Vision Language Models in Open-Ended Video Question Answering},
    author={Xiuyuan Chen and Yuan Lin and Yuchen Zhang and Weiran Huang},
    year={2023},
    journal={arXiv preprint arXiv:2311.14906}
}"""
style = """<style> | |
.dataframe-container { | |
overflow-x: auto; | |
} | |
</style>""" | |

import json
import os

import gradio as gr
import pandas as pd
from huggingface_hub import CommitScheduler, login
from tqdm import tqdm

from tool import *  # used here for alternate_judge() and count_lines()

# Log in to the Hugging Face Hub so the CommitScheduler below can push to the backup repo.
login(token=os.environ.get("HF_TOKEN"), write_permission=True)


def get_result_df():
    """Load the leaderboard CSV and return it sorted by overall accuracy (descending)."""
    df = pd.read_csv(CSV_DIR)[COLUMN_NAMES]
    df = df.sort_values(by=AVGACC, ascending=False)
    return df
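
# The leaderboard CSV is expected to have one column per entry of COLUMN_NAMES, with the
# "Model" column holding a markdown link such as "[SomeModel](https://example.com)"; that
# is the format written by add_new_eval() below and parsed when checking for duplicate
# names. The model name and link target here are purely illustrative.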


def check_json(prediction_content):
    """Return True if every non-empty line of the uploaded content is valid JSON."""
    for prediction in prediction_content.split("\n"):
        if not prediction.strip():
            continue
        try:
            json.loads(prediction)
        except json.JSONDecodeError:
            print(f"Warning: Skipping invalid JSON data in line: {prediction}")
            return False
    return True


def prediction_analyse(prediction_content, questiontype_list):
    """Judge each prediction line and tally correct/total counts per skill dimension."""
    predictions = prediction_content.split("\n")

    # Load the ground-truth annotations (one JSON object per line).
    ground_truth_data = []
    with open(GT_PATH, "r") as f:
        for line in f:
            ground_truth_data.append(json.loads(line.strip()))
    id2item = {str(item["ID"]): item for item in ground_truth_data}

    results = {i: {"correct": 0, "total": 0} for i in questiontype_list}

    for prediction in tqdm(predictions):
        prediction = prediction.strip()
        if not prediction:
            continue
        try:
            prediction = json.loads(prediction)
        except json.JSONDecodeError:
            print(f"Warning: Skipping invalid JSON data in line: {prediction}")
            continue

        question_id = str(prediction["ID"])
        print("Evaluating ID: " + question_id)
        item_gt = id2item[question_id]
        rule = item_gt["Rule"]
        question_type = item_gt["Dimension"]
        pre_output = prediction["prediction"]

        # Use a precomputed judge bit if the submission provides one; otherwise call the judge.
        if "judge" in prediction:
            judge_result_bit = prediction["judge"]
        else:
            _, judge_result_bit = alternate_judge(rule, pre_output, os.environ.get("yuan_api"))
        assert judge_result_bit in ["0", "1"], "Invalid judge result bit!"

        if judge_result_bit == "1":
            results[question_type]["correct"] += 1
        results[question_type]["total"] += 1

    return results
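
# For illustration only (the counts are made up), the dict returned above has the shape
#   {"Dynamic Perception": {"correct": 12, "total": 30}, "Description": {"correct": 7, "total": 21}, ...}
# add_new_eval() below converts each entry into a percentage, e.g. 12 / 30 * 100 = 40.0,
# and derives "Overall Acc." from the summed correct/total counts across all nine dimensions.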


scheduler = CommitScheduler(
    repo_id="AUTOEVAL-Video-Backup",
    private=True,
    repo_type="dataset",
    folder_path="./file",
    path_in_repo="data",
    every=5,
)
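
# Note: the CommitScheduler above pushes the contents of folder_path to the private dataset
# repo on a background thread roughly every `every` minutes (5 here). Code that mutates files
# under ./file should hold scheduler.lock while writing, as add_new_eval() does below, so a
# scheduled commit never captures a half-written CSV or JSON file.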


def save_json(modelname, user_dict_list):
    """Append one model's raw submission lines to the userdata JSONL file."""
    with open(JSON_DATASET_PATH, "a") as f:
        json.dump({modelname: user_dict_list}, f)
        f.write("\n")


def add_new_eval(
    input_file,
    model_name_textbox: str,
    model_link: str,
):
    if len(model_name_textbox) == 0:
        return "Error! Empty model name!", get_result_df()
    if len(model_link) == 0:
        return "Error! Empty model link!", get_result_df()
    if input_file is None:
        return "Error! Empty file!", get_result_df()

    # Reject duplicate model names (the Model column stores markdown links "[name](url)").
    csv_data = pd.read_csv(CSV_DIR, dtype={"Model": str})
    model_name_list = [name.split("]")[0][1:] for name in csv_data["Model"]]
    if model_name_textbox in model_name_list:
        return "A model with the same name already exists in the leaderboard; duplicate submissions are not allowed.", get_result_df()

    questiontype = COLUMN_NAMES[-9:]
    id2questiontype = dict(zip(range(1, 10), questiontype))

    content = input_file.decode("utf-8").strip()
    userdata = content.split("\n")
    if len(userdata) != count_lines(GT_PATH):
        return f"Error! The number of lines in the submitted file ({len(userdata)}) does not match the number of lines in the AUTO-EVAL-VIDEO.json file ({count_lines(GT_PATH)}).", get_result_df()
    if not check_json(content):
        return "JSON DECODE ERROR!", get_result_df()

    # Judge every prediction and compute per-dimension and overall accuracies (%).
    prediction = prediction_analyse(content, questiontype)
    each_task_accuracy = {
        i: round(prediction[i]["correct"] / max(1, prediction[i]["total"]) * 100, 1)
        for i in questiontype
    }
    total_correct_video = sum(prediction[i]["correct"] for i in questiontype)
    total_video = sum(prediction[i]["total"] for i in questiontype)
    average_accuracy_video = round(total_correct_video / max(1, total_video) * 100, 1)

    # Append the new leaderboard row; its order must match COLUMN_NAMES.
    new_row_index = csv_data.shape[0]
    new_data = [
        "[" + model_name_textbox + "](" + model_link + ")",
        average_accuracy_video,
        each_task_accuracy[id2questiontype[1]],
        each_task_accuracy[id2questiontype[2]],
        each_task_accuracy[id2questiontype[3]],
        each_task_accuracy[id2questiontype[4]],
        each_task_accuracy[id2questiontype[5]],
        each_task_accuracy[id2questiontype[6]],
        each_task_accuracy[id2questiontype[7]],
        each_task_accuracy[id2questiontype[8]],
        each_task_accuracy[id2questiontype[9]],
    ]
    csv_data.loc[new_row_index] = new_data

    # Persist under the scheduler lock so a background commit never sees a partial write.
    with scheduler.lock:
        csv_data.to_csv(CSV_DIR, index=False)
        save_json(model_name_textbox, userdata)

    return str(average_accuracy_video) + "%", get_result_df()


block = gr.Blocks()
with block:
    gr.Markdown(LEADERBOARD_INTRODUCTION)

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("AutoEval-Video Benchmark", elem_id="AutoEval-Video-tab-table", id=0):
            with gr.Row():
                with gr.Accordion("Citation", open=False):
                    citation_button = gr.Textbox(
                        value=CITATION_BUTTON_TEXT,
                        label=CITATION_BUTTON_LABEL,
                        interactive=False,
                        elem_id="citation-button",
                        show_copy_button=True,
                    )

            gr.Markdown(TABLE_INTRODUCTION)
            data_component = gr.components.Dataframe(
                value=get_result_df,
                headers=COLUMN_NAMES,
                type="pandas",
                datatype=DATA_TITLE_TYPE,
                interactive=False,
                visible=True,
                # css=style,
            )

            with gr.Row():
                data_run = gr.Button("Refresh")
                data_run.click(get_result_df, outputs=data_component)

        with gr.TabItem("Submit your model result here!", elem_id="AutoEval-Video-tab-table", id=1):
            with gr.Row():
                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                with gr.Column():
                    model_link = gr.Textbox(label="Model Link")
                with gr.Column():
                    input_file = gr.File(label="Click to Upload a json File", file_count="single", type="binary")
                    submit_button = gr.Button("Evaluate")
                    overall_acc = gr.Textbox(label="Overall Acc.")

            submit_button.click(
                add_new_eval,
                inputs=[input_file, model_name_textbox, model_link],
                outputs=[overall_acc, data_component],
            )

block.launch()