Create app.py

app.py
ADDED
@@ -0,0 +1,228 @@
MODEL_INFO = ["Model"]
AVGACC = "Overall Acc."
TASK_INFO = [AVGACC, "Dynamic Perception", "State Transitions Perception", "Camera Movement Perception", "Explanatory Reasoning", "Counterfactual Reasoning", "Predictive Reasoning", "Comparison Reasoning", "Reasoning with External Knowledge", "Description"]

DATA_TITLE_TYPE = ["markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
CSV_DIR = "./file/result.csv"

COLUMN_NAMES = MODEL_INFO + TASK_INFO
GT_PATH = "./file/AUTO-EVAL-VIDEO.json"
JSON_DATASET_PATH = "./file/userdata.json"
LEADERBOARD_INTRODUCTION = """# AutoEval-Video Leaderboard

Welcome to the leaderboard of AutoEval-Video!
AutoEval-Video comprises 327 complex open-ended video-question instances that span nine skill dimensions, covering video-specific perception, comprehension, and generation skills. Please refer to our [paper]() for more details.
"""

SUBMIT_INTRODUCTION = """# Submit Introduction
For example, if you want to upload GPT-4V's results to the leaderboard, you need to:
1. Fill in 'GPT-4V' in 'Model Name' if this is your first submission. If you wish to update your model's results, add a version suffix to the model name, e.g. 'GPT-4V_v2'.
2. Upload results.json.
3. Click the 'Evaluate' button.
4. Click 'Refresh' to see the updated leaderboard.
5. The overall accuracy of your model will appear in the "Overall Acc." box. For results on each evaluation dimension, refer back to the leaderboard.
"""
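# Illustrative note (not part of the original commit): prediction_analyse() below
# expects each line of the uploaded results.json to be a standalone JSON object, e.g.
#     {"ID": 1, "prediction": "The man pours water into the cup.", "judge": "1"}
# The "judge" field ("0" or "1") is optional; when absent, alternate_judge() is called.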

TABLE_INTRODUCTION = """The table below shows the performance of various models across the evaluation dimensions of AutoEval-Video.
We use accuracy (%) as the primary evaluation metric for each dimension.
"""

CITATION_BUTTON_LABEL = "If you find AutoEval-Video useful for your research and applications, please copy the following snippet to cite these results:"
CITATION_BUTTON_TEXT = """"""
# Raw CSS passed to gr.Blocks(css=...) below; <style> tags are not needed there.
style = """
.dataframe-container {
    overflow-x: auto;
}
"""
import gradio as gr
import pandas as pd
import json
import os
from huggingface_hub import CommitScheduler, login
from tool import *
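# `tool` is not included in this commit; count_lines() and alternate_judge() used
# below are assumed to come from it. A hypothetical minimal count_lines:
#     def count_lines(path):
#         with open(path) as f:
#             return sum(1 for _ in f)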

login(token=os.environ.get("HF_TOKEN"), write_permission=True)


def get_result_df():
    df = pd.read_csv(CSV_DIR)[COLUMN_NAMES]
    df = df.sort_values(by=AVGACC, ascending=False)
    return df


def prediction_analyse(prediction_content, questiontype_list):
    predictions = prediction_content.split("\n")

    # Load the ground-truth annotations (one JSON object per line).
    ground_truth_data = []
    with open(GT_PATH, "r") as f:
        for line in f:
            ground_truth_data.append(json.loads(line.strip()))

    id2item = {str(item["ID"]): item for item in ground_truth_data}

    results = {i: {"correct": 0, "total": 0} for i in questiontype_list}

    for prediction in predictions:
        prediction = prediction.strip()
        if not prediction:
            continue
        try:
            prediction = json.loads(prediction)
        except json.JSONDecodeError:
            print(f"Warning: Skipping invalid JSON data in line: {prediction}")
            continue
        question_id = str(prediction["ID"])
        item_gt = id2item[question_id]
        rule = item_gt["Rule"]
        question_type = item_gt["Dimension"]

        pre_output = prediction["prediction"]
        if "judge" in prediction:
            judge_result_bit = prediction["judge"]
        else:
            # Fall back to the LLM judge when the submission carries no verdict.
            _, judge_result_bit = alternate_judge(rule, pre_output, os.environ.get("yuan_api"))
        assert judge_result_bit in ["0", "1"], "Invalid judge result bit!"
        if judge_result_bit == "1":
            results[question_type]["correct"] += 1

        results[question_type]["total"] += 1

    return results
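# For example, if two "Description" lines are submitted and one of them is judged
# correct, results["Description"] ends up as {"correct": 1, "total": 2}.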


scheduler = CommitScheduler(
    repo_id="AUTOEVAL-Video-Backup",
    private=True,
    repo_type="dataset",
    folder_path="./file",
    path_in_repo="data",
    every=1,
)
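# CommitScheduler pushes the contents of ./file to the private dataset repo about
# once a minute (`every` is in minutes); the writes below hold scheduler.lock so a
# scheduled commit never snapshots a half-written file.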

def save_json(modelname, user_dict_list):
    with open(JSON_DATASET_PATH, "a") as f:
        json.dump({modelname: user_dict_list}, f)
        f.write("\n")
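# Each call appends one record keyed by model name, holding the raw submitted
# lines, e.g. (illustrative) {"GPT-4V": ["{\"ID\": 1, \"prediction\": ...}", ...]}.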

def add_new_eval(
    input_file,
    model_name_textbox: str,
):
    if len(model_name_textbox) == 0:
        return "Error! Empty model name!", get_result_df()

    if input_file is None:
        return "Error! Empty file!", get_result_df()

    csv_data = pd.read_csv(CSV_DIR, dtype={"Model": str})
    model_name_list = list(csv_data["Model"])
    if model_name_textbox in model_name_list:
        return "A model with this name already exists on the leaderboard; duplicate submissions are not allowed.", get_result_df()

    questiontype = ["Dynamic Perception", "State Transitions Perception", "Camera Movement Perception", "Explanatory Reasoning", "Counterfactual Reasoning", "Predictive Reasoning", "Comparison Reasoning", "Reasoning with External Knowledge", "Description"]
    id2questiontype = dict(zip(range(1, 10), questiontype))
    content = input_file.decode("utf-8").strip()
    userdata = content.split("\n")
    if len(userdata) != count_lines(GT_PATH):
        return f"Error! The number of lines in the submitted file ({len(userdata)}) does not match the number of lines in AUTO-EVAL-VIDEO.json ({count_lines(GT_PATH)}).", get_result_df()

    prediction = prediction_analyse(content, questiontype)

    each_task_accuracy = {i: round(prediction[i]["correct"] / prediction[i]["total"] * 100, 1) for i in questiontype}

    total_correct_video = sum(prediction[i]["correct"] for i in questiontype)
    total_video = sum(prediction[i]["total"] for i in questiontype)

    average_accuracy_video = round(total_correct_video / total_video * 100, 1)
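    # Illustrative arithmetic: 218 correct out of the 327 instances gives
    # round(218 / 327 * 100, 1) == 66.7.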

    col = csv_data.shape[0]
    new_data = [
        model_name_textbox,
        average_accuracy_video,
        each_task_accuracy[id2questiontype[1]],
        each_task_accuracy[id2questiontype[2]],
        each_task_accuracy[id2questiontype[3]],
        each_task_accuracy[id2questiontype[4]],
        each_task_accuracy[id2questiontype[5]],
        each_task_accuracy[id2questiontype[6]],
        each_task_accuracy[id2questiontype[7]],
        each_task_accuracy[id2questiontype[8]],
        each_task_accuracy[id2questiontype[9]],
    ]
    csv_data.loc[col] = new_data
    with scheduler.lock:
        # DataFrame.to_csv returns None, so do not reassign csv_data to its result.
        csv_data.to_csv(CSV_DIR, index=False)
        save_json(model_name_textbox, userdata)
    return str(average_accuracy_video) + "%", get_result_df()
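# add_new_eval always returns a (message-or-accuracy, refreshed dataframe) pair,
# matching the two outputs wired to submit_button.click below.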


block = gr.Blocks(css=style)  # Blocks-level CSS; gr.Dataframe has no css parameter.


with block:
    gr.Markdown(LEADERBOARD_INTRODUCTION)
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 AutoEval-Video Benchmark", elem_id="AutoEval-Video-tab-table", id=0):
            with gr.Row():
                with gr.Accordion("Citation", open=False):
                    citation_button = gr.Textbox(
                        value=CITATION_BUTTON_TEXT,
                        label=CITATION_BUTTON_LABEL,
                        interactive=False,
                        show_copy_button=True,  # constructor kwarg; Textbox.style() is removed in newer Gradio
                        elem_id="citation-button",
                    )

            gr.Markdown(TABLE_INTRODUCTION)

            # Passing the function itself lets Gradio call it to (re)populate the table.
            data_component = gr.components.Dataframe(
                value=get_result_df,
                headers=COLUMN_NAMES,
                type="pandas",
                datatype=DATA_TITLE_TYPE,
                interactive=False,
                visible=True,
            )
            with gr.Row():
                data_run = gr.Button("Refresh")
                data_run.click(get_result_df, outputs=data_component)

        with gr.TabItem("✨ Submit your model result here!", elem_id="AutoEval-Video-tab-table", id=1):
            with gr.Row():
                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

            with gr.Column():
                model_name_textbox = gr.Textbox(label="Model name")

            with gr.Column():
                # gr.inputs.File is deprecated; gr.File takes the same arguments.
                input_file = gr.File(label="Click to Upload a json File", file_count="single", type="binary")
                submit_button = gr.Button("Evaluate")
                overall_acc = gr.Textbox(label="Overall Acc.")

                submit_button.click(
                    add_new_eval,
                    inputs=[input_file, model_name_textbox],
                    outputs=[overall_acc, data_component],
                )

block.launch()
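# To run locally (assumptions: an HF_TOKEN with write access and the yuan_api key
# set in the environment, plus the tool module and ./file assets in place):
#     HF_TOKEN=... yuan_api=... python app.py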