File size: 5,312 Bytes
f3c542d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76a84b9
 
f3c542d
76a84b9
 
 
f3c542d
 
76a84b9
 
 
 
f3c542d
 
 
 
 
 
 
 
76a84b9
 
 
f3c542d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76a84b9
 
 
 
 
 
 
 
f3c542d
 
6a39178
76a84b9
6a39178
 
f3c542d
 
 
 
 
 
76a84b9
 
f3c542d
 
6a39178
 
 
 
 
 
 
 
f3c542d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import json
import datetime
import requests
from email.utils import parseaddr

import gradio as gr
import pandas as pd
import numpy as np

from datasets import load_dataset, VerificationMode
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# InfoStrings
from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, DATA_DATASET, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION

# Hugging Face token for Hub operations (restart/upload); may be unset when
# running locally, in which case authenticated Hub calls are unavailable.
TOKEN = os.environ.get("TOKEN", None)

# Owner/namespace label for this leaderboard's assets.
OWNER="Online Mind2Web"
# api = HfApi()

# Leaderboard edition year (string, used for display/versioning).
YEAR_VERSION = "2024"

# NOTE(review): flag is defined but not read anywhere in this file — presumably
# toggles local-vs-Space behavior elsewhere; confirm before removing.
LOCAL_DEBUG = True

# Display the results
def get_dataframe_from_results(eval_path, format_columns=('Easy', 'Medium', 'Hard', 'Average SR')):
    """Load a leaderboard CSV and return it sorted by overall success rate.

    Args:
        eval_path: Path to the leaderboard CSV file.
        format_columns: Numeric columns to render with one decimal place.
            Defaults to the three difficulty buckets plus the overall average,
            matching the leaderboard CSV schema.

    Returns:
        pd.DataFrame sorted by "Average SR" descending, with every column in
        ``format_columns`` converted to one-decimal strings for display.
    """
    df = pd.read_csv(eval_path)
    # Sort while "Average SR" is still numeric: the formatting loop below
    # turns it into strings, which would sort lexicographically ("9.0" > "10.0").
    df = df.sort_values(by=["Average SR"], ascending=False)
    for format_column in format_columns:
        df[format_column] = df[format_column].map('{:.1f}'.format)
    return df

# auto_df = pd.read_csv("./auto_Mind2Web-Online - Leaderboard_data.csv")
# human_df = pd.read_csv("./human_Mind2Web-Online - Leaderboard_data.csv")
# Initial leaderboard tables rendered on page load; the Refresh button
# re-reads the same two CSVs via refresh() below.
auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')


# def restart_space():
#     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)

# Per-column datatypes for the gradio Dataframe components (9 entries —
# assumed to match the leaderboard CSV column order; verify against the CSVs).
TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]

def refresh():
    """Re-read both leaderboard CSVs from disk and return fresh tables.

    Returns:
        Tuple ``(auto_df, human_df)`` in the exact order expected by the
        refresh button's ``outputs`` wiring.
    """
    auto_df = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
    human_df = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
    return auto_df, human_df

def upload_file(files):
    """Collect the on-disk paths of a batch of uploaded file objects.

    Args:
        files: Iterable of objects exposing a ``.name`` attribute (as the
            gradio upload widget supplies).

    Returns:
        List of path strings, one per uploaded file, in input order.
    """
    paths = []
    for uploaded in files:
        paths.append(uploaded.name)
    return paths


# Declarative gradio UI: header, citation box, two leaderboard tabs
# (human/auto), submission guideline tab, and a refresh button that re-reads
# the CSVs into both tables.
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        # NOTE(review): the accordion label bytes look mojibake'd (likely a
        # UTF-8 emoji decoded as cp1252) — confirm intended glyph upstream.
        with gr.Accordion("πŸ“™ Citation", open=False):
            # Read-only textbox holding the BibTeX entry for easy copying.
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                lines=10,
            ) #.style(show_copy_button=True)

    # Human-annotated results table; TYPES/column_widths assumed to match the
    # CSV's 9-column layout.
    with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
        human_leaderboard_table_test = gr.components.Dataframe(
            value=human_eval_dataframe_test, datatype=TYPES, interactive=False,
            column_widths=["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]
        )
    # Automatically-evaluated results table (same layout as the human tab).
    with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
        auto_leaderboard_table_test = gr.components.Dataframe(
            value=auto_eval_dataframe_test, datatype=TYPES, interactive=False,
            column_widths=["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]
        )

    # Static markdown explaining how to submit new agent results.
    with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
        with gr.Row():
            gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

    # Refresh re-runs refresh(); output order (auto, human) must match
    # refresh()'s return order.
    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            auto_leaderboard_table_test,
            human_leaderboard_table_test,
        ],
    )
    # Disabled dataset figures and the original in-app submission form, kept
    # for reference.
    # gr.Markdown(DATA_DATASET, elem_classes="markdown-text")
    # with gr.Row():
    #     # gr.Image(value="./figure/distribution_reference_length.png", label="Distribution of reference length", show_label=True, scale=0.4)
    #     gr.Image(value="./figure/Difficulty.png", label="Number of tasks by difficulty level", show_label=True, scale=0.4)
    # with gr.Row():
    #     gr.Image(value="./figure/distribution_website.jpg", label="Distribution of websites.",show_label=True, scale=0.4)
    # with gr.Row():
    #     gr.Image(value="./figure/popularity.jpg", label="Popularity of websites.", show_label=True, scale=0.4)
    # with gr.Accordion("Submit a new agent for evaluation"):
    #     with gr.Row():
    #         gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
    #     with gr.Row():
    #         with gr.Column():
    #             model_name_textbox = gr.Textbox(label="Agent name")
    #             model_family_textbox = gr.Textbox(label="Model family")
    #             organisation = gr.Textbox(label="Organization")
    #             mail = gr.Textbox(label="Contact email (will be stored privately, & used if there is an issue with your submission)")
    #             file_output = gr.File()


    #     with gr.Row():
    #         gr.LoginButton()
    #         submit_button = gr.Button("Submit Eval")
    #     submission_result = gr.Markdown()
        # submit_button.click(
        #     [
        #         level_of_test,
        #         model_name_textbox,
        #         model_family_textbox,
        #         system_prompt_textbox,
        #         url_textbox,
        #         file_output,
        #         organisation,
        #         mail
        #     ],
        #     submission_result,
        # )

# Background scheduler retained for the (currently disabled) hourly Space
# restart job; started anyway so re-enabling is a one-line uncomment.
scheduler = BackgroundScheduler()
# scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
# Blocking call: serves the gradio app (debug=True surfaces errors in the UI).
demo.launch(debug=True)