"""Online Mind2Web leaderboard app.

Loads auto- and human-evaluation results from two local CSV files and
renders them as a Gradio leaderboard with a manual "Refresh" button.
Launched at module level (Hugging Face Space entry point).
"""

# --- stdlib ---
import datetime
import json
import os
from email.utils import parseaddr

# --- third-party ---
import gradio as gr
import numpy as np
import pandas as pd
import requests
from apscheduler.schedulers.background import BackgroundScheduler
from datasets import load_dataset, VerificationMode
from huggingface_hub import HfApi

# --- local ---
from scorer import question_scorer
from content import (
    format_error,
    format_warning,
    format_log,
    TITLE,
    DATA_DATASET,
    INTRODUCTION_TEXT,
    SUBMISSION_TEXT,
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    model_hyperlink,
    SUBMIT_INTRODUCTION,
)

TOKEN = os.environ.get("TOKEN", None)
OWNER = "Online Mind2Web"
# api = HfApi()
YEAR_VERSION = "2024"
LOCAL_DEBUG = True

# Result CSVs live next to the app. Defined once so the module-level load
# and refresh() are guaranteed to read the same files.
AUTO_RESULTS_PATH = "./auto_Mind2Web-Online - Leaderboard_data.csv"
HUMAN_RESULTS_PATH = "./human_Mind2Web-Online - Leaderboard_data.csv"

# One Gradio datatype / column width per leaderboard column (9 columns).
TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]
COLUMN_WIDTHS = ["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]


def get_dataframe_from_results(eval_path):
    """Load a leaderboard CSV, sort by 'Average SR' descending, format scores.

    Args:
        eval_path: Path to a results CSV containing at least the columns
            'Easy', 'Medium', 'Hard' and 'Average SR'.

    Returns:
        A pandas DataFrame sorted by 'Average SR' (best first) with the four
        score columns rendered as one-decimal strings for display.
    """
    df = pd.read_csv(eval_path)
    # Sort BEFORE formatting: '{:.1f}'.format turns the column into strings,
    # which would otherwise sort lexicographically instead of numerically.
    df = df.sort_values(by=["Average SR"], ascending=False)
    for score_column in ("Easy", "Medium", "Hard", "Average SR"):
        df[score_column] = df[score_column].map("{:.1f}".format)
    return df


def refresh():
    """Re-read both result CSVs.

    Returns:
        (auto_df, human_df) — must match the order of the `outputs` list
        wired to the Refresh button below.
    """
    auto_df = get_dataframe_from_results(AUTO_RESULTS_PATH)
    human_df = get_dataframe_from_results(HUMAN_RESULTS_PATH)
    return auto_df, human_df


def upload_file(files):
    """Return local paths of uploaded files (helper for the submission flow)."""
    return [file.name for file in files]


# Initial table contents, loaded once at startup.
auto_eval_dataframe_test = get_dataframe_from_results(AUTO_RESULTS_PATH)
human_eval_dataframe_test = get_dataframe_from_results(HUMAN_RESULTS_PATH)

demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                lines=10,
            )

    with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
        human_leaderboard_table_test = gr.components.Dataframe(
            value=human_eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            column_widths=COLUMN_WIDTHS,
        )

    with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
        auto_leaderboard_table_test = gr.components.Dataframe(
            value=auto_eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            column_widths=COLUMN_WIDTHS,
        )

    with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
        with gr.Row():
            gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

    refresh_button = gr.Button("Refresh")
    # outputs order intentionally mirrors refresh()'s (auto, human) return.
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            auto_leaderboard_table_test,
            human_leaderboard_table_test,
        ],
    )

    # NOTE(review): the original file carried a large commented-out section
    # here — dataset figures (distribution/difficulty/popularity images) and a
    # "Submit a new agent for evaluation" form (agent name, model family,
    # organization, contact email, file upload, login + submit wiring). It was
    # dead code referencing undefined names (e.g. level_of_test) and has been
    # removed; recover it from version control if the submission flow is
    # resurrected.

# Scheduler is started but currently has no jobs; the periodic space restart
# is disabled.
scheduler = BackgroundScheduler()
# scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()

demo.launch(debug=True)