Upload folder using huggingface_hub
Files changed:
- .DS_Store +0 -0
- app.py +23 -18
- auto_Mind2Web-Online - Leaderboard_data.csv +6 -0
- content.py +24 -15
- human_Mind2Web-Online - Leaderboard_data.csv +6 -0
.DS_Store ADDED
Binary file (6.15 kB)
app.py CHANGED
@@ -26,16 +26,18 @@ YEAR_VERSION = "2024"
 LOCAL_DEBUG = True
 
 # Display the results
-def get_dataframe_from_results(
-    df = pd.
+def get_dataframe_from_results(eval_path):
+    df = pd.read_csv(eval_path)
     df = df.sort_values(by=["Average SR"], ascending=False)
-
-
-    # df[
+    for format_column in ['Easy', 'Medium', 'Hard', 'Average SR']:
+        df[format_column] = df[format_column].map('{:.1f}'.format)
+    # df["Average SR"] = df["Average SR"].map('{:.1f}'.format)
     return df
 
-
-
+# auto_df = pd.read_csv("./auto_Mind2Web-Online - Leaderboard_data.csv")
+# human_df = pd.read_csv("./human_Mind2Web-Online - Leaderboard_data.csv")
+auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
+human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
 
 
 # def restart_space():
@@ -44,8 +46,9 @@ eval_dataframe_test = get_dataframe_from_results(eval_results=df)
 TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]
 
 def refresh():
-
-
+    auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
+    human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
+    return auto_eval_dataframe_test, human_eval_dataframe_test
 
 def upload_file(files):
     file_paths = [file.name for file in files]
@@ -66,17 +69,18 @@ with demo:
             lines=10,
         ) #.style(show_copy_button=True)
 
-        with gr.Tab("
-
-            value=
+        with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
+            human_leaderboard_table_test = gr.components.Dataframe(
+                value=human_eval_dataframe_test, datatype=TYPES, interactive=False,
+                column_widths=["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]
+            )
+        with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
+            auto_leaderboard_table_test = gr.components.Dataframe(
+                value=auto_eval_dataframe_test, datatype=TYPES, interactive=False,
                 column_widths=["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]
             )
-        # with gr.Tab("Online Mind2Web"):
-        #     with gr.Tab("π About"):
-        #         with gr.Row():
-        #             gr.Image(value="./figure/Difficulty.png", label="Number of tasks by difficulty level", show_label=True, scale=0.4)
 
-        with gr.Tab("
+        with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
             with gr.Row():
                 gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
 
@@ -85,7 +89,8 @@ with demo:
         refresh,
         inputs=[],
        outputs=[
-
+            auto_leaderboard_table_test,
+            human_leaderboard_table_test,
        ],
    )
    # gr.Markdown(DATA_DATASET, elem_classes="markdown-text")
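Read together, the new app.py follows a simple pattern: one loader reads and formats each CSV, and one callback reloads both tables. Below is a minimal runnable sketch of that pattern, assuming `gradio` and `pandas` are installed; the `gr.Blocks` scaffolding and the refresh button are illustrative, since the commit only shows the changed hunks.

```python
# Sketch of the two-tab leaderboard pattern this commit implements.
# CSV paths match the commit; the layout and button are illustrative.
import gradio as gr
import pandas as pd

TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]

def get_dataframe_from_results(eval_path):
    # Load a leaderboard CSV, sort by overall success rate, and render
    # the score columns with one decimal place.
    df = pd.read_csv(eval_path)
    df = df.sort_values(by=["Average SR"], ascending=False)
    for col in ["Easy", "Medium", "Hard", "Average SR"]:
        df[col] = df[col].map("{:.1f}".format)
    return df

def refresh():
    # Re-read both CSVs so newly appended rows show up without a restart.
    auto_df = get_dataframe_from_results("./auto_Mind2Web-Online - Leaderboard_data.csv")
    human_df = get_dataframe_from_results("./human_Mind2Web-Online - Leaderboard_data.csv")
    return auto_df, human_df

auto_df, human_df = refresh()
with gr.Blocks() as demo:
    with gr.Tab("Human Evaluation"):
        human_table = gr.Dataframe(value=human_df, datatype=TYPES, interactive=False)
    with gr.Tab("Auto Evaluation"):
        auto_table = gr.Dataframe(value=auto_df, datatype=TYPES, interactive=False)
    refresh_button = gr.Button("Refresh")
    # Same wiring as the commit: one callback feeds both tables.
    refresh_button.click(refresh, inputs=[], outputs=[auto_table, human_table])

if __name__ == "__main__":
    demo.launch()
```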
auto_Mind2Web-Online - Leaderboard_data.csv ADDED
@@ -0,0 +1,6 @@
+Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
+Operator,Unknown,OpenAI,OSU NLP,80.3,73.4,59,71.8,2025-3-22
+SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,65.1,36.1,18.5,39.8,2025-3-22
+Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,58.6,37.5,24.3,40.1,2025-3-22
+Claude Computer Use,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,61.9,28.1,21.2,35.8,2025-3-22
+Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,57.4,31.9,14.4,34.7,2025-3-22
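Note that the rows are not pre-sorted: Browser Use (40.1) appears after SeeAct (39.8). That is fine because `get_dataframe_from_results` sorts by `Average SR` on load. A quick check with `pandas`:

```python
import pandas as pd

# Row order in the CSV is irrelevant: the app sorts by "Average SR" on load.
df = pd.read_csv("./auto_Mind2Web-Online - Leaderboard_data.csv")
print(df.sort_values("Average SR", ascending=False)[["Agent", "Average SR"]])
# Resulting order: Operator (71.8), Browser Use (40.1), SeeAct (39.8),
# Claude Computer Use (35.8), Agent-E (34.7)
```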
content.py CHANGED
@@ -1,18 +1,19 @@
 TITLE = """<h1 align="center" id="space-title">π Online Mind2Web Leaderboard</h1>"""
 
 INTRODUCTION_TEXT = """
-Online Mind2Web is a benchmark designed to evaluate real-world performance of
+Online Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites.
 
 [[Blog]]() [[Paper]]() [[Code]]() [[Data]]()
 
 ## Tasks
-Online Mind2Web includes 300 tasks from 136 popular websites across various domains. It covers a diverse set of user tasks, to evaluate
-Tasks are categorized into three levels of difficulty based on the number of steps required by human annotators: fewer than 5 steps are labeled as easy, 6 to 10 steps as medium, and 11 or more steps as hard.
+Online Mind2Web includes 300 tasks from 136 popular websites across various domains. It covers a diverse set of user tasks to evaluate agents' performance in real-world environments.
 
-
+Tasks are categorized into three difficulty levels based on the number of steps human annotators need:
+- Easy: 1-5
+- Medium: 6-10
+- Hard: 11+
 
 ## Leaderboard
-Submission made by our team are labelled "OSUNLP".
 """
 
 SUBMISSION_TEXT = """
@@ -27,28 +28,36 @@ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
 Online Mind2Web"""
 
-SUBMIT_INTRODUCTION = """
-## Please
-
+SUBMIT_INTRODUCTION = """
+## Please submit your trajectories in the following format:
+Each task is stored in a folder named after its `task_id`, containing:
+
+- `trajectory/`: stores screenshots of each step.
+- `result.json`: task metadata and action history.
+
+**Structure:**
 ```
-
-└── task_id
+main_directory/
+└── task_id/
     ├── result.json
     └── trajectory/
         ├── 0_screenshot.png
         ├── 1_screenshot.png
-
-    ...
+        └── ...
 ```
-
+
+**`result.json` format:**
 ```json
 {
     "task_id": 123,
     "task": "abc",
-    "action_history": ["abc", "xyz", ...]
+    "action_history": ["abc", "xyz", "..."]
 }
 ```
-Please send
+Please email your agent's name, model family, and organization to [email protected], with the trajectory directory attached.
+
+We will run the auto-evaluation. If you have conducted your own human evaluation, please also attach your human eval results; we will spot-check these before adding them to the human-eval table.
+
 """
 DATA_DATASET = """## More Statistics for Online Mind2Web Benchmark
 """
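The submission layout documented in SUBMIT_INTRODUCTION is machine-checkable, so a pre-flight check may save a round trip with the maintainers. The following is a hedged sketch of such a validator; it is not part of the commit, and `validate_submission` is a hypothetical helper name.

```python
# Hypothetical pre-submission check against the documented layout:
# main_directory/<task_id>/{result.json, trajectory/*_screenshot.png}.
import json
from pathlib import Path

def validate_submission(main_directory: str) -> list[str]:
    """Return a list of problems found; an empty list means the layout looks right."""
    problems = []
    for task_dir in sorted(Path(main_directory).iterdir()):
        if not task_dir.is_dir():
            continue  # ignore stray files at the top level
        result_path = task_dir / "result.json"
        if not result_path.is_file():
            problems.append(f"{task_dir.name}: missing result.json")
        else:
            try:
                result = json.loads(result_path.read_text())
            except json.JSONDecodeError:
                problems.append(f"{task_dir.name}: result.json is not valid JSON")
            else:
                # Keys documented in the result.json example above.
                for key in ("task_id", "task", "action_history"):
                    if key not in result:
                        problems.append(f"{task_dir.name}: result.json lacks '{key}'")
        trajectory = task_dir / "trajectory"
        if not trajectory.is_dir():
            problems.append(f"{task_dir.name}: missing trajectory/ folder")
        elif not list(trajectory.glob("*_screenshot.png")):
            problems.append(f"{task_dir.name}: no *_screenshot.png files")
    return problems

if __name__ == "__main__":
    print(validate_submission("./main_directory"))
```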
human_Mind2Web-Online - Leaderboard_data.csv ADDED
@@ -0,0 +1,6 @@
+Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
+Operator,Unknown,OpenAI,OSU NLP,83.1,58.0,43.2,61.3,2025-3-22
+SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,60.2,25.2,8.1,30.7,2025-3-22
+Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,55.4,26.6,8.1,30.0,2025-3-22
+Claude Computer Use,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,56.6,20.3,14.9,29.0,2025-3-22
+Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,49.4,26.6,6.8,28.0,2025-3-22
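Since both CSVs share the same schema and agent names, the auto and human success rates can be joined directly, for example to see how far auto-evaluation drifts from human judgment. A sketch, with paths as in this commit; the `SR gap` column is purely illustrative:

```python
import pandas as pd

# Join the two leaderboards on agent name to compare auto vs. human SR.
auto = pd.read_csv("./auto_Mind2Web-Online - Leaderboard_data.csv")
human = pd.read_csv("./human_Mind2Web-Online - Leaderboard_data.csv")
merged = auto.merge(human, on="Agent", suffixes=("_auto", "_human"))
merged["SR gap"] = merged["Average SR_auto"] - merged["Average SR_human"]
print(merged[["Agent", "Average SR_auto", "Average SR_human", "SR gap"]]
      .sort_values("Average SR_human", ascending=False))
```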