Commit 76a84b9 (verified) · WeijianQi1999 committed · Parent: e8785b1

Upload folder using huggingface_hub

.DS_Store ADDED
Binary file (6.15 kB)
 
app.py CHANGED
```diff
@@ -26,16 +26,18 @@ YEAR_VERSION = "2024"
 LOCAL_DEBUG = True
 
 # Display the results
-def get_dataframe_from_results(eval_results):
-    df = pd.DataFrame(eval_results)
+def get_dataframe_from_results(eval_path):
+    df = pd.read_csv(eval_path)
     df = df.sort_values(by=["Average SR"], ascending=False)
-
-    # numeric_cols = [c for c in df.columns if "SR" in c]
-    # df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
+    for format_column in ['Easy', 'Medium', 'Hard', 'Average SR']:
+        df[format_column] = df[format_column].map('{:.1f}'.format)
+    # df["Average SR"] = df["Average SR"].map('{:.1f}'.format)
     return df
 
-df = pd.read_csv("./Mind2Web-Online - Leaderboard_data.csv")
-eval_dataframe_test = get_dataframe_from_results(eval_results=df)
+# auto_df = pd.read_csv("./auto_Mind2Web-Online - Leaderboard_data.csv")
+# human_df = pd.read_csv("./human_Mind2Web-Online - Leaderboard_data.csv")
+auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
+human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
 
 
 # def restart_space():
@@ -44,8 +46,9 @@ eval_dataframe_test = get_dataframe_from_results(eval_results=df)
 TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]
 
 def refresh():
-    eval_dataframe_test = get_dataframe_from_results(eval_results=df)
-    return eval_dataframe_test
+    auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
+    human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
+    return auto_eval_dataframe_test, human_eval_dataframe_test
 
 def upload_file(files):
     file_paths = [file.name for file in files]
@@ -66,17 +69,18 @@ with demo:
         lines=10,
     ) #.style(show_copy_button=True)
 
-    with gr.Tab("Online Mind2Web"):
-        leaderboard_table_test = gr.components.Dataframe(
-            value=eval_dataframe_test, datatype=TYPES, interactive=False,
+    with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
+        human_leaderboard_table_test = gr.components.Dataframe(
+            value=human_eval_dataframe_test, datatype=TYPES, interactive=False,
+            column_widths=["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]
+        )
+    with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
+        auto_leaderboard_table_test = gr.components.Dataframe(
+            value=auto_eval_dataframe_test, datatype=TYPES, interactive=False,
             column_widths=["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]
         )
-    # with gr.Tab("Online Mind2Web"):
-    #     with gr.Tab("📝 About"):
-    #         with gr.Row():
-    #             gr.Image(value="./figure/Difficulty.png", label="Number of tasks by difficulty level", show_label=True, scale=0.4)
 
-    with gr.Tab("🚀 Submit here! ", elem_id="submit-tab", id=3):
+    with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
         with gr.Row():
             gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
 
@@ -85,7 +89,8 @@ with demo:
         refresh,
         inputs=[],
        outputs=[
-            leaderboard_table_test,
+            auto_leaderboard_table_test,
+            human_leaderboard_table_test,
        ],
    )
    # gr.Markdown(DATA_DATASET, elem_classes="markdown-text")
```
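For context, the refactor folds the CSV read into `get_dataframe_from_results` and pre-formats the score columns to one decimal place, which turns them into strings; that is why the numeric sort happens before the `map()` call. Below is a minimal standalone sketch of the same pipeline, with an inline frame standing in for the leaderboard CSVs (the example rows are hypothetical, not leaderboard data):

```python
import pandas as pd

# Hypothetical stand-in for one of the leaderboard CSVs (same columns).
rows = [
    {"Agent": "A", "Model": "m1", "Organization": "o1", "Source": "s",
     "Easy": 65.1, "Medium": 36.1, "Hard": 18.5, "Average SR": 39.8, "Date": "2025-3-22"},
    {"Agent": "B", "Model": "m2", "Organization": "o2", "Source": "s",
     "Easy": 80.3, "Medium": 73.4, "Hard": 59.0, "Average SR": 71.8, "Date": "2025-3-22"},
]

def get_dataframe_from_results(df: pd.DataFrame) -> pd.DataFrame:
    # Sort while the columns are still numeric ...
    df = df.sort_values(by=["Average SR"], ascending=False)
    # ... then format for display: map('{:.1f}'.format) yields strings,
    # so a string sort afterwards would misorder e.g. "9.0" vs "10.0".
    for col in ["Easy", "Medium", "Hard", "Average SR"]:
        df[col] = df[col].map("{:.1f}".format)
    return df

print(get_dataframe_from_results(pd.DataFrame(rows)))
```

This bakes presentation into the data itself; an alternative would be to keep the columns numeric and let the front end handle rounding.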
auto_Mind2Web-Online - Leaderboard_data.csv ADDED
```diff
@@ -0,0 +1,6 @@
+Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
+Operator,Unknown,OpenAI,OSU NLP,80.3,73.4,59,71.8,2025-3-22
+SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,65.1,36.1,18.5,39.8,2025-3-22
+Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,58.6,37.5,24.3,40.1,2025-3-22
+Claude Computer Use,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,61.9,28.1,21.2,35.8,2025-3-22
+Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,57.4,31.9,14.4,34.7,2025-3-22
```
content.py CHANGED
````diff
@@ -1,18 +1,19 @@
 TITLE = """<h1 align="center" id="space-title">🏆 Online Mind2Web Leaderboard</h1>"""
 
 INTRODUCTION_TEXT = """
-Online Mind2Web is a benchmark designed to evaluate real-world performance of next-generation web agents on dynamic online websites.
+Online Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites.
 
 [[Blog]]() [[Paper]]() [[Code]]() [[Data]]()
 
 ## Tasks
-Online Mind2Web includes 300 tasks from 136 popular websites across various domains. It covers a diverse set of user tasks, to evaluate web agents' performance in a real-world online environment.
-Tasks are categorized into three levels of difficulty based on the number of steps required by human annotators: fewer than 5 steps are labeled as easy, 6 to 10 steps as medium, and 11 or more steps as hard.
+Online Mind2Web includes 300 tasks from 136 popular websites across various domains. It covers a diverse set of user tasks to evaluate agents' performance in real-world environments.
 
-Online Mind2Web tasks can be found in [this dataset]().
+Tasks are categorized into three difficulty levels based on the number of steps human annotators need:
+- Easy: 1-5
+- Medium: 6-10
+- Hard: 11+
 
 ## Leaderboard
-Submission made by our team are labelled "OSUNLP".
 """
 
 SUBMISSION_TEXT = """
@@ -27,28 +28,36 @@ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
 Online Mind2Web"""
 
-SUBMIT_INTRODUCTION = """# Submit on Online Mind2Web Leaderboard Introduction
-## ⚠ Please note that you need to submit the trajectory file with the following format:
-We require each separate directory to store the trajectory of each task and put all tasks' trajectories into one directory. The format for one task's trajectory is as follows:
+SUBMIT_INTRODUCTION = """
+## ⚠ Please submit your trajectory files in the following format:
+Each task is stored in a folder named after its `task_id`, containing:
+
+- `trajectory/`: Stores screenshots of each step.
+- `result.json`: Task metadata and action history.
+
+**Structure:**
 ```
-example
-└── task_id
+main_directory/
+└── task_id/
     ├── result.json
     └── trajectory/
         ├── 0_screenshot.png
         ├── 1_screenshot.png
-        ├── 2_screenshot.png
-        ...
+        └── ...
 ```
-The format of the result.json file.
+
+**`result.json` format:**
 ```json
 {
     "task_id": 123,
     "task": "abc",
-    "action_history": ["abc", "xyz", ...]
+    "action_history": ["abc", "xyz", "..."]
 }
 ```
-Please send us your agent's name, model family, and organization via email at [email protected], along with the trajectory directory attached.
+Please send your agent's name, model family, and organization via email to [email protected], along with the trajectory directory attached.
+
+We will run the auto-evaluation. If you have conducted your own human evaluation, please also attach your human-eval results; we will spot-check them before adding them to the human-eval table.
+
 """
 DATA_DATASET = """## More Statistics for Online Mind2Web Benchmark
 """
````
human_Mind2Web-Online - Leaderboard_data.csv ADDED
```diff
@@ -0,0 +1,6 @@
+Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
+Operator,Unknown,OpenAI,OSU NLP,83.1,58.0,43.2,61.3,2025-3-22
+SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,60.2,25.2,8.1,30.7,2025-3-22
+Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,55.4,26.6,8.1,30.0,2025-3-22
+Claude Computer Use,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,56.6,20.3,14.9,29.0,2025-3-22
+Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,49.4,26.6,6.8,28.0,2025-3-22
```