Upload folder using huggingface_hub
Files changed:
- .DS_Store +0 -0
- app.py +23 -18
- auto_Mind2Web-Online - Leaderboard_data.csv +6 -0
- content.py +24 -15
- human_Mind2Web-Online - Leaderboard_data.csv +6 -0
.DS_Store ADDED
Binary file (6.15 kB)
app.py CHANGED
@@ -26,16 +26,18 @@ YEAR_VERSION = "2024"
 LOCAL_DEBUG = True
 
 # Display the results
-def get_dataframe_from_results(
-    df = pd.
+def get_dataframe_from_results(eval_path):
+    df = pd.read_csv(eval_path)
     df = df.sort_values(by=["Average SR"], ascending=False)
-
-
-    # df[
+    for format_column in ['Easy', 'Medium', 'Hard', 'Average SR']:
+        df[format_column] = df[format_column].map('{:.1f}'.format)
+    # df["Average SR"] = df["Average SR"].map('{:.1f}'.format)
     return df
 
-
-
+# auto_df = pd.read_csv("./auto_Mind2Web-Online - Leaderboard_data.csv")
+# human_df = pd.read_csv("./human_Mind2Web-Online - Leaderboard_data.csv")
+auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
+human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
 
 
 # def restart_space():
@@ -44,8 +46,9 @@ eval_dataframe_test = get_dataframe_from_results(eval_results=df)
 TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]
 
 def refresh():
-
-
+    auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
+    human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
+    return auto_eval_dataframe_test, human_eval_dataframe_test
 
 def upload_file(files):
     file_paths = [file.name for file in files]
@@ -66,17 +69,18 @@ with demo:
             lines=10,
         ) #.style(show_copy_button=True)
 
-        with gr.Tab("
-
-            value=
+        with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
+            human_leaderboard_table_test = gr.components.Dataframe(
+                value=human_eval_dataframe_test, datatype=TYPES, interactive=False,
+                column_widths=["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]
+            )
+        with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
+            auto_leaderboard_table_test = gr.components.Dataframe(
+                value=auto_eval_dataframe_test, datatype=TYPES, interactive=False,
                 column_widths=["15%", "15%", "15%", "15%", "10%", "10%", "10%", "10%", "15%"]
             )
-        # with gr.Tab("Online Mind2Web"):
-        #     with gr.Tab("π About"):
-        #         with gr.Row():
-        #             gr.Image(value="./figure/Difficulty.png", label="Number of tasks by difficulty level", show_label=True, scale=0.4)
 
-        with gr.Tab("
+        with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
             with gr.Row():
                 gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
 
@@ -85,7 +89,8 @@ with demo:
         refresh,
         inputs=[],
        outputs=[
-
+            auto_leaderboard_table_test,
+            human_leaderboard_table_test,
        ],
    )
    # gr.Markdown(DATA_DATASET, elem_classes="markdown-text")
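Read together, the new app.py follows a simple pattern: one loader reads and formats each CSV, and one callback reloads both tables. Below is a minimal runnable sketch of that pattern, assuming `gradio` and `pandas` are installed; the `gr.Blocks` scaffolding and the refresh button are illustrative, since the commit only shows the changed hunks.

```python
# Sketch of the two-tab leaderboard pattern this commit implements.
# CSV paths match the commit; the layout and button are illustrative.
import gradio as gr
import pandas as pd

TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]

def get_dataframe_from_results(eval_path):
    # Load a leaderboard CSV, sort by overall success rate, and render
    # the score columns with one decimal place.
    df = pd.read_csv(eval_path)
    df = df.sort_values(by=["Average SR"], ascending=False)
    for col in ["Easy", "Medium", "Hard", "Average SR"]:
        df[col] = df[col].map("{:.1f}".format)
    return df

def refresh():
    # Re-read both CSVs so newly appended rows show up without a restart.
    auto_df = get_dataframe_from_results("./auto_Mind2Web-Online - Leaderboard_data.csv")
    human_df = get_dataframe_from_results("./human_Mind2Web-Online - Leaderboard_data.csv")
    return auto_df, human_df

auto_df, human_df = refresh()
with gr.Blocks() as demo:
    with gr.Tab("Human Evaluation"):
        human_table = gr.Dataframe(value=human_df, datatype=TYPES, interactive=False)
    with gr.Tab("Auto Evaluation"):
        auto_table = gr.Dataframe(value=auto_df, datatype=TYPES, interactive=False)
    refresh_button = gr.Button("Refresh")
    # Same wiring as the commit: one callback feeds both tables.
    refresh_button.click(refresh, inputs=[], outputs=[auto_table, human_table])

if __name__ == "__main__":
    demo.launch()
```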
auto_Mind2Web-Online - Leaderboard_data.csv ADDED
@@ -0,0 +1,6 @@
+Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
+Operator,Unknown,OpenAI,OSU NLP,80.3,73.4,59,71.8,2025-3-22
+SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,65.1,36.1,18.5,39.8,2025-3-22
+Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,58.6,37.5,24.3,40.1,2025-3-22
+Claude Computer Use,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,61.9,28.1,21.2,35.8,2025-3-22
+Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,57.4,31.9,14.4,34.7,2025-3-22
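Note that the rows are not pre-sorted: Browser Use (40.1) appears after SeeAct (39.8). That is fine because `get_dataframe_from_results` sorts by `Average SR` on load. A quick check with `pandas`:

```python
import pandas as pd

# Row order in the CSV is irrelevant: the app sorts by "Average SR" on load.
df = pd.read_csv("./auto_Mind2Web-Online - Leaderboard_data.csv")
print(df.sort_values("Average SR", ascending=False)[["Agent", "Average SR"]])
# Resulting order: Operator (71.8), Browser Use (40.1), SeeAct (39.8),
# Claude Computer Use (35.8), Agent-E (34.7)
```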
content.py CHANGED
@@ -1,18 +1,19 @@
 TITLE = """<h1 align="center" id="space-title">π Online Mind2Web Leaderboard</h1>"""
 
 INTRODUCTION_TEXT = """
-Online Mind2Web is a benchmark designed to evaluate real-world performance of
+Online Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites.
 
 [[Blog]]() [[Paper]]() [[Code]]() [[Data]]()
 
 ## Tasks
-Online Mind2Web includes 300 tasks from 136 popular websites across various domains. It covers a diverse set of user tasks, to evaluate
-Tasks are categorized into three levels of difficulty based on the number of steps required by human annotators: fewer than 5 steps are labeled as easy, 6 to 10 steps as medium, and 11 or more steps as hard.
+Online Mind2Web includes 300 tasks from 136 popular websites across various domains. It covers a diverse set of user tasks to evaluate agents' performance in real-world environments.
 
-
+Tasks are categorized into three difficulty levels based on the number of steps human annotators need:
+- Easy: 1-5
+- Medium: 6-10
+- Hard: 11+
 
 ## Leaderboard
-Submission made by our team are labelled "OSUNLP".
 """
 
 SUBMISSION_TEXT = """
@@ -27,28 +28,36 @@ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
 Online Mind2Web"""
 
-SUBMIT_INTRODUCTION = """
-## Please
-
+SUBMIT_INTRODUCTION = """
+## Please submit your trajectories in the following format:
+Each task is stored in a folder named after its `task_id`, containing:
+
+- `trajectory/`: stores screenshots of each step.
+- `result.json`: task metadata and action history.
+
+**Structure:**
 ```
-
-└── task_id
+main_directory/
+└── task_id/
     ├── result.json
     └── trajectory/
         ├── 0_screenshot.png
         ├── 1_screenshot.png
-
-    ...
+        └── ...
 ```
-
+
+**`result.json` format:**
 ```json
 {
     "task_id": 123,
     "task": "abc",
-    "action_history": ["abc", "xyz", ...]
+    "action_history": ["abc", "xyz", "..."]
 }
 ```
-Please send
+Please email your agent's name, model family, and organization to [email protected], with the trajectory directory attached.
+
+We will run the auto-evaluation. If you have conducted your own human evaluation, please also attach your human eval results; we will spot-check these before adding them to the human-eval table.
+
 """
 DATA_DATASET = """## More Statistics for Online Mind2Web Benchmark
 """
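The submission layout documented in SUBMIT_INTRODUCTION is machine-checkable, so a pre-flight check may save a round trip with the maintainers. The following is a hedged sketch of such a validator; it is not part of the commit, and `validate_submission` is a hypothetical helper name.

```python
# Hypothetical pre-submission check against the documented layout:
# main_directory/<task_id>/{result.json, trajectory/*_screenshot.png}.
import json
from pathlib import Path

def validate_submission(main_directory: str) -> list[str]:
    """Return a list of problems found; an empty list means the layout looks right."""
    problems = []
    for task_dir in sorted(Path(main_directory).iterdir()):
        if not task_dir.is_dir():
            continue  # ignore stray files at the top level
        result_path = task_dir / "result.json"
        if not result_path.is_file():
            problems.append(f"{task_dir.name}: missing result.json")
        else:
            try:
                result = json.loads(result_path.read_text())
            except json.JSONDecodeError:
                problems.append(f"{task_dir.name}: result.json is not valid JSON")
            else:
                # Keys documented in the result.json example above.
                for key in ("task_id", "task", "action_history"):
                    if key not in result:
                        problems.append(f"{task_dir.name}: result.json lacks '{key}'")
        trajectory = task_dir / "trajectory"
        if not trajectory.is_dir():
            problems.append(f"{task_dir.name}: missing trajectory/ folder")
        elif not list(trajectory.glob("*_screenshot.png")):
            problems.append(f"{task_dir.name}: no *_screenshot.png files")
    return problems

if __name__ == "__main__":
    print(validate_submission("./main_directory"))
```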
human_Mind2Web-Online - Leaderboard_data.csv ADDED
@@ -0,0 +1,6 @@
+Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
+Operator,Unknown,OpenAI,OSU NLP,83.1,58.0,43.2,61.3,2025-3-22
+SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,60.2,25.2,8.1,30.7,2025-3-22
+Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,55.4,26.6,8.1,30.0,2025-3-22
+Claude Computer Use,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,56.6,20.3,14.9,29.0,2025-3-22
+Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,49.4,26.6,6.8,28.0,2025-3-22
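Since both CSVs share the same schema and agent names, the auto and human success rates can be joined directly, for example to see how far auto-evaluation drifts from human judgment. A sketch, with paths as in this commit; the `SR gap` column is purely illustrative:

```python
import pandas as pd

# Join the two leaderboards on agent name to compare auto vs. human SR.
auto = pd.read_csv("./auto_Mind2Web-Online - Leaderboard_data.csv")
human = pd.read_csv("./human_Mind2Web-Online - Leaderboard_data.csv")
merged = auto.merge(human, on="Agent", suffixes=("_auto", "_human"))
merged["SR gap"] = merged["Average SR_auto"] - merged["Average SR_human"]
print(merged[["Agent", "Average SR_auto", "Average SR_human", "SR gap"]]
      .sort_values("Average SR_human", ascending=False))
```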