xhluca
committed on
Commit
·
99e2870
1
Parent(s):
331ed33
Simplify table
Browse files
demo.py
CHANGED
@@ -484,95 +484,94 @@ base_screenshot_dir = Path(base_screenshot_dir)
|
|
484 |
|
485 |
hl_action_parser = _build_highlevel_action_parser()
|
486 |
|
487 |
-
with gr.Blocks(title="AgentRewardBench Demo") as demo
|
488 |
gr.Markdown(
|
489 |
"""
|
490 |
-
# AgentRewardBench Demo
|
491 |
-
| [**🤗Dataset**](https://huggingface.co/datasets/McGill-NLP/agent-reward-bench) | **📄Paper (TBA)** | [**🌐Website**](https://agent-reward-bench.github.io) | [**🏆Leaderboard**](https://huggingface.co/spaces/McGill-NLP/agent-reward-bench-leaderboard) | [**💻Demo**](https://huggingface.co/spaces/McGill-NLP/agent-reward-bench-demo)
|
492 |
-
| :--: | :--: | :--: | :--: | :--: |
|
493 |
"""
|
494 |
)
|
495 |
-
with gr.
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
agents = list_agents(base_traj_dir, benchmark_default)
|
502 |
-
model_dd = gr.Dropdown(label="Agent", choices=agents, value=agents[0])
|
503 |
-
|
504 |
-
task_ids = list_task_ids(base_traj_dir, benchmark_default, agents[0])
|
505 |
-
task_id_dd = gr.Dropdown(label="Task ID", choices=task_ids, value=task_ids[0])
|
506 |
-
|
507 |
-
@benchmark_dd.change(inputs=[benchmark_dd], outputs=[model_dd])
|
508 |
-
def update_agents(benchmark):
|
509 |
-
agents = list_agents(base_traj_dir, benchmark)
|
510 |
-
return gr.Dropdown(label="Agent", choices=agents, value=agents[0])
|
511 |
-
|
512 |
-
@model_dd.change(inputs=[benchmark_dd, model_dd], outputs=[task_id_dd])
|
513 |
-
def update_task_ids(benchmark, agent):
|
514 |
-
task_ids = list_task_ids(base_traj_dir, benchmark, agent)
|
515 |
-
|
516 |
-
return gr.Dropdown(choices=task_ids, value=task_ids[0])
|
517 |
-
|
518 |
-
with gr.Column(scale=8):
|
519 |
-
@gr.render(inputs=[benchmark_dd, model_dd, task_id_dd])
|
520 |
-
def render_trajectory(benchmark, agent, task_id):
|
521 |
-
traj_path = get_trajectory_path(base_traj_dir, benchmark, agent, task_id)
|
522 |
-
with open(traj_path, "rb") as f:
|
523 |
-
traj = orjson.loads(f.read())
|
524 |
-
|
525 |
-
goal = replace_string_content(traj["goal"])
|
526 |
|
527 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
528 |
|
529 |
-
|
530 |
-
|
531 |
-
|
532 |
-
|
533 |
-
|
|
|
534 |
|
535 |
-
|
536 |
-
|
537 |
-
im = Image.open(screenshot_path)
|
538 |
-
im = apply_overlay_to_image(
|
539 |
-
im, step, highlevel_action_parser=hl_action_parser
|
540 |
)
|
541 |
-
|
542 |
-
|
543 |
-
if reasoning is not None:
|
544 |
-
gr.Textbox(reasoning, label="Reasoning", lines=4)
|
545 |
-
if action is not None:
|
546 |
-
gr.Textbox(action, label="Action", lines=2)
|
547 |
-
|
548 |
-
# multi-choices dropdown for judges
|
549 |
-
judge_dd = gr.Dropdown(
|
550 |
-
label="Judges",
|
551 |
-
choices=list(judges_dict.values()),
|
552 |
-
multiselect=True,
|
553 |
-
value=default_judges,
|
554 |
-
)
|
555 |
-
|
556 |
-
@gr.render(inputs=[benchmark_dd, model_dd, task_id_dd, judge_dd])
|
557 |
-
def render_judge(benchmark, agent, task_id, judge_choices):
|
558 |
-
# load judgments
|
559 |
-
for judge in judges_dict.values():
|
560 |
-
if judge not in judge_choices:
|
561 |
-
continue
|
562 |
-
|
563 |
-
judgment_path = get_judgment_path(
|
564 |
-
base_judgments_dir, benchmark, agent, judge, task_id
|
565 |
-
)
|
566 |
-
if not judgment_path.exists():
|
567 |
-
continue
|
568 |
|
569 |
-
|
570 |
-
|
571 |
-
|
572 |
-
|
573 |
-
|
574 |
-
|
575 |
|
576 |
-
|
577 |
|
578 |
demo.launch()
|
|
|
484 |
|
485 |
hl_action_parser = _build_highlevel_action_parser()
|
486 |
|
487 |
+
with gr.Blocks(title="AgentRewardBench Demo") as demo:
|
488 |
gr.Markdown(
|
489 |
"""
|
490 |
+
# AgentRewardBench Demo ([Website](https://agent-reward-bench.github.io))
|
|
|
|
|
491 |
"""
|
492 |
)
|
493 |
+
with gr.Row():
|
494 |
+
with gr.Column(scale=4):
|
495 |
+
benchmark_default = "WebArena"
|
496 |
+
benchmark_dd = gr.Dropdown(
|
497 |
+
label="Benchmark", choices=list_benchmarks(base_traj_dir), value=benchmark_default
|
498 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
499 |
|
500 |
+
agents = list_agents(base_traj_dir, benchmark_default)
|
501 |
+
model_dd = gr.Dropdown(label="Agent", choices=agents, value=agents[0])
|
502 |
+
|
503 |
+
task_ids = list_task_ids(base_traj_dir, benchmark_default, agents[0])
|
504 |
+
task_id_dd = gr.Dropdown(label="Task ID", choices=task_ids, value=task_ids[0])
|
505 |
+
|
506 |
+
@benchmark_dd.change(inputs=[benchmark_dd], outputs=[model_dd])
|
507 |
+
def update_agents(benchmark):
|
508 |
+
agents = list_agents(base_traj_dir, benchmark)
|
509 |
+
return gr.Dropdown(label="Agent", choices=agents, value=agents[0])
|
510 |
+
|
511 |
+
@model_dd.change(inputs=[benchmark_dd, model_dd], outputs=[task_id_dd])
|
512 |
+
def update_task_ids(benchmark, agent):
|
513 |
+
task_ids = list_task_ids(base_traj_dir, benchmark, agent)
|
514 |
+
|
515 |
+
return gr.Dropdown(choices=task_ids, value=task_ids[0])
|
516 |
+
|
517 |
+
with gr.Column(scale=8):
|
518 |
+
@gr.render(inputs=[benchmark_dd, model_dd, task_id_dd])
|
519 |
+
def render_trajectory(benchmark, agent, task_id):
|
520 |
+
traj_path = get_trajectory_path(base_traj_dir, benchmark, agent, task_id)
|
521 |
+
with open(traj_path, "rb") as f:
|
522 |
+
traj = orjson.loads(f.read())
|
523 |
+
|
524 |
+
goal = replace_string_content(traj["goal"])
|
525 |
+
|
526 |
+
gr.Textbox(label="Goal", value=goal, visible=True)
|
527 |
+
|
528 |
+
for step in traj["steps"]:
|
529 |
+
num = step["num"]
|
530 |
+
action = step["action"]
|
531 |
+
reasoning = step["reasoning"]
|
532 |
+
screenshot_path = step["screenshot_path"]
|
533 |
+
|
534 |
+
gr.Markdown(f"# Step {num}")
|
535 |
+
with gr.Group():
|
536 |
+
im = Image.open(screenshot_path)
|
537 |
+
im = apply_overlay_to_image(
|
538 |
+
im, step, highlevel_action_parser=hl_action_parser
|
539 |
+
)
|
540 |
+
format_ = "webp" if im.format is None else im.format
|
541 |
+
gr.Image(im, label="Screenshot", format=format_)
|
542 |
+
if reasoning is not None:
|
543 |
+
gr.Textbox(reasoning, label="Reasoning", lines=4)
|
544 |
+
if action is not None:
|
545 |
+
gr.Textbox(action, label="Action", lines=2)
|
546 |
+
|
547 |
+
# multi-choices dropdown for judges
|
548 |
+
judge_dd = gr.Dropdown(
|
549 |
+
label="Judges",
|
550 |
+
choices=list(judges_dict.values()),
|
551 |
+
multiselect=True,
|
552 |
+
value=default_judges,
|
553 |
+
)
|
554 |
|
555 |
+
@gr.render(inputs=[benchmark_dd, model_dd, task_id_dd, judge_dd])
|
556 |
+
def render_judge(benchmark, agent, task_id, judge_choices):
|
557 |
+
# load judgments
|
558 |
+
for judge in judges_dict.values():
|
559 |
+
if judge not in judge_choices:
|
560 |
+
continue
|
561 |
|
562 |
+
judgment_path = get_judgment_path(
|
563 |
+
base_judgments_dir, benchmark, agent, judge, task_id
|
|
|
|
|
|
|
564 |
)
|
565 |
+
if not judgment_path.exists():
|
566 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
567 |
|
568 |
+
with open(judgment_path, "rb") as f:
|
569 |
+
judgment = orjson.loads(f.read())
|
570 |
+
if judge == "Rule-based":
|
571 |
+
msg = get_message_from_rule_based(judgment)
|
572 |
+
else:
|
573 |
+
msg = get_message_from_judgment(judgment)
|
574 |
|
575 |
+
gr.Textbox(label=judge, value=msg, lines=4)
|
576 |
|
577 |
demo.launch()
|