xhluca committed on
Commit
99e2870
·
1 Parent(s): 331ed33

Simplify table

Browse files
Files changed (1) hide show
  1. demo.py +79 -80
demo.py CHANGED
@@ -484,95 +484,94 @@ base_screenshot_dir = Path(base_screenshot_dir)
484
 
485
  hl_action_parser = _build_highlevel_action_parser()
486
 
487
- with gr.Blocks(title="AgentRewardBench Demo") as demo, gr.Row():
488
  gr.Markdown(
489
  """
490
- # AgentRewardBench Demo
491
- | [**🤗Dataset**](https://huggingface.co/datasets/McGill-NLP/agent-reward-bench) | **📄Paper (TBA)** | [**🌐Website**](https://agent-reward-bench.github.io) | [**🏆Leaderboard**](https://huggingface.co/spaces/McGill-NLP/agent-reward-bench-leaderboard) | [**💻Demo**](https://huggingface.co/spaces/McGill-NLP/agent-reward-bench-demo)
492
- | :--: | :--: | :--: | :--: | :--: |
493
  """
494
  )
495
- with gr.Column(scale=4):
496
- benchmark_default = "WebArena"
497
- benchmark_dd = gr.Dropdown(
498
- label="Benchmark", choices=list_benchmarks(base_traj_dir), value=benchmark_default
499
- )
500
-
501
- agents = list_agents(base_traj_dir, benchmark_default)
502
- model_dd = gr.Dropdown(label="Agent", choices=agents, value=agents[0])
503
-
504
- task_ids = list_task_ids(base_traj_dir, benchmark_default, agents[0])
505
- task_id_dd = gr.Dropdown(label="Task ID", choices=task_ids, value=task_ids[0])
506
-
507
- @benchmark_dd.change(inputs=[benchmark_dd], outputs=[model_dd])
508
- def update_agents(benchmark):
509
- agents = list_agents(base_traj_dir, benchmark)
510
- return gr.Dropdown(label="Agent", choices=agents, value=agents[0])
511
-
512
- @model_dd.change(inputs=[benchmark_dd, model_dd], outputs=[task_id_dd])
513
- def update_task_ids(benchmark, agent):
514
- task_ids = list_task_ids(base_traj_dir, benchmark, agent)
515
-
516
- return gr.Dropdown(choices=task_ids, value=task_ids[0])
517
-
518
- with gr.Column(scale=8):
519
- @gr.render(inputs=[benchmark_dd, model_dd, task_id_dd])
520
- def render_trajectory(benchmark, agent, task_id):
521
- traj_path = get_trajectory_path(base_traj_dir, benchmark, agent, task_id)
522
- with open(traj_path, "rb") as f:
523
- traj = orjson.loads(f.read())
524
-
525
- goal = replace_string_content(traj["goal"])
526
 
527
- gr.Textbox(label="Goal", value=goal, visible=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
 
529
- for step in traj["steps"]:
530
- num = step["num"]
531
- action = step["action"]
532
- reasoning = step["reasoning"]
533
- screenshot_path = step["screenshot_path"]
 
534
 
535
- gr.Markdown(f"# Step {num}")
536
- with gr.Group():
537
- im = Image.open(screenshot_path)
538
- im = apply_overlay_to_image(
539
- im, step, highlevel_action_parser=hl_action_parser
540
  )
541
- format_ = "webp" if im.format is None else im.format
542
- gr.Image(im, label="Screenshot", format=format_)
543
- if reasoning is not None:
544
- gr.Textbox(reasoning, label="Reasoning", lines=4)
545
- if action is not None:
546
- gr.Textbox(action, label="Action", lines=2)
547
-
548
- # multi-choices dropdown for judges
549
- judge_dd = gr.Dropdown(
550
- label="Judges",
551
- choices=list(judges_dict.values()),
552
- multiselect=True,
553
- value=default_judges,
554
- )
555
-
556
- @gr.render(inputs=[benchmark_dd, model_dd, task_id_dd, judge_dd])
557
- def render_judge(benchmark, agent, task_id, judge_choices):
558
- # load judgments
559
- for judge in judges_dict.values():
560
- if judge not in judge_choices:
561
- continue
562
-
563
- judgment_path = get_judgment_path(
564
- base_judgments_dir, benchmark, agent, judge, task_id
565
- )
566
- if not judgment_path.exists():
567
- continue
568
 
569
- with open(judgment_path, "rb") as f:
570
- judgment = orjson.loads(f.read())
571
- if judge == "Rule-based":
572
- msg = get_message_from_rule_based(judgment)
573
- else:
574
- msg = get_message_from_judgment(judgment)
575
 
576
- gr.Textbox(label=judge, value=msg, lines=4)
577
 
578
  demo.launch()
 
484
 
485
  hl_action_parser = _build_highlevel_action_parser()
486
 
487
+ with gr.Blocks(title="AgentRewardBench Demo") as demo:
488
  gr.Markdown(
489
  """
490
+ # AgentRewardBench Demo ([Website](https://agent-reward-bench.github.io))
 
 
491
  """
492
  )
493
+ with gr.Row():
494
+ with gr.Column(scale=4):
495
+ benchmark_default = "WebArena"
496
+ benchmark_dd = gr.Dropdown(
497
+ label="Benchmark", choices=list_benchmarks(base_traj_dir), value=benchmark_default
498
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
499
 
500
+ agents = list_agents(base_traj_dir, benchmark_default)
501
+ model_dd = gr.Dropdown(label="Agent", choices=agents, value=agents[0])
502
+
503
+ task_ids = list_task_ids(base_traj_dir, benchmark_default, agents[0])
504
+ task_id_dd = gr.Dropdown(label="Task ID", choices=task_ids, value=task_ids[0])
505
+
506
+ @benchmark_dd.change(inputs=[benchmark_dd], outputs=[model_dd])
507
+ def update_agents(benchmark):
508
+ agents = list_agents(base_traj_dir, benchmark)
509
+ return gr.Dropdown(label="Agent", choices=agents, value=agents[0])
510
+
511
+ @model_dd.change(inputs=[benchmark_dd, model_dd], outputs=[task_id_dd])
512
+ def update_task_ids(benchmark, agent):
513
+ task_ids = list_task_ids(base_traj_dir, benchmark, agent)
514
+
515
+ return gr.Dropdown(choices=task_ids, value=task_ids[0])
516
+
517
+ with gr.Column(scale=8):
518
+ @gr.render(inputs=[benchmark_dd, model_dd, task_id_dd])
519
+ def render_trajectory(benchmark, agent, task_id):
520
+ traj_path = get_trajectory_path(base_traj_dir, benchmark, agent, task_id)
521
+ with open(traj_path, "rb") as f:
522
+ traj = orjson.loads(f.read())
523
+
524
+ goal = replace_string_content(traj["goal"])
525
+
526
+ gr.Textbox(label="Goal", value=goal, visible=True)
527
+
528
+ for step in traj["steps"]:
529
+ num = step["num"]
530
+ action = step["action"]
531
+ reasoning = step["reasoning"]
532
+ screenshot_path = step["screenshot_path"]
533
+
534
+ gr.Markdown(f"# Step {num}")
535
+ with gr.Group():
536
+ im = Image.open(screenshot_path)
537
+ im = apply_overlay_to_image(
538
+ im, step, highlevel_action_parser=hl_action_parser
539
+ )
540
+ format_ = "webp" if im.format is None else im.format
541
+ gr.Image(im, label="Screenshot", format=format_)
542
+ if reasoning is not None:
543
+ gr.Textbox(reasoning, label="Reasoning", lines=4)
544
+ if action is not None:
545
+ gr.Textbox(action, label="Action", lines=2)
546
+
547
+ # multi-choices dropdown for judges
548
+ judge_dd = gr.Dropdown(
549
+ label="Judges",
550
+ choices=list(judges_dict.values()),
551
+ multiselect=True,
552
+ value=default_judges,
553
+ )
554
 
555
+ @gr.render(inputs=[benchmark_dd, model_dd, task_id_dd, judge_dd])
556
+ def render_judge(benchmark, agent, task_id, judge_choices):
557
+ # load judgments
558
+ for judge in judges_dict.values():
559
+ if judge not in judge_choices:
560
+ continue
561
 
562
+ judgment_path = get_judgment_path(
563
+ base_judgments_dir, benchmark, agent, judge, task_id
 
 
 
564
  )
565
+ if not judgment_path.exists():
566
+ continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567
 
568
+ with open(judgment_path, "rb") as f:
569
+ judgment = orjson.loads(f.read())
570
+ if judge == "Rule-based":
571
+ msg = get_message_from_rule_based(judgment)
572
+ else:
573
+ msg = get_message_from_judgment(judgment)
574
 
575
+ gr.Textbox(label=judge, value=msg, lines=4)
576
 
577
  demo.launch()