xhluca committed on
Commit
271f965
·
1 Parent(s): 28a9c15

add human annotation

Browse files
Files changed (2) hide show
  1. annotations.csv +0 -0
  2. demo.py +54 -0
annotations.csv ADDED
The diff for this file is too large to render. See raw diff
 
demo.py CHANGED
@@ -1,4 +1,6 @@
1
  import ast
 
 
2
  import pyparsing as pp
3
  from dataclasses import dataclass
4
  from typing import Any
@@ -474,16 +476,54 @@ def get_message_from_rule_based(judgment):
474
 
475
  return output
476
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
477
 
478
  base_traj_dir = "trajectories/cleaned"
479
  base_screenshot_dir = "trajectories/screenshots"
480
  base_judgments_dir = "trajectories/judgments"
 
481
 
482
  base_traj_dir = Path(base_traj_dir)
483
  base_screenshot_dir = Path(base_screenshot_dir)
484
 
485
  hl_action_parser = _build_highlevel_action_parser()
486
 
 
 
 
 
 
 
 
487
  with gr.Blocks(title="AgentRewardBench Demo") as demo:
488
  gr.Markdown(
489
  """
@@ -552,6 +592,20 @@ with gr.Blocks(title="AgentRewardBench Demo") as demo:
552
  value=default_judges,
553
  )
554
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
555
  @gr.render(inputs=[benchmark_dd, model_dd, task_id_dd, judge_dd])
556
  def render_judge(benchmark, agent, task_id, judge_choices):
557
  # load judgments
 
1
  import ast
2
+ import csv
3
+ from textwrap import dedent
4
  import pyparsing as pp
5
  from dataclasses import dataclass
6
  from typing import Any
 
476
 
477
  return output
478
 
479
def records_to_dict(
    records,
    key_order: "list[str] | tuple[str, ...]" = ("benchmark", "model_name", "task_id"),
):
    """
    Convert an iterable of records into a nested dict keyed by ``key_order``.

    The depth of the returned dict equals ``len(key_order)``: each field named
    in ``key_order`` contributes one level of nesting, and the innermost level
    maps the value of the last field to the record itself. When two records
    share the same full key path, the later one overwrites the earlier one.

    Args:
        records: Iterable of mappings; each must contain every field listed
            in ``key_order``.
        key_order: Field names, outermost level first. Must not be empty.

    Returns:
        A nested dict of the form
        ``result[record[k0]][record[k1]]...[record[kN]] = record``.

    Raises:
        ValueError: If ``key_order`` is empty.
        KeyError: If a record lacks one of the fields in ``key_order``.
    """
    # Copy into a list so callers may pass any sequence, and so the
    # (immutable) default is never shared or mutated between calls.
    fields = list(key_order)
    if not fields:
        raise ValueError("key_order must contain at least one key")

    result = {}

    for record in records:
        # Resolve every key up front so a missing field fails before the
        # result is touched for this record.
        keys = [record[field] for field in fields]
        *parent_keys, leaf_key = keys

        # Walk (and create on demand) the intermediate levels.
        node = result
        for key in parent_keys:
            node = node.setdefault(key, {})

        node[leaf_key] = record

    return result
500
+
501
def format_annotation(annotation):
    """
    Render an annotation record as a three-line, human-readable summary.

    Args:
        annotation: Mapping that provides the keys ``trajectory_success``,
            ``trajectory_side_effect`` and ``trajectory_looping``.

    Returns:
        A string of the form::

            Success: <trajectory_success>
            Side Effect: <trajectory_side_effect>
            Looping: <trajectory_looping>

        with leading/trailing whitespace stripped.

    Raises:
        KeyError: If any of the three keys is missing from ``annotation``.
    """
    # Build the lines directly instead of dedenting an interpolated
    # triple-quoted string: dedent runs *after* interpolation, so a value
    # containing a newline would change the common margin and leave the
    # whole summary indented.
    lines = [
        f"Success: {annotation['trajectory_success']}",
        f"Side Effect: {annotation['trajectory_side_effect']}",
        f"Looping: {annotation['trajectory_looping']}",
    ]
    return "\n".join(lines).strip()
508
+
509
 
510
  base_traj_dir = "trajectories/cleaned"
511
  base_screenshot_dir = "trajectories/screenshots"
512
  base_judgments_dir = "trajectories/judgments"
513
+ annotations_path = "./annotations.csv"
514
 
515
  base_traj_dir = Path(base_traj_dir)
516
  base_screenshot_dir = Path(base_screenshot_dir)
517
 
518
  hl_action_parser = _build_highlevel_action_parser()
519
 
520
+ # load annotations as records via csv
521
# The csv module expects files opened with newline="" so that newlines
# embedded in quoted fields are parsed correctly on every platform.
with open(annotations_path, "r", newline="") as f:
    annotations = list(csv.DictReader(f))
# Index the rows as annotations_dict[benchmark][model_name][task_id] -> row.
annotations_dict = records_to_dict(annotations, key_order=['benchmark', 'model_name', 'task_id'])
524
+
525
+ # convert the annotations to a dict, with key order
526
+
527
  with gr.Blocks(title="AgentRewardBench Demo") as demo:
528
  gr.Markdown(
529
  """
 
592
  value=default_judges,
593
  )
594
 
595
+ # get annotation for the task from annotations_dict
596
@gr.render(inputs=[benchmark_dd, model_dd, task_id_dd])
def render_annotation(benchmark, agent, task_id):
    """
    Render the expert annotation for the currently selected trajectory.

    The three dropdown values are translated through ``benchmarks_inverse``,
    ``agents_inverse`` and ``tasks_dict`` into the keys used by
    ``annotations_dict`` (benchmark -> model -> "<task prefix>.<task_id>"),
    and the matching record is shown in a read-only style textbox.

    If no annotation exists for the selection, a placeholder message is
    displayed instead of raising ``KeyError`` and breaking the render.
    """
    bench_full = benchmarks_inverse[benchmark]
    agent_full = agents_inverse[agent]
    task_full = tasks_dict[bench_full]
    task_id_full = f"{task_full}.{task_id}"

    # Look the annotation up defensively: not every benchmark/agent/task
    # combination is guaranteed to have a row in the annotations file.
    annotation = (
        annotations_dict.get(bench_full, {})
        .get(agent_full, {})
        .get(task_id_full)
    )
    if annotation is None:
        annotation_str = "No annotation available for this trajectory."
    else:
        annotation_str = format_annotation(annotation)

    gr.Textbox(label="Expert Annotation", value=annotation_str, lines=3)
607
+
608
+
609
  @gr.render(inputs=[benchmark_dd, model_dd, task_id_dd, judge_dd])
610
  def render_judge(benchmark, agent, task_id, judge_choices):
611
  # load judgments