xhluca
committed on
Commit
·
271f965
1
Parent(s):
28a9c15
add human annotation
Browse files- annotations.csv +0 -0
- demo.py +54 -0
annotations.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
demo.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
import ast
|
|
|
|
|
2 |
import pyparsing as pp
|
3 |
from dataclasses import dataclass
|
4 |
from typing import Any
|
@@ -474,16 +476,54 @@ def get_message_from_rule_based(judgment):
|
|
474 |
|
475 |
return output
|
476 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
477 |
|
478 |
base_traj_dir = "trajectories/cleaned"
|
479 |
base_screenshot_dir = "trajectories/screenshots"
|
480 |
base_judgments_dir = "trajectories/judgments"
|
|
|
481 |
|
482 |
base_traj_dir = Path(base_traj_dir)
|
483 |
base_screenshot_dir = Path(base_screenshot_dir)
|
484 |
|
485 |
hl_action_parser = _build_highlevel_action_parser()
|
486 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
487 |
with gr.Blocks(title="AgentRewardBench Demo") as demo:
|
488 |
gr.Markdown(
|
489 |
"""
|
@@ -552,6 +592,20 @@ with gr.Blocks(title="AgentRewardBench Demo") as demo:
|
|
552 |
value=default_judges,
|
553 |
)
|
554 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
555 |
@gr.render(inputs=[benchmark_dd, model_dd, task_id_dd, judge_dd])
|
556 |
def render_judge(benchmark, agent, task_id, judge_choices):
|
557 |
# load judgments
|
|
|
1 |
import ast
|
2 |
+
import csv
|
3 |
+
from textwrap import dedent
|
4 |
import pyparsing as pp
|
5 |
from dataclasses import dataclass
|
6 |
from typing import Any
|
|
|
476 |
|
477 |
return output
|
478 |
|
def records_to_dict(records, key_order: list = ['benchmark', 'model_name', 'task_id']):
    """
    Convert a list of records to a nested dict, keyed in key_order.

    The depth of the dict is determined by the number of keys in key_order;
    the leaf value at the innermost level is the full record itself.

    Args:
        records: iterable of mapping-like records; each must contain every
            field named in key_order.
        key_order: field names used as nesting levels, outermost first.
            Must be non-empty.

    Returns:
        Nested dict of depth len(key_order) whose leaves are the records.
        Records sharing all key_order values overwrite each other (last wins).

    Raises:
        ValueError: if key_order is empty.
        KeyError: if a record is missing one of the keys in key_order.
    """
    if not key_order:
        raise ValueError("key_order must contain at least one key")

    result = {}
    for record in records:
        # get the key values in the order of key_order
        keys = [record[key] for key in key_order]
        # walk/create the intermediate levels; setdefault does the
        # "create if missing, then descend" step in one call
        node = result
        for key in keys[:-1]:
            node = node.setdefault(key, {})
        # set the record at the innermost level
        node[keys[-1]] = record

    return result
500 |
+
|
def format_annotation(annotation):
    """Render one human annotation record as a short multi-line summary."""
    fields = [
        ("Success", "trajectory_success"),
        ("Side Effect", "trajectory_side_effect"),
        ("Looping", "trajectory_looping"),
    ]
    # One "Label: value" line per field, no leading/trailing whitespace.
    return "\n".join(f"{label}: {annotation[key]}" for label, key in fields)
508 |
+
|
509 |
|
510 |
base_traj_dir = "trajectories/cleaned"
|
511 |
base_screenshot_dir = "trajectories/screenshots"
|
512 |
base_judgments_dir = "trajectories/judgments"
|
513 |
+
annotations_path = "./annotations.csv"
|
514 |
|
515 |
base_traj_dir = Path(base_traj_dir)
|
516 |
base_screenshot_dir = Path(base_screenshot_dir)
|
517 |
|
518 |
hl_action_parser = _build_highlevel_action_parser()
|
519 |
|
520 |
+
# load annotations as records via csv
|
521 |
+
with open(annotations_path, "r") as f:
|
522 |
+
annotations = list(csv.DictReader(f))
|
523 |
+
annotations_dict = records_to_dict(annotations, key_order=['benchmark', 'model_name', 'task_id'])
|
524 |
+
|
525 |
+
# convert the annotations to a dict, with key order
|
526 |
+
|
527 |
with gr.Blocks(title="AgentRewardBench Demo") as demo:
|
528 |
gr.Markdown(
|
529 |
"""
|
|
|
592 |
value=default_judges,
|
593 |
)
|
594 |
|
595 |
+
# get annotation for the task from annotations_dict
|
596 |
+
@gr.render(inputs=[benchmark_dd, model_dd, task_id_dd])
|
597 |
+
def render_annotation(benchmark, agent, task_id):
|
598 |
+
bench_full = benchmarks_inverse[benchmark]
|
599 |
+
agent_full = agents_inverse[agent]
|
600 |
+
task_full = tasks_dict[bench_full]
|
601 |
+
task_id_full = f"{task_full}.{task_id}"
|
602 |
+
# get the annotation
|
603 |
+
annotation = annotations_dict[bench_full][agent_full][task_id_full]
|
604 |
+
annotation_str = format_annotation(annotation)
|
605 |
+
|
606 |
+
gr.Textbox(label="Expert Annotation", value=annotation_str, lines=3)
|
607 |
+
|
608 |
+
|
609 |
@gr.render(inputs=[benchmark_dd, model_dd, task_id_dd, judge_dd])
|
610 |
def render_judge(benchmark, agent, task_id, judge_choices):
|
611 |
# load judgments
|