xhluca
committed on
Commit
·
271f965
1
Parent(s):
28a9c15
add human annotation
Browse files- annotations.csv +0 -0
- demo.py +54 -0
annotations.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
demo.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
import ast
|
|
|
|
|
2 |
import pyparsing as pp
|
3 |
from dataclasses import dataclass
|
4 |
from typing import Any
|
@@ -474,16 +476,54 @@ def get_message_from_rule_based(judgment):
|
|
474 |
|
475 |
return output
|
476 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
477 |
|
478 |
base_traj_dir = "trajectories/cleaned"
|
479 |
base_screenshot_dir = "trajectories/screenshots"
|
480 |
base_judgments_dir = "trajectories/judgments"
|
|
|
481 |
|
482 |
base_traj_dir = Path(base_traj_dir)
|
483 |
base_screenshot_dir = Path(base_screenshot_dir)
|
484 |
|
485 |
hl_action_parser = _build_highlevel_action_parser()
|
486 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
487 |
with gr.Blocks(title="AgentRewardBench Demo") as demo:
|
488 |
gr.Markdown(
|
489 |
"""
|
@@ -552,6 +592,20 @@ with gr.Blocks(title="AgentRewardBench Demo") as demo:
|
|
552 |
value=default_judges,
|
553 |
)
|
554 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
555 |
@gr.render(inputs=[benchmark_dd, model_dd, task_id_dd, judge_dd])
|
556 |
def render_judge(benchmark, agent, task_id, judge_choices):
|
557 |
# load judgments
|
|
|
1 |
import ast
|
2 |
+
import csv
|
3 |
+
from textwrap import dedent
|
4 |
import pyparsing as pp
|
5 |
from dataclasses import dataclass
|
6 |
from typing import Any
|
|
|
476 |
|
477 |
return output
|
478 |
|
def records_to_dict(records, key_order: list = ['benchmark', 'model_name', 'task_id']):
    """
    Convert a list of records to a nested dict, keyed in key_order.

    The depth of the dict is determined by the number of keys in key_order;
    the leaf value at the innermost level is the full record itself.

    Args:
        records: iterable of mapping-like records; each must contain every
            field named in key_order.
        key_order: field names used as nesting levels, outermost first.
            Must be non-empty.

    Returns:
        Nested dict of depth len(key_order) whose leaves are the records.
        Records sharing all key_order values overwrite each other (last wins).

    Raises:
        ValueError: if key_order is empty.
        KeyError: if a record is missing one of the keys in key_order.
    """
    if not key_order:
        raise ValueError("key_order must contain at least one key")

    result = {}
    for record in records:
        # get the key values in the order of key_order
        keys = [record[key] for key in key_order]
        # walk/create the intermediate levels; setdefault does the
        # "create if missing, then descend" step in one call
        node = result
        for key in keys[:-1]:
            node = node.setdefault(key, {})
        # set the record at the innermost level
        node[keys[-1]] = record

    return result
500 |
+
|
def format_annotation(annotation):
    """Render one human annotation record as a short multi-line summary."""
    fields = [
        ("Success", "trajectory_success"),
        ("Side Effect", "trajectory_side_effect"),
        ("Looping", "trajectory_looping"),
    ]
    # One "Label: value" line per field, no leading/trailing whitespace.
    return "\n".join(f"{label}: {annotation[key]}" for label, key in fields)
508 |
+
|
509 |
|
510 |
base_traj_dir = "trajectories/cleaned"
|
511 |
base_screenshot_dir = "trajectories/screenshots"
|
512 |
base_judgments_dir = "trajectories/judgments"
|
513 |
+
annotations_path = "./annotations.csv"
|
514 |
|
515 |
base_traj_dir = Path(base_traj_dir)
|
516 |
base_screenshot_dir = Path(base_screenshot_dir)
|
517 |
|
518 |
hl_action_parser = _build_highlevel_action_parser()
|
519 |
|
520 |
+
# load annotations as records via csv
|
521 |
+
with open(annotations_path, "r") as f:
|
522 |
+
annotations = list(csv.DictReader(f))
|
523 |
+
annotations_dict = records_to_dict(annotations, key_order=['benchmark', 'model_name', 'task_id'])
|
524 |
+
|
525 |
+
# convert the annotations to a dict, with key order
|
526 |
+
|
527 |
with gr.Blocks(title="AgentRewardBench Demo") as demo:
|
528 |
gr.Markdown(
|
529 |
"""
|
|
|
592 |
value=default_judges,
|
593 |
)
|
594 |
|
595 |
+
# get annotation for the task from annotations_dict
|
596 |
+
@gr.render(inputs=[benchmark_dd, model_dd, task_id_dd])
|
597 |
+
def render_annotation(benchmark, agent, task_id):
|
598 |
+
bench_full = benchmarks_inverse[benchmark]
|
599 |
+
agent_full = agents_inverse[agent]
|
600 |
+
task_full = tasks_dict[bench_full]
|
601 |
+
task_id_full = f"{task_full}.{task_id}"
|
602 |
+
# get the annotation
|
603 |
+
annotation = annotations_dict[bench_full][agent_full][task_id_full]
|
604 |
+
annotation_str = format_annotation(annotation)
|
605 |
+
|
606 |
+
gr.Textbox(label="Expert Annotation", value=annotation_str, lines=3)
|
607 |
+
|
608 |
+
|
609 |
@gr.render(inputs=[benchmark_dd, model_dd, task_id_dd, judge_dd])
|
610 |
def render_judge(benchmark, agent, task_id, judge_choices):
|
611 |
# load judgments
|