xhluca commited on
Commit
51c2a5b
·
1 Parent(s): fed44d5

add initial files

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. demo.py +560 -0
  3. requirements.txt +3 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ trajectories/
demo.py ADDED
@@ -0,0 +1,560 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import pyparsing as pp
3
+ from dataclasses import dataclass
4
+ from typing import Any
5
+ import json
6
+ from pathlib import Path
7
+ import logging
8
+
9
+ import orjson
10
+ from PIL import Image
11
+ import gradio as gr
12
+ import numpy as np
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
def _invert_mapping(mapping):
    """Return a new dict that maps every value of *mapping* back to its key."""
    return dict(zip(mapping.values(), mapping.keys()))


# Benchmark identifier (as used in directory names) -> label shown in the UI.
benchmarks_dict = {
    "assistantbench": "AssistantBench",
    "visualwebarena": "VisualWebArena",
    "webarena": "WebArena",
    "workarena": "WorkArena",
}

# Benchmark identifier -> task-name prefix used in trajectory/judgment file names.
tasks_dict = {
    "assistantbench": "assistantbench.improved.validation",
    "visualwebarena": "visualwebarena.resized",
    "webarena": "webarena",
    "workarena": "workarena.servicenow",
}

# Agent identifier (directory name) -> label shown in the UI.
agents_dict = {
    "GenericAgent-anthropic_claude-3.7-sonnet": "Claude 3.7 Sonnet",
    "GenericAgent-gpt-4o-2024-11-20": "GPT-4o",
    "GenericAgent-meta-llama_Llama-3.3-70B-Instruct": "Llama-3.3 70B",
    "GenericAgent-Qwen_Qwen2.5-VL-72B-Instruct": "Qwen2.5-VL 72B",
}

# Judge identifier (directory name) -> label shown in the UI.
judges_dict = {
    "aer": "AER-C",
    "nnetnav": "NNetNav",
    "claude-3.7-sonnet-noaxtree": "Claude 3.7 Sonnet (Screen)",
    "claude-3.7-sonnet-noscreen": "Claude 3.7 Sonnet (Axtree)",
    "gpt-4o-noaxtree": "GPT-4o (Screen)",
    "gpt-4o-noscreen": "GPT-4o (Axtree)",
    "qwen-2.5-vl-noaxtree": "Qwen 2.5 VL (Screen)",
    "qwen-2.5-vl-noscreen": "Qwen 2.5 VL (Axtree)",
    "llama-3.3-70b-noscreen": "Llama 3.3 70B",
    "functional": "Rule-based",
}

# Judge labels that are pre-selected in the judges dropdown.
default_judges = [
    "AER-C",
    "NNetNav",
    "Claude 3.7 Sonnet (Screen)",
    "GPT-4o (Screen)",
    "Qwen 2.5 VL (Screen)",
    "Llama 3.3 70B",
]

# Reverse lookups: UI label (or task prefix) -> identifier.
benchmarks_inverse = _invert_mapping(benchmarks_dict)
agents_inverse = _invert_mapping(agents_dict)
tasks_inverse = _invert_mapping(tasks_dict)
judges_inverse = _invert_mapping(judges_dict)
63
+
64
+
65
@dataclass
class NamedArgument:
    """
    A ``name=value`` argument, as produced by the action parser for keyword arguments.

    Source: https://github.com/ServiceNow/BrowserGym/blob/c3336ef61781ce39166ee6a9551dbfc8fac32ddc/browsergym/core/src/browsergym/core/action/parsers.py#L9
    """

    name: str
    value: Any

    def __repr__(self):
        # Render the argument the way it would be written inside a Python call.
        return "{}={!r}".format(self.name, self.value)
76
+
77
+
78
def overlay_som(
    screenshot: np.typing.ArrayLike,
    extra_properties: dict,
    fontsize: int = 12,
    linewidth: int = 2,
    tag_margin: int = 2,
):
    """
    Draw set-of-marks annotations (dashed bounding box plus an id tag) on a screenshot.

    Source: https://github.com/ServiceNow/BrowserGym/blob/c3336ef61781ce39166ee6a9551dbfc8fac32ddc/browsergym/core/src/browsergym/utils/obs.py#L429

    Args:
        screenshot: image data passed to ``PIL.Image.fromarray``.
        extra_properties: mapping of element id -> properties. Only entries whose
            ``"set_of_marks"`` and ``"bbox"`` values are both truthy are drawn;
            ``"bbox"`` is unpacked as ``(x, y, width, height)``.
        fontsize: size of the default font used for the id tag.
        linewidth: stroke width of the dashed bounding box.
        tag_margin: padding around the tag text (one extra pixel is added for the border).

    Returns:
        The annotated image as an RGB numpy array.
    """
    import math

    from PIL import Image, ImageDraw, ImageFont

    black = (0, 0, 0, 255)
    white = (255, 255, 255, 255)

    # Work on an RGBA copy so the caller's data is left untouched.
    canvas = Image.fromarray(screenshot).copy().convert(mode="RGBA")
    pen = ImageDraw.Draw(canvas)
    font = ImageFont.load_default(size=fontsize)

    # Adapted from https://stackoverflow.com/questions/51908563/dotted-or-dashed-line-with-python-pillow/58885306#58885306
    def draw_dashed(xa, ya, xb, yb, fill, width, dash_length=4, nodash_length=8):
        delta_x = xb - xa  # can be negative
        delta_y = yb - ya  # can be negative
        length = math.hypot(delta_x, delta_y)
        if length == 0:
            return  # nothing to draw, and avoids dividing by zero below
        step_x = delta_x / length  # x increment per pixel travelled along the line
        step_y = delta_y / length  # y increment per pixel travelled along the line
        offset = 0
        while offset < length:
            # the final dash is clipped to the end of the line
            stop = min(offset + dash_length, length)
            pen.line(
                (
                    round(xa + step_x * offset),
                    round(ya + step_y * offset),
                    round(xa + step_x * stop),
                    round(ya + step_y * stop),
                ),
                fill=fill,
                width=width,
            )
            offset += dash_length + nodash_length

    for bid, properties in extra_properties.items():
        if not (properties["set_of_marks"] and properties["bbox"]):
            continue

        x, y, width, height = properties["bbox"]
        left, top = x, y
        right, bottom = x + width, y + height

        # skip small boxes
        area = (right - left) * (bottom - top)
        if area < 20:
            logger.warning(
                f'som overlay: skipping bid "{bid}" due to bbox too small (area={area})'
            )
            continue

        # dashed outline: top, right, bottom, left edge (in that order)
        corners = [(left, top), (right, top), (right, bottom), (left, bottom)]
        for (xa, ya), (xb, yb) in zip(corners, corners[1:] + corners[:1]):
            draw_dashed(xa, ya, xb, yb, fill=black, width=linewidth)

        # text extent as (left, top, right, bottom)
        text_left, text_top, text_right, text_bottom = font.getbbox(bid)

        # tag size, including margins on every side
        tag_size = (
            (text_right - text_left + 2 * (tag_margin + 1)),
            (text_bottom - text_top + 2 * (tag_margin + 1)),
        )

        # black tag with white text and a 1px white border
        tag_img = Image.new("RGBA", tag_size, "black")
        tag_pen = ImageDraw.Draw(tag_img)
        tag_pen.text(
            (-text_left + tag_margin + 1, -text_top + tag_margin + 1),
            bid,
            font=font,
            fill=white,
            spacing=0,
        )
        tag_pen.rectangle(
            (0, 0, tag_size[0] - 1, tag_size[1] - 1),
            fill=None,
            outline=white,
            width=1,
        )

        # paste the tag at the upper-left corner of the bounding box
        tag_pos = [round(x + 0), round(y - tag_size[1] / 2 + 4)]
        canvas.paste(tag_img, tag_pos)

    # back to 3 channels, returned as a numpy array
    return np.array(canvas.convert(mode="RGB"))
193
+
194
+
195
def apply_overlay_to_image(im, step, highlevel_action_parser=None):
    """
    Mark on *im* the element targeted by the step's action, when it can be identified.

    Args:
        im: image to annotate; it is converted with ``np.array`` before drawing.
        step: mapping read via ``.get`` for ``"action"`` and ``"extra_element_properties"``.
        highlevel_action_parser: parser forwarded to ``get_element_from_action_str``.

    Returns:
        A new PIL image with the overlay, or *im* itself when the step has no action,
        no element can be extracted from the action, or the element has no entry in
        the step's extra element properties.
    """
    action = step.get("action", None)
    if action is None:
        return im

    # the element the action refers to (first argument of the parsed call)
    target = get_element_from_action_str(
        action, highlevel_action_parser=highlevel_action_parser
    )
    if target is None:
        return im

    all_properties = step.get("extra_element_properties", {})
    if target not in all_properties:
        return im

    # draw only the targeted element, not every known element
    annotated = overlay_som(
        np.array(im), extra_properties={target: all_properties[target]}
    )
    return Image.fromarray(annotated)
220
+
221
+
222
def _build_highlevel_action_parser() -> pp.ParserElement:
    """
    SOURCE: https://github.com/ServiceNow/BrowserGym/blob/c3336ef61781ce39166ee6a9551dbfc8fac32ddc/browsergym/core/src/browsergym/core/action/parsers.py#L17
    ---------------

    Returns:
        An action parser that accepts Python-like function calls whose arguments are
        string, number, list, tuple or dict literals, or True / False / None.
        Example:
            func("a", 42, None, True, [2, 4, "s"], {"a_key": "a_value"}, )
        The parser is loose and accepts multi-line or single-line combinations of calls.
        Example:
            func() func()
                func()
        Python comments are ignored.
        Example:
            # this is a comment
            func() # this function call will be parsed
            # func() # this one will not
        Each parsed call is a (function_name, function_args) group, one per function
        call in the input. Keyword arguments appear in function_args as NamedArgument
        instances.
    """

    def keyword_as(text, value):
        # keyword token that is replaced by the given Python value when matched
        return pp.Keyword(text).set_parse_action(pp.replace_with(value))

    true_kw = keyword_as("True", True)
    false_kw = keyword_as("False", False)
    none_kw = keyword_as("None", None)

    lbrack, rbrack, lbrace, rbrace, lparen, rparen, colon = map(
        pp.Suppress, "[]{}():"
    )

    def unquote(toks):
        # turn the quoted source text into the actual Python string
        return ast.literal_eval(toks[0])

    string = pp.python_quoted_string().set_parse_action(unquote)
    number = pp.pyparsing_common.number()
    # containers are recursive, so they are declared first and defined below
    dict_expr = pp.Forward().set_name("dict")
    list_expr = pp.Forward().set_name("list")
    tuple_expr = pp.Forward().set_name("tuple")
    element = (
        string | number | dict_expr | list_expr | tuple_expr | true_kw | false_kw | none_kw
    ).set_name("element")

    list_items = pp.DelimitedList(element, allow_trailing_delim=True).set_name(None)
    list_expr << pp.Group(lbrack + pp.Optional(list_items) + rbrack, aslist=True)
    tuple_expr << pp.Group(
        lparen + pp.Optional(list_items) + rparen, aslist=True
    ).set_parse_action(lambda toks: tuple(toks[0]))

    dict_item = pp.Group(string + colon + element, aslist=True).set_name("dict item")
    dict_items = pp.DelimitedList(dict_item, allow_trailing_delim=True).set_name(None)
    dict_expr << pp.Dict(lbrace + pp.Optional(dict_items) + rbrace, asdict=True)

    positional_args = pp.DelimitedList(element, allow_trailing_delim=True).set_name(
        None
    )
    named_arg = (
        pp.pyparsing_common.identifier() + pp.Literal("=") + element
    ).set_parse_action(lambda toks: NamedArgument(name=toks[0], value=toks[2]))
    keyword_args = pp.DelimitedList(named_arg, allow_trailing_delim=True).set_name(
        None
    )
    function_call = pp.pyparsing_common.identifier() + pp.Group(
        lparen + pp.Optional(positional_args) + pp.Optional(keyword_args) + rparen,
        aslist=True,
    )

    # calls are simply juxtaposed (empty delimiter); comments are skipped
    calls = pp.DelimitedList(pp.Group(function_call), delim="")
    calls.ignore(pp.python_style_comment())

    return calls
295
+
296
+
297
def replace_string_content(s, start="https://", end=".png", replacement="<URL>"):
    """
    Replace the first span of *s* running from *start* through *end* with *replacement*.

    Example: ``"open https://www.example.com/image.png now"`` becomes ``"open <URL> now"``.

    Args:
        s: text to process.
        start: marker that opens the span to erase.
        end: marker that closes the span; it is searched for from the position of *start*.
        replacement: text inserted in place of the span (both markers included).

    Returns:
        The text with the first complete span replaced, or *s* unchanged when
        either marker is missing.
    """
    start_index = s.find(start)
    if start_index == -1:
        return s

    # Check the raw find() result before adding len(end); otherwise a missing
    # end marker can never be detected and the text would be cut at a bogus index.
    end_index = s.find(end, start_index)
    if end_index == -1:
        return s

    return s[:start_index] + replacement + s[end_index + len(end):]
309
+
310
+
311
def infer_task_name(base_traj_dir, benchmark, agent):
    """
    Derive a task-name prefix from one trajectory file name of an agent/benchmark pair.

    Args:
        base_traj_dir: root directory of the trajectories.
        benchmark: benchmark label, a key of ``benchmarks_inverse``.
        agent: agent label, a key of ``agents_inverse``.

    Returns:
        The stem of one JSON file in the run directory with its last
        dot-separated component removed (an empty string if the stem has no dot).

    Raises:
        FileNotFoundError: if the run directory does not exist or holds no JSON file.
    """
    agent_full = agents_inverse[agent]
    benchmark_full = benchmarks_inverse[benchmark]

    # NOTE(review): the directory name is built from the benchmark id here, while the
    # other path helpers in this file use tasks_dict[benchmark_full] -- confirm intent.
    run_dir = Path(
        base_traj_dir,
        benchmark_full,
        agent_full,
        f"{agent_full}_on_{benchmark_full}",
    ).resolve()
    if not run_dir.exists():
        raise FileNotFoundError(f"Trajectory directory not found: {run_dir}")

    candidates = list(run_dir.glob("*.json"))
    if not candidates:
        raise FileNotFoundError(f"No JSON files found in: {run_dir}")

    # any file will do: keep everything before the last dot of the first one's stem
    prefix, _, _ = candidates[0].stem.rpartition(".")
    return prefix
336
+
337
+
338
def get_element_from_action_str(action_str, highlevel_action_parser=None):
    """
    Return the first argument of the first function call found in *action_str*.

    Args:
        action_str: action text made of Python-like function calls.
        highlevel_action_parser: parser to use; when ``None`` a default one is
            built with ``_build_highlevel_action_parser``.

    Returns:
        The first argument of the first parsed call, or ``None`` when the text
        cannot be parsed or the call has no arguments.
    """
    import pyparsing

    # Build the default parser only when the caller did not provide one.
    # (The test used to be inverted: a supplied parser was rebuilt on every
    # call and a missing one was dereferenced as None.)
    if highlevel_action_parser is None:
        highlevel_action_parser = _build_highlevel_action_parser()

    try:
        function_calls = highlevel_action_parser.parse_string(
            action_str, parse_all=True
        )
    except pyparsing.exceptions.ParseException:
        # unparseable action: there is no element to report
        return None

    # each parsed call is a (function_name, function_args) pair
    _action_function, action_args = function_calls[0]

    # first argument is the element
    return action_args[0] if len(action_args) > 0 else None
360
+
361
+
362
def get_trajectory_path(base_traj_dir, benchmark, agent, task_id):
    """
    Locate the trajectory JSON file of one task.

    Args:
        base_traj_dir: root directory of the trajectories.
        benchmark: benchmark label, a key of ``benchmarks_inverse``.
        agent: agent label, a key of ``agents_inverse``.
        task_id: identifier appended to the task prefix in the file name.

    Returns:
        The resolved path
        ``<base>/<benchmark>/<agent>/<agent>_on_<task>/<task>.<task_id>.json``.

    Raises:
        FileNotFoundError: if that file does not exist.
    """
    agent_full = agents_inverse[agent]
    benchmark_full = benchmarks_inverse[benchmark]
    task_full = tasks_dict[benchmark_full]

    run_dir = Path(
        base_traj_dir, benchmark_full, agent_full, f"{agent_full}_on_{task_full}"
    )
    traj_path = (run_dir / f"{task_full}.{task_id}.json").resolve()

    if traj_path.exists():
        return traj_path
    raise FileNotFoundError(f"Trajectory file not found: {traj_path}")
379
+
380
+
381
def get_judgment_path(base_judgments_dir, benchmark, agent, judge, task_id):
    """
    Locate the judgment JSON file a judge produced for one task.

    Args:
        base_judgments_dir: root directory of the judgments.
        benchmark: benchmark label, a key of ``benchmarks_inverse``.
        agent: agent label, a key of ``agents_inverse``.
        judge: judge label, a key of ``judges_inverse``.
        task_id: identifier appended to the task prefix in the file name.

    Returns:
        The resolved path ``<base>/<benchmark>/<agent>/<judge>/<task>.<task_id>.json``.

    Raises:
        FileNotFoundError: if that file does not exist.
    """
    agent_full = agents_inverse[agent]
    benchmark_full = benchmarks_inverse[benchmark]
    task_full = tasks_dict[benchmark_full]
    judge_full = judges_inverse[judge]

    judge_dir = Path(base_judgments_dir, benchmark_full, agent_full, judge_full)
    judgment_path = (judge_dir / f"{task_full}.{task_id}.json").resolve()

    if judgment_path.exists():
        return judgment_path
    raise FileNotFoundError(f"Judgment file not found: {judgment_path}")
400
+
401
+
402
def list_benchmarks():
    """
    Return the benchmark labels (the values of ``benchmarks_dict``) as a list.
    """
    return [*benchmarks_dict.values()]
404
+
405
+
406
def list_agents(base_traj_dir, benchmark):
    """
    List the labels of the known agents that have a directory for *benchmark*.

    Args:
        base_traj_dir: root directory of the trajectories.
        benchmark: benchmark label, a key of ``benchmarks_inverse``.

    Returns:
        Sorted labels (values of ``agents_dict``) of the non-hidden
        sub-directories whose name is a key of ``agents_dict``.

    Raises:
        FileNotFoundError: if the benchmark directory does not exist.
    """
    benchmark_full = benchmarks_inverse[benchmark]
    traj_dir = Path(base_traj_dir, benchmark_full).resolve()
    if not traj_dir.exists():
        raise FileNotFoundError(f"Trajectory directory not found: {traj_dir}")

    agent_names = []
    for entry in traj_dir.iterdir():
        # only visible directories can be agent folders
        if not entry.is_dir() or entry.name.startswith("."):
            continue
        # show only the agents this demo knows a label for
        if entry.name in agents_dict:
            agent_names.append(agents_dict[entry.name])

    return sorted(agent_names)
423
+
424
+
425
def list_task_ids(base_traj_dir, benchmark, agent):
    """
    List the task ids that have a trajectory file for an agent/benchmark pair.

    Example directory:
    trajectories/cleaned/workarena/GenericAgent-anthropic_claude-3.7-sonnet/GenericAgent-anthropic_claude-3.7-sonnet_on_workarena.servicenow

    Args:
        base_traj_dir: root directory of the trajectories.
        benchmark: benchmark label, a key of ``benchmarks_inverse``.
        agent: agent label, a key of ``agents_inverse``.

    Returns:
        The last dot-separated component of every ``*.json`` file stem in the run
        directory, as strings. Numeric ids come first in numeric order, followed
        by the remaining ids in lexicographic order.

    Raises:
        FileNotFoundError: if the run directory does not exist.
    """
    agent_full = agents_inverse[agent]
    benchmark_full = benchmarks_inverse[benchmark]
    task_full = tasks_dict[benchmark_full]

    traj_dir = Path(
        base_traj_dir,
        benchmark_full,
        agent_full,
        f"{agent_full}_on_{task_full}",
    )
    traj_dir = traj_dir.resolve()

    if not traj_dir.exists():
        raise FileNotFoundError(f"Trajectory directory not found: {traj_dir}")

    task_ids = [f.stem.split(".")[-1] for f in traj_dir.glob("*.json")]

    def sort_key(task_id):
        # Tuples keep ints and strings from being compared with each other,
        # which would raise TypeError when both kinds of id are present.
        if task_id.isdigit():
            return (0, int(task_id), "")
        return (1, 0, task_id)

    task_ids.sort(key=sort_key)

    return task_ids
448
+
449
+
450
def get_message_from_judgment(judgment):
    """
    Extract the judge's message text from a judgment record.

    Args:
        judgment: parsed judgment data; the text is read from
            ``judgment["response"]["choices"][0]["message"]["content"]``.

    Returns:
        That content, or ``"No judgment found"`` when the lookup path is missing
        or the data does not have the expected structure.
    """
    try:
        return judgment["response"]["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        # Only lookup failures mean "no judgment"; anything else (for example
        # KeyboardInterrupt) is no longer swallowed as a bare except would.
        return "No judgment found"
456
+
457
def get_message_from_rule_based(judgment):
    """
    Turn the cumulative reward stored in a rule-based judgment into a verdict.

    Args:
        judgment: parsed judgment data; the reward is read from
            ``judgment["trajectory_info"]["summary_info"]["cum_reward"]``.

    Returns:
        ``"Success"`` when the reward is greater than 0.5, ``"Failure"`` otherwise,
        or ``"No judgment found"`` when the reward is missing or not comparable
        to a number.
    """
    try:
        reward = judgment["trajectory_info"]["summary_info"]["cum_reward"]
        return "Success" if reward > 0.5 else "Failure"
    except (KeyError, IndexError, TypeError):
        # Only lookup/comparison failures mean "no judgment"; other exceptions
        # are no longer swallowed as a bare except would.
        return "No judgment found"
465
+
466
+
467
# Data locations, relative to the current working directory.
base_traj_dir = Path("trajectories/cleaned")
base_screenshot_dir = Path("trajectories/screenshots")
base_judgments_dir = "trajectories/judgments"  # left as a plain string

# Built once at import time and passed to the overlay code for every step.
hl_action_parser = _build_highlevel_action_parser()
475
+
476
# Two-column layout: selectors on the left, trajectory and judgments on the right.
with gr.Blocks(title="AgentRewardBench Demo") as demo, gr.Row():
    with gr.Column(scale=4):
        benchmark_default = "WebArena"
        benchmark_dd = gr.Dropdown(
            label="Benchmark", choices=list_benchmarks(), value=benchmark_default
        )

        agents = list_agents(base_traj_dir, benchmark_default)
        model_dd = gr.Dropdown(label="Agent", choices=agents, value=agents[0])

        task_ids = list_task_ids(base_traj_dir, benchmark_default, agents[0])
        task_id_dd = gr.Dropdown(label="Task ID", choices=task_ids, value=task_ids[0])

        # changing the benchmark refreshes the agent choices
        @benchmark_dd.change(inputs=[benchmark_dd], outputs=[model_dd])
        def update_agents(benchmark):
            agents = list_agents(base_traj_dir, benchmark)
            return gr.Dropdown(label="Agent", choices=agents, value=agents[0])

        # changing the agent refreshes the task id choices
        @model_dd.change(inputs=[benchmark_dd, model_dd], outputs=[task_id_dd])
        def update_task_ids(benchmark, agent):
            task_ids = list_task_ids(base_traj_dir, benchmark, agent)

            return gr.Dropdown(choices=task_ids, value=task_ids[0])

    with gr.Column(scale=8):
        # re-rendered whenever the benchmark, agent or task selection changes
        @gr.render(inputs=[benchmark_dd, model_dd, task_id_dd])
        def render_trajectory(benchmark, agent, task_id):
            traj_path = get_trajectory_path(base_traj_dir, benchmark, agent, task_id)
            with open(traj_path, "rb") as f:
                traj = orjson.loads(f.read())

            goal = replace_string_content(traj["goal"])

            gr.Textbox(label="Goal", value=goal, visible=True)

            for step in traj["steps"]:
                num = step["num"]
                action = step["action"]
                reasoning = step["reasoning"]
                screenshot_path = step["screenshot_path"]

                gr.Markdown(f"# Step {num}")
                with gr.Group():
                    im = Image.open(screenshot_path)
                    im = apply_overlay_to_image(
                        im, step, highlevel_action_parser=hl_action_parser
                    )
                    # fall back to webp when the image carries no format
                    format_ = "webp" if im.format is None else im.format
                    gr.Image(im, label="Screenshot", format=format_)
                    if reasoning is not None:
                        gr.Textbox(reasoning, label="Reasoning", lines=4)
                    if action is not None:
                        gr.Textbox(action, label="Action", lines=2)

        # NOTE(review): the judge widgets are placed in the right-hand column,
        # below the trajectory -- confirm this matches the intended layout.
        # multi-choices dropdown for judges
        judge_dd = gr.Dropdown(
            label="Judges",
            choices=list(judges_dict.values()),
            multiselect=True,
            value=default_judges,
        )

        @gr.render(inputs=[benchmark_dd, model_dd, task_id_dd, judge_dd])
        def render_judge(benchmark, agent, task_id, judge_choices):
            # load judgments
            for judge in judges_dict.values():
                if judge not in judge_choices:
                    continue

                # get_judgment_path raises when the file is missing, so a judge
                # without a judgment for this task is skipped here instead of
                # aborting the render of all the other judges.
                try:
                    judgment_path = get_judgment_path(
                        base_judgments_dir, benchmark, agent, judge, task_id
                    )
                except FileNotFoundError:
                    continue

                with open(judgment_path, "rb") as f:
                    judgment = orjson.loads(f.read())
                if judge == "Rule-based":
                    msg = get_message_from_rule_based(judgment)
                else:
                    msg = get_message_from_judgment(judgment)

                gr.Textbox(label=judge, value=msg, lines=4)

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ tqdm
2
+ orjson
3
+ Pillow