M-Rique commited on
Commit
8770774
·
1 Parent(s): 64b82de

Add initial images and format

Browse files
Files changed (5) hide show
  1. app.py +160 -100
  2. e2bqwen.py +53 -28
  3. eval.py +146 -84
  4. model_replay.py +16 -21
  5. show_eval.py +90 -71
app.py CHANGED
@@ -2,25 +2,23 @@ import gradio as gr
2
  import os
3
  import json
4
  import shutil
5
- import traceback
6
  import uuid
7
- from textwrap import dedent
8
  import time
9
  from threading import Timer
10
  from huggingface_hub import upload_folder, login
11
  from e2b_desktop import Sandbox
 
 
 
 
12
 
13
- from smolagents import CodeAgent, OpenAIServerModel
14
- from smolagents.monitoring import LogLevel
15
  from smolagents.gradio_ui import GradioUI, stream_to_gradio
16
- from model_replay import FakeModelReplayLog
17
- from gradio_modal import Modal
18
 
19
- from dotenv import load_dotenv
20
 
21
  load_dotenv(override=True)
22
 
23
- from e2bqwen import QwenVLAPIModel, E2BVisionAgent
24
 
25
  E2B_API_KEY = os.getenv("E2B_API_KEY")
26
  SANDBOXES = {}
@@ -28,11 +26,11 @@ SANDBOX_METADATA = {}
28
  SANDBOX_TIMEOUT = 600
29
  WIDTH = 1024
30
  HEIGHT = 768
31
- TMP_DIR = './tmp/'
32
  if not os.path.exists(TMP_DIR):
33
  os.makedirs(TMP_DIR)
34
 
35
- hf_token = os.getenv("HF_TOKEN")
36
  login(token=hf_token)
37
 
38
  custom_css = """
@@ -152,9 +150,9 @@ custom_css = """
152
  .logo-item:hover {
153
  color: #935f06!important;
154
  }
155
- """.replace("<<WIDTH>>", str(WIDTH+15)).replace("<<HEIGHT>>", str(HEIGHT+10))
156
 
157
- footer_html="""
158
  <h3 style="text-align: center; margin-top:50px;"><i>Powered by open source:</i></h2>
159
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
160
  <div class="logo-container">
@@ -182,7 +180,7 @@ sandbox_html_template = """
182
  <img src="https://huggingface.co/datasets/mfarre/servedfiles/resolve/main/blue_screen_of_death.gif" class="bsod-image" style="display: none;"/>
183
  <img src="https://huggingface.co/datasets/m-ric/images/resolve/main/HUD_thom.png" class="sandbox-frame" />
184
  </div>
185
- """.replace("<<WIDTH>>", str(WIDTH+15)).replace("<<HEIGHT>>", str(HEIGHT+10))
186
 
187
  custom_js = """function() {
188
  document.body.classList.add('dark');
@@ -301,13 +299,12 @@ custom_js = """function() {
301
  }
302
  """
303
 
304
-
305
- def upload_to_hf_and_remove(folder_path):
306
 
307
- repo_id = "smolagents/computer-agent-logs"
 
308
  try:
309
  folder_name = os.path.basename(os.path.normpath(folder_path))
310
-
311
  # Upload the folder to Huggingface
312
  print(f"Uploading {folder_path} to {repo_id}/{folder_name}...")
313
  url = upload_folder(
@@ -315,29 +312,30 @@ def upload_to_hf_and_remove(folder_path):
315
  repo_id=repo_id,
316
  repo_type="dataset",
317
  path_in_repo=folder_name,
318
- ignore_patterns=[".git/*", ".gitignore"]
319
  )
320
-
321
  # Remove the local folder after successful upload
322
  print(f"Upload complete. Removing local folder {folder_path}...")
323
  shutil.rmtree(folder_path)
324
  print("Local folder removed successfully.")
325
-
326
  return url
327
-
328
  except Exception as e:
329
  print(f"Error during upload or cleanup: {str(e)}")
330
  raise
331
 
 
332
  def cleanup_sandboxes():
333
  """Remove sandboxes that haven't been accessed for more than 5 minutes"""
334
  current_time = time.time()
335
  sandboxes_to_remove = []
336
-
337
  for session_id, metadata in SANDBOX_METADATA.items():
338
- if current_time - metadata['last_accessed'] > SANDBOX_TIMEOUT:
339
  sandboxes_to_remove.append(session_id)
340
-
341
  for session_id in sandboxes_to_remove:
342
  if session_id in SANDBOXES:
343
  try:
@@ -345,7 +343,7 @@ def cleanup_sandboxes():
345
  data_dir = os.path.join(TMP_DIR, session_id)
346
  if os.path.exists(data_dir):
347
  upload_to_hf_and_remove(data_dir)
348
-
349
  # Close the sandbox
350
  SANDBOXES[session_id].kill()
351
  del SANDBOXES[session_id]
@@ -354,14 +352,18 @@ def cleanup_sandboxes():
354
  except Exception as e:
355
  print(f"Error cleaning up sandbox {session_id}: {str(e)}")
356
 
 
357
  def get_or_create_sandbox(session_uuid):
358
  current_time = time.time()
359
 
360
- if (session_uuid in SANDBOXES and
361
- session_uuid in SANDBOX_METADATA and
362
- current_time - SANDBOX_METADATA[session_uuid]['created_at'] < SANDBOX_TIMEOUT):
 
 
 
363
  print(f"Reusing Sandbox for {session_uuid}")
364
- SANDBOX_METADATA[session_uuid]['last_accessed'] = current_time
365
  return SANDBOXES[session_uuid]
366
 
367
  if session_uuid in SANDBOXES:
@@ -372,27 +374,38 @@ def get_or_create_sandbox(session_uuid):
372
  print(f"Error closing expired sandbox: {str(e)}")
373
 
374
  print(f"Creating new sandbox for session {session_uuid}")
375
- desktop = Sandbox(api_key=E2B_API_KEY, resolution=(WIDTH, HEIGHT), dpi=96, timeout=SANDBOX_TIMEOUT)
 
 
 
 
 
 
376
  desktop.stream.start(require_auth=True)
377
  setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
378
  desktop.commands.run(setup_cmd)
379
 
380
  SANDBOXES[session_uuid] = desktop
381
  SANDBOX_METADATA[session_uuid] = {
382
- 'created_at': current_time,
383
- 'last_accessed': current_time
384
  }
385
  return desktop
386
 
 
387
  def update_html(interactive_mode: bool, session_uuid):
388
  desktop = get_or_create_sandbox(session_uuid)
389
  auth_key = desktop.stream.get_auth_key()
390
  base_url = desktop.stream.get_url(auth_key=auth_key)
391
  stream_url = base_url if interactive_mode else f"{base_url}&view_only=true"
392
-
393
  status_class = "status-interactive" if interactive_mode else "status-view-only"
394
  status_text = "Interactive" if interactive_mode else "Agent running..."
395
- creation_time = SANDBOX_METADATA[session_uuid]['created_at'] if session_uuid in SANDBOX_METADATA else time.time()
 
 
 
 
396
 
397
  sandbox_html_content = sandbox_html_template.format(
398
  stream_url=stream_url,
@@ -406,24 +419,27 @@ def update_html(interactive_mode: bool, session_uuid):
406
  def generate_interaction_id(session_uuid):
407
  return f"{session_uuid}_{int(time.time())}"
408
 
 
409
  def chat_message_to_json(obj):
410
  """Custom JSON serializer for ChatMessage and related objects"""
411
- if hasattr(obj, '__dict__'):
412
  # Create a copy of the object's __dict__ to avoid modifying the original
413
  result = obj.__dict__.copy()
414
-
415
  # Remove the 'raw' field which may contain non-serializable data
416
- if 'raw' in result:
417
- del result['raw']
418
-
419
  # Process the content or tool_calls if they exist
420
- if 'content' in result and result['content'] is not None:
421
- if hasattr(result['content'], '__dict__'):
422
- result['content'] = chat_message_to_json(result['content'])
423
-
424
- if 'tool_calls' in result and result['tool_calls'] is not None:
425
- result['tool_calls'] = [chat_message_to_json(tc) for tc in result['tool_calls']]
426
-
 
 
427
  return result
428
  elif isinstance(obj, (list, tuple)):
429
  return [chat_message_to_json(item) for item in obj]
@@ -431,16 +447,23 @@ def chat_message_to_json(obj):
431
  return obj
432
 
433
 
434
- def save_final_status(folder, status: str, summary, error_message = None) -> None:
435
  metadata_path = os.path.join(folder, "metadata.json")
436
  output_file = open(metadata_path, "w")
437
- output_file.write(json.dumps({"status":status, "summary":summary, "error_message": error_message}, default=chat_message_to_json))
 
 
 
 
 
438
  output_file.close()
439
 
 
440
  def extract_browser_uuid(js_uuid):
441
  print(f"[BROWSER] Got browser UUID from JS: {js_uuid}")
442
  return js_uuid
443
 
 
444
  def initialize_session(request: gr.Request, interactive_mode, browser_uuid):
445
  if not browser_uuid:
446
  new_uuid = str(uuid.uuid4())
@@ -454,7 +477,7 @@ def initialize_session(request: gr.Request, interactive_mode, browser_uuid):
454
  def create_agent(data_dir, desktop):
455
  model = QwenVLAPIModel(
456
  model_id="Qwen/Qwen2.5-VL-72B-Instruct",
457
- hf_token = hf_token,
458
  )
459
 
460
  # model = OpenAIServerModel(
@@ -467,15 +490,17 @@ def create_agent(data_dir, desktop):
467
  max_steps=200,
468
  verbosity_level=2,
469
  # planning_interval=10,
470
- use_v1_prompt=True
471
  )
472
 
 
473
  def get_agent_summary_erase_images(agent):
474
  for memory_step in agent.memory.steps:
475
  if getattr(memory_step, "observations_images", None):
476
  memory_step.observations_images = None
477
  return agent.memory.get_succinct_steps()
478
 
 
479
  class EnrichedGradioUI(GradioUI):
480
  def log_user_message(self, text_input):
481
  import gradio as gr
@@ -485,7 +510,15 @@ class EnrichedGradioUI(GradioUI):
485
  gr.Button(interactive=False),
486
  )
487
 
488
- def interact_with_agent(self, task_input, stored_messages, session_state, session_uuid, consent_storage, request: gr.Request):
 
 
 
 
 
 
 
 
489
  interaction_id = generate_interaction_id(session_uuid)
490
  desktop = get_or_create_sandbox(session_uuid)
491
 
@@ -502,12 +535,30 @@ class EnrichedGradioUI(GradioUI):
502
  stored_messages.append(gr.ChatMessage(role="user", content=task_input))
503
  yield stored_messages
504
 
505
- for msg in stream_to_gradio(session_state["agent"], task=task_input, reset_agent_memory=False):
506
- if hasattr(session_state["agent"], "last_marked_screenshot") and msg.content == "-----": # Append the last screenshot before the end of step
507
- stored_messages.append(gr.ChatMessage(
508
- role="assistant",
509
- content={"path": session_state["agent"].last_marked_screenshot.to_string(), "mime_type": "image/png"},
510
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
  stored_messages.append(msg)
512
  yield stored_messages
513
 
@@ -516,37 +567,44 @@ class EnrichedGradioUI(GradioUI):
516
  # summary = get_agent_summary_erase_images(session_state["agent"])
517
  # save_final_status(data_dir, "completed", summary = summary)
518
  yield stored_messages
519
-
520
  except Exception as e:
521
- error_message=f"Error in interaction: {str(e)}"
522
  raise e
523
  print(error_message)
524
- stored_messages.append(gr.ChatMessage(role="assistant", content="Run failed:\n" + error_message))
 
 
 
 
525
  if consent_storage:
526
  summary = get_agent_summary_erase_images(session_state["agent"])
527
- save_final_status(data_dir, "failed", summary=summary, error_message=error_message)
 
 
528
  yield stored_messages
529
  finally:
530
  if consent_storage:
531
  upload_to_hf_and_remove(data_dir)
532
 
533
- theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")
 
 
 
534
 
535
  # Create a Gradio app with Blocks
536
  with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
537
- #Storing session hash in a state variable
538
  session_uuid_state = gr.State(None)
539
 
540
-
541
-
542
  with gr.Row():
543
  sandbox_html = gr.HTML(
544
  value=sandbox_html_template.format(
545
  stream_url="",
546
  status_class="status-interactive",
547
- status_text="Interactive"
548
  ),
549
- label="Output"
550
  )
551
  with gr.Sidebar(position="left"):
552
  with Modal(visible=True) as modal:
@@ -560,7 +618,7 @@ _Please note that we store the task logs by default so **do not write any person
560
  task_input = gr.Textbox(
561
  value="Find me pictures of cute puppies",
562
  label="Enter your task below:",
563
- elem_classes="primary-color-label"
564
  )
565
 
566
  run_btn = gr.Button("Let's go!", variant="primary")
@@ -575,9 +633,9 @@ _Please note that we store the task logs by default so **do not write any person
575
  "Go on the Hugging Face Hub, find the space for FLUX1.dev, then generate a picture of the Golden Gate bridge",
576
  "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
577
  ],
578
- inputs = task_input,
579
- label= "Example Tasks",
580
- examples_per_page=4
581
  )
582
 
583
  session_state = gr.State({})
@@ -585,7 +643,9 @@ _Please note that we store the task logs by default so **do not write any person
585
 
586
  minimalist_toggle = gr.Checkbox(label="Innie/Outie", value=False)
587
 
588
- consent_storage = gr.Checkbox(label="Store task and agent trace?", value=True)
 
 
589
 
590
  def apply_theme(minimalist_mode: bool):
591
  if not minimalist_mode:
@@ -631,16 +691,10 @@ _Please note that we store the task logs by default so **do not write any person
631
  # Hidden HTML element to inject CSS dynamically
632
  theme_styles = gr.HTML(apply_theme(False), visible=False)
633
  minimalist_toggle.change(
634
- fn=apply_theme,
635
- inputs=[minimalist_toggle],
636
- outputs=[theme_styles]
637
- )
638
-
639
- footer = gr.HTML(
640
- value=footer_html,
641
- label="Header"
642
  )
643
 
 
644
 
645
  chatbot_display = gr.Chatbot(
646
  elem_id="chatbot",
@@ -653,7 +707,9 @@ _Please note that we store the task logs by default so **do not write any person
653
  resizable=True,
654
  )
655
 
656
- agent_ui = EnrichedGradioUI(CodeAgent(tools=[], model=None, name="ok", description="ok"))
 
 
657
 
658
  stop_btn = gr.Button("Stop the agent!", variant="huggingface")
659
 
@@ -664,9 +720,9 @@ _Please note that we store the task logs by default so **do not write any person
664
 
665
  if not os.path.exists(log_file):
666
  return "Waiting for machine from the future to boot..."
667
-
668
  try:
669
- with open(log_file, 'r') as f:
670
  lines = f.readlines()
671
  return "".join(lines[-tail:] if len(lines) > tail else lines)
672
  except Exception as e:
@@ -685,21 +741,25 @@ _Please note that we store the task logs by default so **do not write any person
685
  is_interactive = gr.Checkbox(value=True, visible=False)
686
 
687
  # Chain the events
688
- run_event = run_btn.click(
689
- fn=clear_and_set_view_only,
690
- inputs=[task_input, session_uuid_state],
691
- outputs=[sandbox_html]
692
- ).then(
693
- agent_ui.interact_with_agent,
694
- inputs=[task_input, stored_messages, session_state, session_uuid_state, consent_storage],
695
- outputs=[chatbot_display]
696
- ).then(
697
- fn=set_interactive,
698
- inputs=[session_uuid_state],
699
- outputs=[sandbox_html]
700
- ).then(
701
- fn=reactivate_stop_btn,
702
- outputs=[stop_btn]
 
 
 
 
703
  )
704
 
705
  def interrupt_agent(session_state):
@@ -716,7 +776,7 @@ _Please note that we store the task logs by default so **do not write any person
716
 
717
  # replay_btn.click(
718
  # fn=clear_and_set_view_only,
719
- # inputs=[task_input],
720
  # outputs=[sandbox_html]
721
  # ).then(
722
  # set_logs_source,
@@ -744,4 +804,4 @@ _Please note that we store the task logs by default so **do not write any person
744
  # Launch the app
745
  if __name__ == "__main__":
746
  Timer(60, cleanup_sandboxes).start() # Run every minute
747
- demo.launch()
 
2
  import os
3
  import json
4
  import shutil
 
5
  import uuid
 
6
  import time
7
  from threading import Timer
8
  from huggingface_hub import upload_folder, login
9
  from e2b_desktop import Sandbox
10
+ from gradio_modal import Modal
11
+ from io import BytesIO
12
+ from PIL import Image
13
+ from dotenv import load_dotenv
14
 
15
+ from smolagents import CodeAgent
 
16
  from smolagents.gradio_ui import GradioUI, stream_to_gradio
 
 
17
 
18
+ from e2bqwen import QwenVLAPIModel, E2BVisionAgent
19
 
20
  load_dotenv(override=True)
21
 
 
22
 
23
  E2B_API_KEY = os.getenv("E2B_API_KEY")
24
  SANDBOXES = {}
 
26
  SANDBOX_TIMEOUT = 600
27
  WIDTH = 1024
28
  HEIGHT = 768
29
+ TMP_DIR = "./tmp/"
30
  if not os.path.exists(TMP_DIR):
31
  os.makedirs(TMP_DIR)
32
 
33
+ hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
34
  login(token=hf_token)
35
 
36
  custom_css = """
 
150
  .logo-item:hover {
151
  color: #935f06!important;
152
  }
153
+ """.replace("<<WIDTH>>", str(WIDTH + 15)).replace("<<HEIGHT>>", str(HEIGHT + 10))
154
 
155
+ footer_html = """
156
  <h3 style="text-align: center; margin-top:50px;"><i>Powered by open source:</i></h2>
157
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
158
  <div class="logo-container">
 
180
  <img src="https://huggingface.co/datasets/mfarre/servedfiles/resolve/main/blue_screen_of_death.gif" class="bsod-image" style="display: none;"/>
181
  <img src="https://huggingface.co/datasets/m-ric/images/resolve/main/HUD_thom.png" class="sandbox-frame" />
182
  </div>
183
+ """.replace("<<WIDTH>>", str(WIDTH + 15)).replace("<<HEIGHT>>", str(HEIGHT + 10))
184
 
185
  custom_js = """function() {
186
  document.body.classList.add('dark');
 
299
  }
300
  """
301
 
 
 
302
 
303
+ def upload_to_hf_and_remove(folder_path):
304
+ repo_id = "smolagents/computer-agent-logs"
305
  try:
306
  folder_name = os.path.basename(os.path.normpath(folder_path))
307
+
308
  # Upload the folder to Huggingface
309
  print(f"Uploading {folder_path} to {repo_id}/{folder_name}...")
310
  url = upload_folder(
 
312
  repo_id=repo_id,
313
  repo_type="dataset",
314
  path_in_repo=folder_name,
315
+ ignore_patterns=[".git/*", ".gitignore"],
316
  )
317
+
318
  # Remove the local folder after successful upload
319
  print(f"Upload complete. Removing local folder {folder_path}...")
320
  shutil.rmtree(folder_path)
321
  print("Local folder removed successfully.")
322
+
323
  return url
324
+
325
  except Exception as e:
326
  print(f"Error during upload or cleanup: {str(e)}")
327
  raise
328
 
329
+
330
  def cleanup_sandboxes():
331
  """Remove sandboxes that haven't been accessed for more than 5 minutes"""
332
  current_time = time.time()
333
  sandboxes_to_remove = []
334
+
335
  for session_id, metadata in SANDBOX_METADATA.items():
336
+ if current_time - metadata["last_accessed"] > SANDBOX_TIMEOUT:
337
  sandboxes_to_remove.append(session_id)
338
+
339
  for session_id in sandboxes_to_remove:
340
  if session_id in SANDBOXES:
341
  try:
 
343
  data_dir = os.path.join(TMP_DIR, session_id)
344
  if os.path.exists(data_dir):
345
  upload_to_hf_and_remove(data_dir)
346
+
347
  # Close the sandbox
348
  SANDBOXES[session_id].kill()
349
  del SANDBOXES[session_id]
 
352
  except Exception as e:
353
  print(f"Error cleaning up sandbox {session_id}: {str(e)}")
354
 
355
+
356
  def get_or_create_sandbox(session_uuid):
357
  current_time = time.time()
358
 
359
+ if (
360
+ session_uuid in SANDBOXES
361
+ and session_uuid in SANDBOX_METADATA
362
+ and current_time - SANDBOX_METADATA[session_uuid]["created_at"]
363
+ < SANDBOX_TIMEOUT
364
+ ):
365
  print(f"Reusing Sandbox for {session_uuid}")
366
+ SANDBOX_METADATA[session_uuid]["last_accessed"] = current_time
367
  return SANDBOXES[session_uuid]
368
 
369
  if session_uuid in SANDBOXES:
 
374
  print(f"Error closing expired sandbox: {str(e)}")
375
 
376
  print(f"Creating new sandbox for session {session_uuid}")
377
+ desktop = Sandbox(
378
+ api_key=E2B_API_KEY,
379
+ resolution=(WIDTH, HEIGHT),
380
+ dpi=96,
381
+ timeout=SANDBOX_TIMEOUT,
382
+ template="k0wmnzir0zuzye6dndlw",
383
+ )
384
  desktop.stream.start(require_auth=True)
385
  setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
386
  desktop.commands.run(setup_cmd)
387
 
388
  SANDBOXES[session_uuid] = desktop
389
  SANDBOX_METADATA[session_uuid] = {
390
+ "created_at": current_time,
391
+ "last_accessed": current_time,
392
  }
393
  return desktop
394
 
395
+
396
  def update_html(interactive_mode: bool, session_uuid):
397
  desktop = get_or_create_sandbox(session_uuid)
398
  auth_key = desktop.stream.get_auth_key()
399
  base_url = desktop.stream.get_url(auth_key=auth_key)
400
  stream_url = base_url if interactive_mode else f"{base_url}&view_only=true"
401
+
402
  status_class = "status-interactive" if interactive_mode else "status-view-only"
403
  status_text = "Interactive" if interactive_mode else "Agent running..."
404
+ creation_time = (
405
+ SANDBOX_METADATA[session_uuid]["created_at"]
406
+ if session_uuid in SANDBOX_METADATA
407
+ else time.time()
408
+ )
409
 
410
  sandbox_html_content = sandbox_html_template.format(
411
  stream_url=stream_url,
 
419
  def generate_interaction_id(session_uuid):
420
  return f"{session_uuid}_{int(time.time())}"
421
 
422
+
423
  def chat_message_to_json(obj):
424
  """Custom JSON serializer for ChatMessage and related objects"""
425
+ if hasattr(obj, "__dict__"):
426
  # Create a copy of the object's __dict__ to avoid modifying the original
427
  result = obj.__dict__.copy()
428
+
429
  # Remove the 'raw' field which may contain non-serializable data
430
+ if "raw" in result:
431
+ del result["raw"]
432
+
433
  # Process the content or tool_calls if they exist
434
+ if "content" in result and result["content"] is not None:
435
+ if hasattr(result["content"], "__dict__"):
436
+ result["content"] = chat_message_to_json(result["content"])
437
+
438
+ if "tool_calls" in result and result["tool_calls"] is not None:
439
+ result["tool_calls"] = [
440
+ chat_message_to_json(tc) for tc in result["tool_calls"]
441
+ ]
442
+
443
  return result
444
  elif isinstance(obj, (list, tuple)):
445
  return [chat_message_to_json(item) for item in obj]
 
447
  return obj
448
 
449
 
450
+ def save_final_status(folder, status: str, summary, error_message=None) -> None:
451
  metadata_path = os.path.join(folder, "metadata.json")
452
  output_file = open(metadata_path, "w")
453
+ output_file.write(
454
+ json.dumps(
455
+ {"status": status, "summary": summary, "error_message": error_message},
456
+ default=chat_message_to_json,
457
+ )
458
+ )
459
  output_file.close()
460
 
461
+
462
  def extract_browser_uuid(js_uuid):
463
  print(f"[BROWSER] Got browser UUID from JS: {js_uuid}")
464
  return js_uuid
465
 
466
+
467
  def initialize_session(request: gr.Request, interactive_mode, browser_uuid):
468
  if not browser_uuid:
469
  new_uuid = str(uuid.uuid4())
 
477
  def create_agent(data_dir, desktop):
478
  model = QwenVLAPIModel(
479
  model_id="Qwen/Qwen2.5-VL-72B-Instruct",
480
+ hf_token=hf_token,
481
  )
482
 
483
  # model = OpenAIServerModel(
 
490
  max_steps=200,
491
  verbosity_level=2,
492
  # planning_interval=10,
493
+ use_v1_prompt=True,
494
  )
495
 
496
+
497
  def get_agent_summary_erase_images(agent):
498
  for memory_step in agent.memory.steps:
499
  if getattr(memory_step, "observations_images", None):
500
  memory_step.observations_images = None
501
  return agent.memory.get_succinct_steps()
502
 
503
+
504
  class EnrichedGradioUI(GradioUI):
505
  def log_user_message(self, text_input):
506
  import gradio as gr
 
510
  gr.Button(interactive=False),
511
  )
512
 
513
+ def interact_with_agent(
514
+ self,
515
+ task_input,
516
+ stored_messages,
517
+ session_state,
518
+ session_uuid,
519
+ consent_storage,
520
+ request: gr.Request,
521
+ ):
522
  interaction_id = generate_interaction_id(session_uuid)
523
  desktop = get_or_create_sandbox(session_uuid)
524
 
 
535
  stored_messages.append(gr.ChatMessage(role="user", content=task_input))
536
  yield stored_messages
537
 
538
+ screenshot_bytes = session_state["agent"].desktop.screenshot(format="bytes")
539
+ initial_screenshot = Image.open(BytesIO(screenshot_bytes))
540
+
541
+ for msg in stream_to_gradio(
542
+ session_state["agent"],
543
+ task=task_input,
544
+ task_images=[initial_screenshot],
545
+ reset_agent_memory=False,
546
+ ):
547
+ if (
548
+ hasattr(session_state["agent"], "last_marked_screenshot")
549
+ and msg.content == "-----"
550
+ ): # Append the last screenshot before the end of step
551
+ stored_messages.append(
552
+ gr.ChatMessage(
553
+ role="assistant",
554
+ content={
555
+ "path": session_state[
556
+ "agent"
557
+ ].last_marked_screenshot.to_string(),
558
+ "mime_type": "image/png",
559
+ },
560
+ )
561
+ )
562
  stored_messages.append(msg)
563
  yield stored_messages
564
 
 
567
  # summary = get_agent_summary_erase_images(session_state["agent"])
568
  # save_final_status(data_dir, "completed", summary = summary)
569
  yield stored_messages
570
+
571
  except Exception as e:
572
+ error_message = f"Error in interaction: {str(e)}"
573
  raise e
574
  print(error_message)
575
+ stored_messages.append(
576
+ gr.ChatMessage(
577
+ role="assistant", content="Run failed:\n" + error_message
578
+ )
579
+ )
580
  if consent_storage:
581
  summary = get_agent_summary_erase_images(session_state["agent"])
582
+ save_final_status(
583
+ data_dir, "failed", summary=summary, error_message=error_message
584
+ )
585
  yield stored_messages
586
  finally:
587
  if consent_storage:
588
  upload_to_hf_and_remove(data_dir)
589
 
590
+
591
+ theme = gr.themes.Default(
592
+ font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue"
593
+ )
594
 
595
  # Create a Gradio app with Blocks
596
  with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
597
+ # Storing session hash in a state variable
598
  session_uuid_state = gr.State(None)
599
 
 
 
600
  with gr.Row():
601
  sandbox_html = gr.HTML(
602
  value=sandbox_html_template.format(
603
  stream_url="",
604
  status_class="status-interactive",
605
+ status_text="Interactive",
606
  ),
607
+ label="Output",
608
  )
609
  with gr.Sidebar(position="left"):
610
  with Modal(visible=True) as modal:
 
618
  task_input = gr.Textbox(
619
  value="Find me pictures of cute puppies",
620
  label="Enter your task below:",
621
+ elem_classes="primary-color-label",
622
  )
623
 
624
  run_btn = gr.Button("Let's go!", variant="primary")
 
633
  "Go on the Hugging Face Hub, find the space for FLUX1.dev, then generate a picture of the Golden Gate bridge",
634
  "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
635
  ],
636
+ inputs=task_input,
637
+ label="Example Tasks",
638
+ examples_per_page=4,
639
  )
640
 
641
  session_state = gr.State({})
 
643
 
644
  minimalist_toggle = gr.Checkbox(label="Innie/Outie", value=False)
645
 
646
+ consent_storage = gr.Checkbox(
647
+ label="Store task and agent trace?", value=True
648
+ )
649
 
650
  def apply_theme(minimalist_mode: bool):
651
  if not minimalist_mode:
 
691
  # Hidden HTML element to inject CSS dynamically
692
  theme_styles = gr.HTML(apply_theme(False), visible=False)
693
  minimalist_toggle.change(
694
+ fn=apply_theme, inputs=[minimalist_toggle], outputs=[theme_styles]
 
 
 
 
 
 
 
695
  )
696
 
697
+ footer = gr.HTML(value=footer_html, label="Header")
698
 
699
  chatbot_display = gr.Chatbot(
700
  elem_id="chatbot",
 
707
  resizable=True,
708
  )
709
 
710
+ agent_ui = EnrichedGradioUI(
711
+ CodeAgent(tools=[], model=None, name="ok", description="ok")
712
+ )
713
 
714
  stop_btn = gr.Button("Stop the agent!", variant="huggingface")
715
 
 
720
 
721
  if not os.path.exists(log_file):
722
  return "Waiting for machine from the future to boot..."
723
+
724
  try:
725
+ with open(log_file, "r") as f:
726
  lines = f.readlines()
727
  return "".join(lines[-tail:] if len(lines) > tail else lines)
728
  except Exception as e:
 
741
  is_interactive = gr.Checkbox(value=True, visible=False)
742
 
743
  # Chain the events
744
+ run_event = (
745
+ run_btn.click(
746
+ fn=clear_and_set_view_only,
747
+ inputs=[task_input, session_uuid_state],
748
+ outputs=[sandbox_html],
749
+ )
750
+ .then(
751
+ agent_ui.interact_with_agent,
752
+ inputs=[
753
+ task_input,
754
+ stored_messages,
755
+ session_state,
756
+ session_uuid_state,
757
+ consent_storage,
758
+ ],
759
+ outputs=[chatbot_display],
760
+ )
761
+ .then(fn=set_interactive, inputs=[session_uuid_state], outputs=[sandbox_html])
762
+ .then(fn=reactivate_stop_btn, outputs=[stop_btn])
763
  )
764
 
765
  def interrupt_agent(session_state):
 
776
 
777
  # replay_btn.click(
778
  # fn=clear_and_set_view_only,
779
+ # inputs=[task_input],
780
  # outputs=[sandbox_html]
781
  # ).then(
782
  # set_logs_source,
 
804
  # Launch the app
805
  if __name__ == "__main__":
806
  Timer(60, cleanup_sandboxes).start() # Run every minute
807
+ demo.launch()
e2bqwen.py CHANGED
@@ -10,7 +10,7 @@ from PIL import Image
10
 
11
  # SmolaAgents imports
12
  from smolagents import CodeAgent, tool, HfApiModel
13
- from smolagents.memory import ActionStep
14
  from smolagents.models import ChatMessage, Model
15
  from smolagents.agents import populate_template
16
  from smolagents.monitoring import LogLevel
@@ -144,6 +144,7 @@ NEVER CLICK THE WEB BROWSER ICON TO OPEN THE WEB BROWSER: use open_url
144
  </general_guidelines>
145
  """
146
 
 
147
  def draw_marker_on_image(image_copy, click_coordinates):
148
  x, y = click_coordinates
149
  draw = ImageDraw.Draw(image_copy)
@@ -152,12 +153,22 @@ def draw_marker_on_image(image_copy, click_coordinates):
152
  draw.line((x - cross_size, y, x + cross_size, y), fill="green", width=linewidth)
153
  draw.line((x, y - cross_size, x, y + cross_size), fill="green", width=linewidth)
154
  # Add a circle around it for better visibility
155
- draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="green", width=linewidth)
 
 
 
 
 
 
 
 
 
156
  return image_copy
157
 
158
 
159
  class E2BVisionAgent(CodeAgent):
160
  """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
 
161
  def __init__(
162
  self,
163
  model: HfApiModel,
@@ -168,7 +179,7 @@ class E2BVisionAgent(CodeAgent):
168
  verbosity_level: LogLevel = 2,
169
  planning_interval: int = None,
170
  use_v1_prompt: bool = False,
171
- **kwargs
172
  ):
173
  self.desktop = desktop
174
  self.data_dir = data_dir
@@ -188,10 +199,12 @@ class E2BVisionAgent(CodeAgent):
188
  model=model,
189
  max_steps=max_steps,
190
  verbosity_level=verbosity_level,
191
- planning_interval = self.planning_interval,
192
- **kwargs
193
  )
194
- self.prompt_templates["system_prompt"] = E2B_SYSTEM_PROMPT_TEMPLATE.replace("<<resolution_x>>", str(self.width)).replace("<<resolution_y>>", str(self.height))
 
 
195
 
196
  # Add screen info to state
197
  self.state["screen_width"] = self.width
@@ -203,7 +216,7 @@ class E2BVisionAgent(CodeAgent):
203
  self.step_callbacks.append(self.take_screenshot_callback)
204
 
205
  def initialize_system_prompt(self) -> str:
206
- if True:
207
  return """You are a desktop automation assistant that can control a remote desktop environment.
208
  You only have access to the following tools to interact with the desktop, no additional ones:
209
  - click(x, y): Performs a left-click at the specified coordinates
@@ -282,11 +295,14 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
282
  ),
283
  },
284
  )
285
- assert system_prompt != self.prompt_templates["system_prompt"], "Populating prompt template failed"
 
 
286
  return system_prompt
287
 
288
  def _setup_desktop_tools(self):
289
  """Register all desktop tools"""
 
290
  @tool
291
  def click(x: int, y: int) -> str:
292
  """
@@ -342,7 +358,11 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
342
  return f"Moved mouse to coordinates ({x}, {y})"
343
 
344
  def normalize_text(text):
345
- return ''.join(c for c in unicodedata.normalize('NFD', text) if not unicodedata.combining(c))
 
 
 
 
346
 
347
  @tool
348
  def type_text(text: str) -> str:
@@ -469,7 +489,6 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
469
  self.tools["drag_and_drop"] = drag_and_drop
470
  self.tools["find_on_page_ctrl_f"] = find_on_page_ctrl_f
471
 
472
-
473
  def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
474
  """Callback that takes a screenshot + memory snapshot after a step completes"""
475
  self.logger.log("Analyzing screen content...")
@@ -493,21 +512,31 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
493
  self.last_marked_screenshot = AgentImage(screenshot_path)
494
  print(f"Saved screenshot for step {current_step} to {screenshot_path}")
495
 
496
- for (
497
- previous_memory_step
498
- ) in agent.memory.steps: # Remove previous screenshots from logs for lean processing
499
  if (
500
  isinstance(previous_memory_step, ActionStep)
501
  and previous_memory_step.step_number <= current_step - 1
502
  ):
503
  previous_memory_step.observations_images = None
 
 
504
 
505
  if (
506
  isinstance(previous_memory_step, ActionStep)
507
  and previous_memory_step.step_number == current_step - 1
508
  ):
509
- if previous_memory_step.tool_calls and getattr(previous_memory_step.tool_calls[0], "arguments", None) and memory_step.tool_calls and getattr(memory_step.tool_calls[0], "arguments", None):
510
- if previous_memory_step.tool_calls[0].arguments == memory_step.tool_calls[0].arguments:
 
 
 
 
 
 
 
 
511
  memory_step.observations += "\nWARNING: You've executed the same action several times in a row. MAKE SURE TO NOT UNNECESSARILY REPEAT ACTIONS."
512
 
513
  # Add the marker-edited image to the current memory step
@@ -515,8 +544,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
515
 
516
  # memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
517
 
518
- self.click_coordinates = None # Reset click marker
519
-
520
 
521
  def close(self):
522
  """Clean up resources"""
@@ -529,9 +557,9 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
529
 
530
  class QwenVLAPIModel(Model):
531
  """Model wrapper for Qwen2.5VL API with fallback mechanism"""
532
-
533
  def __init__(
534
- self,
535
  model_id: str = "Qwen/Qwen2.5-VL-72B-Instruct",
536
  hf_token: str = None,
537
  ):
@@ -548,25 +576,22 @@ class QwenVLAPIModel(Model):
548
  token=hf_token,
549
  max_tokens=4096,
550
  )
551
-
552
  def __call__(
553
- self,
554
- messages: List[Dict[str, Any]],
555
- stop_sequences: Optional[List[str]] = None,
556
- **kwargs
557
  ) -> ChatMessage:
558
-
559
  try:
560
  message = self.base_model(messages, stop_sequences, **kwargs)
561
  return message
562
  except Exception as e:
563
- raise e
564
  print(f"Base model failed with error: {e}. Calling fallback model.")
565
-
566
  # Continue to fallback
567
  try:
568
  message = self.fallback_model(messages, stop_sequences, **kwargs)
569
  return message
570
  except Exception as e:
571
- raise e
572
  raise Exception(f"Both endpoints failed. Last error: {e}")
 
10
 
11
  # SmolaAgents imports
12
  from smolagents import CodeAgent, tool, HfApiModel
13
+ from smolagents.memory import ActionStep, TaskStep
14
  from smolagents.models import ChatMessage, Model
15
  from smolagents.agents import populate_template
16
  from smolagents.monitoring import LogLevel
 
144
  </general_guidelines>
145
  """
146
 
147
+
148
  def draw_marker_on_image(image_copy, click_coordinates):
149
  x, y = click_coordinates
150
  draw = ImageDraw.Draw(image_copy)
 
153
  draw.line((x - cross_size, y, x + cross_size, y), fill="green", width=linewidth)
154
  draw.line((x, y - cross_size, x, y + cross_size), fill="green", width=linewidth)
155
  # Add a circle around it for better visibility
156
+ draw.ellipse(
157
+ (
158
+ x - cross_size * 2,
159
+ y - cross_size * 2,
160
+ x + cross_size * 2,
161
+ y + cross_size * 2,
162
+ ),
163
+ outline="green",
164
+ width=linewidth,
165
+ )
166
  return image_copy
167
 
168
 
169
  class E2BVisionAgent(CodeAgent):
170
  """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
171
+
172
  def __init__(
173
  self,
174
  model: HfApiModel,
 
179
  verbosity_level: LogLevel = 2,
180
  planning_interval: int = None,
181
  use_v1_prompt: bool = False,
182
+ **kwargs,
183
  ):
184
  self.desktop = desktop
185
  self.data_dir = data_dir
 
199
  model=model,
200
  max_steps=max_steps,
201
  verbosity_level=verbosity_level,
202
+ planning_interval=self.planning_interval,
203
+ **kwargs,
204
  )
205
+ self.prompt_templates["system_prompt"] = E2B_SYSTEM_PROMPT_TEMPLATE.replace(
206
+ "<<resolution_x>>", str(self.width)
207
+ ).replace("<<resolution_y>>", str(self.height))
208
 
209
  # Add screen info to state
210
  self.state["screen_width"] = self.width
 
216
  self.step_callbacks.append(self.take_screenshot_callback)
217
 
218
  def initialize_system_prompt(self) -> str:
219
+ if False:
220
  return """You are a desktop automation assistant that can control a remote desktop environment.
221
  You only have access to the following tools to interact with the desktop, no additional ones:
222
  - click(x, y): Performs a left-click at the specified coordinates
 
295
  ),
296
  },
297
  )
298
+ assert system_prompt != self.prompt_templates["system_prompt"], (
299
+ "Populating prompt template failed"
300
+ )
301
  return system_prompt
302
 
303
  def _setup_desktop_tools(self):
304
  """Register all desktop tools"""
305
+
306
  @tool
307
  def click(x: int, y: int) -> str:
308
  """
 
358
  return f"Moved mouse to coordinates ({x}, {y})"
359
 
360
  def normalize_text(text):
361
+ return "".join(
362
+ c
363
+ for c in unicodedata.normalize("NFD", text)
364
+ if not unicodedata.combining(c)
365
+ )
366
 
367
  @tool
368
  def type_text(text: str) -> str:
 
489
  self.tools["drag_and_drop"] = drag_and_drop
490
  self.tools["find_on_page_ctrl_f"] = find_on_page_ctrl_f
491
 
 
492
  def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
493
  """Callback that takes a screenshot + memory snapshot after a step completes"""
494
  self.logger.log("Analyzing screen content...")
 
512
  self.last_marked_screenshot = AgentImage(screenshot_path)
513
  print(f"Saved screenshot for step {current_step} to {screenshot_path}")
514
 
515
+ for previous_memory_step in (
516
+ agent.memory.steps
517
+ ): # Remove previous screenshots from logs for lean processing
518
  if (
519
  isinstance(previous_memory_step, ActionStep)
520
  and previous_memory_step.step_number <= current_step - 1
521
  ):
522
  previous_memory_step.observations_images = None
523
+ elif isinstance(previous_memory_step, TaskStep):
524
+ previous_memory_step.observations_images = None
525
 
526
  if (
527
  isinstance(previous_memory_step, ActionStep)
528
  and previous_memory_step.step_number == current_step - 1
529
  ):
530
+ if (
531
+ previous_memory_step.tool_calls
532
+ and getattr(previous_memory_step.tool_calls[0], "arguments", None)
533
+ and memory_step.tool_calls
534
+ and getattr(memory_step.tool_calls[0], "arguments", None)
535
+ ):
536
+ if (
537
+ previous_memory_step.tool_calls[0].arguments
538
+ == memory_step.tool_calls[0].arguments
539
+ ):
540
  memory_step.observations += "\nWARNING: You've executed the same action several times in a row. MAKE SURE TO NOT UNNECESSARILY REPEAT ACTIONS."
541
 
542
  # Add the marker-edited image to the current memory step
 
544
 
545
  # memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
546
 
547
+ self.click_coordinates = None # Reset click marker
 
548
 
549
  def close(self):
550
  """Clean up resources"""
 
557
 
558
  class QwenVLAPIModel(Model):
559
  """Model wrapper for Qwen2.5VL API with fallback mechanism"""
560
+
561
  def __init__(
562
+ self,
563
  model_id: str = "Qwen/Qwen2.5-VL-72B-Instruct",
564
  hf_token: str = None,
565
  ):
 
576
  token=hf_token,
577
  max_tokens=4096,
578
  )
579
+
580
  def __call__(
581
+ self,
582
+ messages: List[Dict[str, Any]],
583
+ stop_sequences: Optional[List[str]] = None,
584
+ **kwargs,
585
  ) -> ChatMessage:
 
586
  try:
587
  message = self.base_model(messages, stop_sequences, **kwargs)
588
  return message
589
  except Exception as e:
 
590
  print(f"Base model failed with error: {e}. Calling fallback model.")
591
+
592
  # Continue to fallback
593
  try:
594
  message = self.fallback_model(messages, stop_sequences, **kwargs)
595
  return message
596
  except Exception as e:
 
597
  raise Exception(f"Both endpoints failed. Last error: {e}")
eval.py CHANGED
@@ -1,19 +1,14 @@
1
  import os
2
  import json
3
- import shutil
4
- import time
5
  import argparse
6
  import subprocess
7
- import traceback
8
  import threading
9
  import concurrent.futures
10
  from datetime import datetime
11
- from threading import Timer
12
  from e2b_desktop import Sandbox
13
  from huggingface_hub import get_token
14
-
15
- from smolagents import CodeAgent, OpenAIServerModel
16
- from smolagents.monitoring import LogLevel
17
  from e2bqwen import QwenVLAPIModel, E2BVisionAgent
18
 
19
  from dotenv import load_dotenv
@@ -27,7 +22,9 @@ try:
27
  if not HUGGINGFACE_API_KEY:
28
  HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
29
  if not HUGGINGFACE_API_KEY:
30
- raise ValueError("No Hugging Face token found. Please login with `huggingface-cli login` or set HUGGINGFACE_API_KEY environment variable")
 
 
31
  except ImportError:
32
  # Fall back if huggingface_hub is old version without get_token
33
  HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
@@ -38,24 +35,29 @@ SANDBOX_TIMEOUT = 600 # 10 minutes
38
  # Thread lock for print statements to avoid garbled output
39
  print_lock = threading.Lock()
40
 
 
41
  def thread_safe_print(*args, **kwargs):
42
  """Thread-safe print function"""
43
  with print_lock:
44
  print(*args, **kwargs)
45
 
 
46
  # Get git hash for folder naming
47
  def get_git_hash():
48
  try:
49
- result = subprocess.run(['git', 'rev-parse', '--short', 'HEAD'],
50
- stdout=subprocess.PIPE,
51
- stderr=subprocess.PIPE,
52
- text=True)
 
 
53
  if result.returncode == 0:
54
  return result.stdout.strip()
55
  return "nogit"
56
  except:
57
  return "nogit"
58
 
 
59
  def create_agent(data_dir, desktop, max_steps: int):
60
  """Create an agent with the E2B desktop sandbox"""
61
  model = QwenVLAPIModel(
@@ -75,6 +77,7 @@ def create_agent(data_dir, desktop, max_steps: int):
75
  planning_interval=10,
76
  )
77
 
 
78
  def get_agent_summary_erase_images(agent):
79
  """Get agent summary and erase images to save space"""
80
  for memory_step in agent.memory.steps:
@@ -82,82 +85,104 @@ def get_agent_summary_erase_images(agent):
82
  memory_step.observations_images = None
83
  return agent.memory.get_succinct_steps()
84
 
 
85
  def chat_message_to_json(obj):
86
  """Custom JSON serializer for ChatMessage and related objects"""
87
- if hasattr(obj, '__dict__'):
88
  # Create a copy of the object's __dict__ to avoid modifying the original
89
  result = obj.__dict__.copy()
90
-
91
  # Remove the 'raw' field which may contain non-serializable data
92
- if 'raw' in result:
93
- del result['raw']
94
-
95
  # Process the content or tool_calls if they exist
96
- if 'content' in result and result['content'] is not None:
97
- if hasattr(result['content'], '__dict__'):
98
- result['content'] = chat_message_to_json(result['content'])
99
-
100
- if 'tool_calls' in result and result['tool_calls'] is not None:
101
- result['tool_calls'] = [chat_message_to_json(tc) for tc in result['tool_calls']]
102
-
 
 
103
  return result
104
  elif isinstance(obj, (list, tuple)):
105
  return [chat_message_to_json(item) for item in obj]
106
  else:
107
  return obj
108
 
 
109
  def save_final_status(folder, status: str, summary, error_message=None) -> None:
110
  """Save metadata about the run"""
111
  metadata_path = os.path.join(folder, "metadata.json")
112
  with open(metadata_path, "w") as output_file:
113
- output_file.write(json.dumps({
114
- "status": status,
115
- "summary": summary,
116
- "error_message": error_message
117
- }, default=chat_message_to_json))
 
 
118
 
119
  def run_example_once(example_name, example_text, run_index, example_dir, max_steps):
120
  """Run a single example once and return the result"""
121
  run_dir = os.path.join(example_dir, f"run_{run_index}")
122
  os.makedirs(run_dir, exist_ok=True)
123
-
124
  # Save the example text
125
  with open(os.path.join(run_dir, "task.txt"), "w") as f:
126
  f.write(example_text)
127
-
128
  thread_safe_print(f" Starting run {run_index} for example '{example_name}'")
129
-
130
  # Create a new sandbox for this run
131
  desktop = None
132
  try:
133
  desktop = Sandbox(
134
- api_key=E2B_API_KEY,
135
- resolution=(WIDTH, HEIGHT),
136
- dpi=96,
137
- timeout=SANDBOX_TIMEOUT
 
138
  )
139
-
140
  # Initialize the desktop environment
141
  setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
142
  desktop.commands.run(setup_cmd)
143
-
144
  # Create and run the agent
145
  agent = create_agent(data_dir=run_dir, desktop=desktop, max_steps=max_steps)
 
 
 
146
  try:
147
- agent.run(task=example_text)
148
  summary = get_agent_summary_erase_images(agent)
149
  save_final_status(run_dir, "completed", summary=summary)
150
- thread_safe_print(f" ✓ Example '{example_name}' run {run_index} completed successfully")
 
 
151
  result = {"status": "completed", "run_dir": run_dir}
152
  except Exception as e:
153
  error_message = f"Error in agent execution: {str(e)}"
154
- thread_safe_print(f" ✗ Example '{example_name}' run {run_index} failed: {error_message}")
155
- summary = get_agent_summary_erase_images(agent) if hasattr(agent, 'memory') else None
156
- save_final_status(run_dir, "failed", summary=summary, error_message=error_message)
 
 
 
 
 
 
 
 
157
  result = {"status": "failed", "run_dir": run_dir, "error": error_message}
158
  except Exception as e:
159
  error_message = f"Error setting up sandbox: {str(e)}"
160
- thread_safe_print(f" ✗ Example '{example_name}' run {run_index} failed: {error_message}")
 
 
161
  save_final_status(run_dir, "failed", summary=None, error_message=error_message)
162
  result = {"status": "failed", "run_dir": run_dir, "error": error_message}
163
  finally:
@@ -167,21 +192,24 @@ def run_example_once(example_name, example_text, run_index, example_dir, max_ste
167
  desktop.kill()
168
  except:
169
  pass
170
-
171
  return result
172
 
 
173
  def run_example(example_name, example_text, num_runs, example_dir, max_steps):
174
  """Run a single example multiple times using threads for each run"""
175
  thread_safe_print(f"\nRunning example '{example_name}': '{example_text[:50]}...'")
176
-
177
  results = []
178
  with concurrent.futures.ThreadPoolExecutor(max_workers=num_runs) as executor:
179
  # Submit all runs to the executor
180
  future_to_run = {
181
- executor.submit(run_example_once, example_name, example_text, j, example_dir, max_steps): j
 
 
182
  for j in range(num_runs)
183
  }
184
-
185
  # Collect results as they complete
186
  for future in concurrent.futures.as_completed(future_to_run):
187
  run_index = future_to_run[future]
@@ -189,31 +217,34 @@ def run_example(example_name, example_text, num_runs, example_dir, max_steps):
189
  result = future.result()
190
  results.append(result)
191
  except Exception as exc:
192
- thread_safe_print(f" ✗ Run {run_index} for '{example_name}' generated an exception: {exc}")
193
- results.append({
194
- "status": "error",
195
- "run_index": run_index,
196
- "error": str(exc)
197
- })
198
-
199
  return results
200
 
 
201
  def run_evaluation(examples, num_runs, output_dir, max_parallel, max_steps):
202
  """Run each example n times and save the results"""
203
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
204
  git_hash = get_git_hash()
205
  eval_dir = os.path.join(output_dir, f"eval_{timestamp}_{git_hash}")
206
  os.makedirs(eval_dir, exist_ok=True)
207
-
208
  thread_safe_print(f"Starting evaluation. Results will be saved to: {eval_dir}")
209
- thread_safe_print(f"Will run {len(examples)} examples, {num_runs} times each, with {max_parallel} parallel examples")
210
-
 
 
211
  # Save examples to the evaluation directory
212
  with open(os.path.join(eval_dir, "examples.json"), "w") as f:
213
  json.dump(examples, f, indent=2)
214
-
215
  all_results = {}
216
-
217
  # Run examples in parallel, but limit the number of parallel examples
218
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_parallel) as executor:
219
  # Prepare the example directories first
@@ -222,66 +253,94 @@ def run_evaluation(examples, num_runs, output_dir, max_parallel, max_steps):
222
  example_dir = os.path.join(eval_dir, f"example_{example_name}")
223
  os.makedirs(example_dir, exist_ok=True)
224
  example_dirs[example_name] = example_dir
225
-
226
  # Submit all examples to the executor
227
  future_to_example = {
228
- executor.submit(run_example, example_name, example_text, num_runs, example_dirs[example_name], max_steps): example_name
 
 
 
 
 
 
 
229
  for example_name, example_text in examples.items()
230
  }
231
-
232
  # Collect results as they complete
233
  for future in concurrent.futures.as_completed(future_to_example):
234
  example_name = future_to_example[future]
235
  try:
236
  results = future.result()
237
  all_results[example_name] = results
238
-
239
  # Calculate success rate for this example
240
  success_count = sum(1 for r in results if r["status"] == "completed")
241
- thread_safe_print(f"Example '{example_name}' complete: {success_count}/{num_runs} successful runs ({success_count/num_runs*100:.1f}%)")
 
 
242
  except Exception as exc:
243
- thread_safe_print(f"Example '{example_name}' generated an exception: {exc}")
 
 
244
  all_results[example_name] = [{"status": "error", "error": str(exc)}]
245
-
246
  # Calculate overall results and success rates
247
  success_counts = {
248
  example_name: sum(1 for r in results if r["status"] == "completed")
249
  for example_name, results in all_results.items()
250
  }
251
-
252
  total_runs = sum(len(results) for results in all_results.values())
253
  total_successes = sum(success_counts.values())
254
-
255
  # Save summary to evaluation directory
256
  summary = {
257
  "total_runs": total_runs,
258
  "total_successes": total_successes,
259
  "success_rate": total_successes / total_runs if total_runs > 0 else 0,
260
  "example_success_rates": {
261
- example_name: success_counts[example_name] / len(all_results[example_name])
262
  for example_name in examples
263
- }
264
  }
265
-
266
  with open(os.path.join(eval_dir, "summary.json"), "w") as f:
267
  json.dump(summary, f, indent=2)
268
-
269
  thread_safe_print(f"\nEvaluation complete. Results saved to: {eval_dir}")
270
- thread_safe_print(f"Overall success rate: {summary['success_rate']*100:.1f}% ({total_successes}/{total_runs})")
 
 
271
  for example_name in examples:
272
  success_rate = summary["example_success_rates"][example_name] * 100
273
  thread_safe_print(f"Example '{example_name}': {success_rate:.1f}% success")
274
-
275
  return eval_dir
276
 
 
277
  def main():
278
  parser = argparse.ArgumentParser(description="Evaluate computer agent on examples")
279
- parser.add_argument("--num-runs", type=int, default=3, help="Number of runs per example")
280
- parser.add_argument("--output-dir", type=str, default="./eval_results", help="Output directory for evaluation results")
281
- parser.add_argument("--max-parallel", type=int, default=2, help="Maximum number of examples to run in parallel")
282
- parser.add_argument("--max-steps", type=int, default=200, help="Maximum number of steps in each run")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  args = parser.parse_args()
284
-
285
  # Examples from the original code
286
  examples = {
287
  "puppies": "Find me pictures of cute puppies",
@@ -293,12 +352,15 @@ def main():
293
  "flux": "Go on the Hugging Face Hub, find a Space for FLUX1.dev, and generate a picture of the Golden Gate bridge.",
294
  "hf": "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
295
  }
296
-
297
  # Create output directory if it doesn't exist
298
  os.makedirs(args.output_dir, exist_ok=True)
299
-
300
  # Run the evaluation
301
- run_evaluation(examples, args.num_runs, args.output_dir, args.max_parallel, args.max_steps)
 
 
 
302
 
303
  if __name__ == "__main__":
304
- main()
 
1
  import os
2
  import json
 
 
3
  import argparse
4
  import subprocess
 
5
  import threading
6
  import concurrent.futures
7
  from datetime import datetime
 
8
  from e2b_desktop import Sandbox
9
  from huggingface_hub import get_token
10
+ from io import BytesIO
11
+ from PIL import Image
 
12
  from e2bqwen import QwenVLAPIModel, E2BVisionAgent
13
 
14
  from dotenv import load_dotenv
 
22
  if not HUGGINGFACE_API_KEY:
23
  HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
24
  if not HUGGINGFACE_API_KEY:
25
+ raise ValueError(
26
+ "No Hugging Face token found. Please login with `huggingface-cli login` or set HUGGINGFACE_API_KEY environment variable"
27
+ )
28
  except ImportError:
29
  # Fall back if huggingface_hub is old version without get_token
30
  HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
 
35
  # Thread lock for print statements to avoid garbled output
36
  print_lock = threading.Lock()
37
 
38
+
39
  def thread_safe_print(*args, **kwargs):
40
  """Thread-safe print function"""
41
  with print_lock:
42
  print(*args, **kwargs)
43
 
44
+
45
  # Get git hash for folder naming
46
  def get_git_hash():
47
  try:
48
+ result = subprocess.run(
49
+ ["git", "rev-parse", "--short", "HEAD"],
50
+ stdout=subprocess.PIPE,
51
+ stderr=subprocess.PIPE,
52
+ text=True,
53
+ )
54
  if result.returncode == 0:
55
  return result.stdout.strip()
56
  return "nogit"
57
  except:
58
  return "nogit"
59
 
60
+
61
  def create_agent(data_dir, desktop, max_steps: int):
62
  """Create an agent with the E2B desktop sandbox"""
63
  model = QwenVLAPIModel(
 
77
  planning_interval=10,
78
  )
79
 
80
+
81
  def get_agent_summary_erase_images(agent):
82
  """Get agent summary and erase images to save space"""
83
  for memory_step in agent.memory.steps:
 
85
  memory_step.observations_images = None
86
  return agent.memory.get_succinct_steps()
87
 
88
+
89
  def chat_message_to_json(obj):
90
  """Custom JSON serializer for ChatMessage and related objects"""
91
+ if hasattr(obj, "__dict__"):
92
  # Create a copy of the object's __dict__ to avoid modifying the original
93
  result = obj.__dict__.copy()
94
+
95
  # Remove the 'raw' field which may contain non-serializable data
96
+ if "raw" in result:
97
+ del result["raw"]
98
+
99
  # Process the content or tool_calls if they exist
100
+ if "content" in result and result["content"] is not None:
101
+ if hasattr(result["content"], "__dict__"):
102
+ result["content"] = chat_message_to_json(result["content"])
103
+
104
+ if "tool_calls" in result and result["tool_calls"] is not None:
105
+ result["tool_calls"] = [
106
+ chat_message_to_json(tc) for tc in result["tool_calls"]
107
+ ]
108
+
109
  return result
110
  elif isinstance(obj, (list, tuple)):
111
  return [chat_message_to_json(item) for item in obj]
112
  else:
113
  return obj
114
 
115
+
116
  def save_final_status(folder, status: str, summary, error_message=None) -> None:
117
  """Save metadata about the run"""
118
  metadata_path = os.path.join(folder, "metadata.json")
119
  with open(metadata_path, "w") as output_file:
120
+ output_file.write(
121
+ json.dumps(
122
+ {"status": status, "summary": summary, "error_message": error_message},
123
+ default=chat_message_to_json,
124
+ )
125
+ )
126
+
127
 
128
  def run_example_once(example_name, example_text, run_index, example_dir, max_steps):
129
  """Run a single example once and return the result"""
130
  run_dir = os.path.join(example_dir, f"run_{run_index}")
131
  os.makedirs(run_dir, exist_ok=True)
132
+
133
  # Save the example text
134
  with open(os.path.join(run_dir, "task.txt"), "w") as f:
135
  f.write(example_text)
136
+
137
  thread_safe_print(f" Starting run {run_index} for example '{example_name}'")
138
+
139
  # Create a new sandbox for this run
140
  desktop = None
141
  try:
142
  desktop = Sandbox(
143
+ api_key=E2B_API_KEY,
144
+ resolution=(WIDTH, HEIGHT),
145
+ dpi=96,
146
+ timeout=SANDBOX_TIMEOUT,
147
+ template="k0wmnzir0zuzye6dndlw",
148
  )
149
+
150
  # Initialize the desktop environment
151
  setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
152
  desktop.commands.run(setup_cmd)
153
+
154
  # Create and run the agent
155
  agent = create_agent(data_dir=run_dir, desktop=desktop, max_steps=max_steps)
156
+
157
+ screenshot_bytes = desktop.screenshot(format="bytes")
158
+ initial_screenshot = Image.open(BytesIO(screenshot_bytes))
159
  try:
160
+ agent.run(task=example_text, images=[initial_screenshot])
161
  summary = get_agent_summary_erase_images(agent)
162
  save_final_status(run_dir, "completed", summary=summary)
163
+ thread_safe_print(
164
+ f" ✓ Example '{example_name}' run {run_index} completed successfully"
165
+ )
166
  result = {"status": "completed", "run_dir": run_dir}
167
  except Exception as e:
168
  error_message = f"Error in agent execution: {str(e)}"
169
+ thread_safe_print(
170
+ f" ✗ Example '{example_name}' run {run_index} failed: {error_message}"
171
+ )
172
+ summary = (
173
+ get_agent_summary_erase_images(agent)
174
+ if hasattr(agent, "memory")
175
+ else None
176
+ )
177
+ save_final_status(
178
+ run_dir, "failed", summary=summary, error_message=error_message
179
+ )
180
  result = {"status": "failed", "run_dir": run_dir, "error": error_message}
181
  except Exception as e:
182
  error_message = f"Error setting up sandbox: {str(e)}"
183
+ thread_safe_print(
184
+ f" ✗ Example '{example_name}' run {run_index} failed: {error_message}"
185
+ )
186
  save_final_status(run_dir, "failed", summary=None, error_message=error_message)
187
  result = {"status": "failed", "run_dir": run_dir, "error": error_message}
188
  finally:
 
192
  desktop.kill()
193
  except:
194
  pass
195
+
196
  return result
197
 
198
+
199
  def run_example(example_name, example_text, num_runs, example_dir, max_steps):
200
  """Run a single example multiple times using threads for each run"""
201
  thread_safe_print(f"\nRunning example '{example_name}': '{example_text[:50]}...'")
202
+
203
  results = []
204
  with concurrent.futures.ThreadPoolExecutor(max_workers=num_runs) as executor:
205
  # Submit all runs to the executor
206
  future_to_run = {
207
+ executor.submit(
208
+ run_example_once, example_name, example_text, j, example_dir, max_steps
209
+ ): j
210
  for j in range(num_runs)
211
  }
212
+
213
  # Collect results as they complete
214
  for future in concurrent.futures.as_completed(future_to_run):
215
  run_index = future_to_run[future]
 
217
  result = future.result()
218
  results.append(result)
219
  except Exception as exc:
220
+ thread_safe_print(
221
+ f" ✗ Run {run_index} for '{example_name}' generated an exception: {exc}"
222
+ )
223
+ results.append(
224
+ {"status": "error", "run_index": run_index, "error": str(exc)}
225
+ )
226
+
227
  return results
228
 
229
+
230
  def run_evaluation(examples, num_runs, output_dir, max_parallel, max_steps):
231
  """Run each example n times and save the results"""
232
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
233
  git_hash = get_git_hash()
234
  eval_dir = os.path.join(output_dir, f"eval_{timestamp}_{git_hash}")
235
  os.makedirs(eval_dir, exist_ok=True)
236
+
237
  thread_safe_print(f"Starting evaluation. Results will be saved to: {eval_dir}")
238
+ thread_safe_print(
239
+ f"Will run {len(examples)} examples, {num_runs} times each, with {max_parallel} parallel examples"
240
+ )
241
+
242
  # Save examples to the evaluation directory
243
  with open(os.path.join(eval_dir, "examples.json"), "w") as f:
244
  json.dump(examples, f, indent=2)
245
+
246
  all_results = {}
247
+
248
  # Run examples in parallel, but limit the number of parallel examples
249
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_parallel) as executor:
250
  # Prepare the example directories first
 
253
  example_dir = os.path.join(eval_dir, f"example_{example_name}")
254
  os.makedirs(example_dir, exist_ok=True)
255
  example_dirs[example_name] = example_dir
256
+
257
  # Submit all examples to the executor
258
  future_to_example = {
259
+ executor.submit(
260
+ run_example,
261
+ example_name,
262
+ example_text,
263
+ num_runs,
264
+ example_dirs[example_name],
265
+ max_steps,
266
+ ): example_name
267
  for example_name, example_text in examples.items()
268
  }
269
+
270
  # Collect results as they complete
271
  for future in concurrent.futures.as_completed(future_to_example):
272
  example_name = future_to_example[future]
273
  try:
274
  results = future.result()
275
  all_results[example_name] = results
276
+
277
  # Calculate success rate for this example
278
  success_count = sum(1 for r in results if r["status"] == "completed")
279
+ thread_safe_print(
280
+ f"Example '{example_name}' complete: {success_count}/{num_runs} successful runs ({success_count / num_runs * 100:.1f}%)"
281
+ )
282
  except Exception as exc:
283
+ thread_safe_print(
284
+ f"Example '{example_name}' generated an exception: {exc}"
285
+ )
286
  all_results[example_name] = [{"status": "error", "error": str(exc)}]
287
+
288
  # Calculate overall results and success rates
289
  success_counts = {
290
  example_name: sum(1 for r in results if r["status"] == "completed")
291
  for example_name, results in all_results.items()
292
  }
293
+
294
  total_runs = sum(len(results) for results in all_results.values())
295
  total_successes = sum(success_counts.values())
296
+
297
  # Save summary to evaluation directory
298
  summary = {
299
  "total_runs": total_runs,
300
  "total_successes": total_successes,
301
  "success_rate": total_successes / total_runs if total_runs > 0 else 0,
302
  "example_success_rates": {
303
+ example_name: success_counts[example_name] / len(all_results[example_name])
304
  for example_name in examples
305
+ },
306
  }
307
+
308
  with open(os.path.join(eval_dir, "summary.json"), "w") as f:
309
  json.dump(summary, f, indent=2)
310
+
311
  thread_safe_print(f"\nEvaluation complete. Results saved to: {eval_dir}")
312
+ thread_safe_print(
313
+ f"Overall success rate: {summary['success_rate'] * 100:.1f}% ({total_successes}/{total_runs})"
314
+ )
315
  for example_name in examples:
316
  success_rate = summary["example_success_rates"][example_name] * 100
317
  thread_safe_print(f"Example '{example_name}': {success_rate:.1f}% success")
318
+
319
  return eval_dir
320
 
321
+
322
  def main():
323
  parser = argparse.ArgumentParser(description="Evaluate computer agent on examples")
324
+ parser.add_argument(
325
+ "--num-runs", type=int, default=3, help="Number of runs per example"
326
+ )
327
+ parser.add_argument(
328
+ "--output-dir",
329
+ type=str,
330
+ default="./eval_results",
331
+ help="Output directory for evaluation results",
332
+ )
333
+ parser.add_argument(
334
+ "--max-parallel",
335
+ type=int,
336
+ default=2,
337
+ help="Maximum number of examples to run in parallel",
338
+ )
339
+ parser.add_argument(
340
+ "--max-steps", type=int, default=200, help="Maximum number of steps in each run"
341
+ )
342
  args = parser.parse_args()
343
+
344
  # Examples from the original code
345
  examples = {
346
  "puppies": "Find me pictures of cute puppies",
 
352
  "flux": "Go on the Hugging Face Hub, find a Space for FLUX1.dev, and generate a picture of the Golden Gate bridge.",
353
  "hf": "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
354
  }
355
+
356
  # Create output directory if it doesn't exist
357
  os.makedirs(args.output_dir, exist_ok=True)
358
+
359
  # Run the evaluation
360
+ run_evaluation(
361
+ examples, args.num_runs, args.output_dir, args.max_parallel, args.max_steps
362
+ )
363
+
364
 
365
  if __name__ == "__main__":
366
+ main()
model_replay.py CHANGED
@@ -7,43 +7,39 @@ import json
7
 
8
  class FakeModelReplayLog(Model):
9
  """A model class that returns pre-recorded responses from a log file.
10
-
11
  This class is useful for testing and debugging purposes, as it doesn't make
12
  actual API calls but instead returns responses from a pre-recorded log file.
13
-
14
  Parameters:
15
- log_url (str, optional):
16
  URL to the log file. Defaults to the smolagents example log.
17
  **kwargs: Additional keyword arguments passed to the Model base class.
18
  """
19
-
20
- def __init__(
21
- self,
22
- log_folder: str,
23
- **kwargs
24
- ):
25
  super().__init__(**kwargs)
26
  self.dataset_name = "smolagents/computer-agent-logs"
27
  self.log_folder = log_folder
28
  self.call_counter = 0
29
  self.model_outputs = self._load_model_outputs()
30
-
31
  def _load_model_outputs(self) -> List[str]:
32
  """Load model outputs from the log file using HuggingFace datasets library."""
33
  # Download the file from Hugging Face Hub
34
  file_path = hf_hub_download(
35
  repo_id=self.dataset_name,
36
  filename=self.log_folder + "/metadata.json",
37
- repo_type="dataset"
38
  )
39
-
40
  # Load and parse the JSON data
41
- with open(file_path, 'r') as f:
42
  log_data = json.load(f)
43
-
44
  # Extract only the model_output from each step in tool_calls
45
  model_outputs = []
46
-
47
  for step in log_data["summary"][1:]:
48
  model_outputs.append(step["model_output_message"]["content"])
49
 
@@ -56,17 +52,17 @@ class FakeModelReplayLog(Model):
56
  stop_sequences: Optional[List[str]] = None,
57
  grammar: Optional[str] = None,
58
  tools_to_call_from: Optional[List[Tool]] = None,
59
- **kwargs
60
  ) -> ChatMessage:
61
  """Return the next pre-recorded response from the log file.
62
-
63
  Parameters:
64
  messages: List of input messages (ignored).
65
  stop_sequences: Optional list of stop sequences (ignored).
66
  grammar: Optional grammar specification (ignored).
67
  tools_to_call_from: Optional list of tools (ignored).
68
  **kwargs: Additional keyword arguments (ignored).
69
-
70
  Returns:
71
  ChatMessage: The next pre-recorded response.
72
  """
@@ -82,12 +78,11 @@ class FakeModelReplayLog(Model):
82
  # Token counts are simulated
83
  self.last_input_token_count = len(str(messages)) // 4 # Rough approximation
84
  self.last_output_token_count = len(content) // 4 # Rough approximation
85
-
86
  # Create and return a ChatMessage
87
  return ChatMessage(
88
  role=MessageRole.ASSISTANT,
89
  content=content,
90
  tool_calls=None,
91
- raw={"source": "pre-recorded log", "call_number": self.call_counter}
92
  )
93
-
 
7
 
8
  class FakeModelReplayLog(Model):
9
  """A model class that returns pre-recorded responses from a log file.
10
+
11
  This class is useful for testing and debugging purposes, as it doesn't make
12
  actual API calls but instead returns responses from a pre-recorded log file.
13
+
14
  Parameters:
15
+ log_url (str, optional):
16
  URL to the log file. Defaults to the smolagents example log.
17
  **kwargs: Additional keyword arguments passed to the Model base class.
18
  """
19
+
20
def __init__(self, log_folder: str, **kwargs):
    """Initialize the replay model.

    Parameters:
        log_folder: Folder inside the logs dataset that holds metadata.json.
        **kwargs: Forwarded unchanged to the Model base class.
    """
    super().__init__(**kwargs)
    self.log_folder = log_folder
    self.dataset_name = "smolagents/computer-agent-logs"
    self.call_counter = 0
    # Eagerly fetch every recorded model output so replay needs no further I/O.
    self.model_outputs = self._load_model_outputs()
26
+
27
  def _load_model_outputs(self) -> List[str]:
28
  """Load model outputs from the log file using HuggingFace datasets library."""
29
  # Download the file from Hugging Face Hub
30
  file_path = hf_hub_download(
31
  repo_id=self.dataset_name,
32
  filename=self.log_folder + "/metadata.json",
33
+ repo_type="dataset",
34
  )
35
+
36
  # Load and parse the JSON data
37
+ with open(file_path, "r") as f:
38
  log_data = json.load(f)
39
+
40
  # Extract only the model_output from each step in tool_calls
41
  model_outputs = []
42
+
43
  for step in log_data["summary"][1:]:
44
  model_outputs.append(step["model_output_message"]["content"])
45
 
 
52
  stop_sequences: Optional[List[str]] = None,
53
  grammar: Optional[str] = None,
54
  tools_to_call_from: Optional[List[Tool]] = None,
55
+ **kwargs,
56
  ) -> ChatMessage:
57
  """Return the next pre-recorded response from the log file.
58
+
59
  Parameters:
60
  messages: List of input messages (ignored).
61
  stop_sequences: Optional list of stop sequences (ignored).
62
  grammar: Optional grammar specification (ignored).
63
  tools_to_call_from: Optional list of tools (ignored).
64
  **kwargs: Additional keyword arguments (ignored).
65
+
66
  Returns:
67
  ChatMessage: The next pre-recorded response.
68
  """
 
78
  # Token counts are simulated
79
  self.last_input_token_count = len(str(messages)) // 4 # Rough approximation
80
  self.last_output_token_count = len(content) // 4 # Rough approximation
81
+
82
  # Create and return a ChatMessage
83
  return ChatMessage(
84
  role=MessageRole.ASSISTANT,
85
  content=content,
86
  tool_calls=None,
87
+ raw={"source": "pre-recorded log", "call_number": self.call_counter},
88
  )
 
show_eval.py CHANGED
@@ -8,136 +8,153 @@ from flask_cors import CORS
8
  app = Flask(__name__)
9
  CORS(app) # Enable CORS for all routes
10
 
 
11
  # Serve the HTML viewer
12
- @app.route('/')
13
  def index():
14
- return render_template('viewer.html')
 
15
 
16
  # Get list of available evaluations
17
- @app.route('/api/evals')
18
  def list_evals():
19
- base_dir = request.args.get('path', './eval_results')
20
  if not os.path.exists(base_dir):
21
  return jsonify({"error": f"Path {base_dir} does not exist"}), 404
22
-
23
  eval_dirs = []
24
  for item in os.listdir(base_dir):
25
  full_path = os.path.join(base_dir, item)
26
- if os.path.isdir(full_path) and item.startswith('eval_'):
27
  eval_dirs.append(item)
28
-
29
  return jsonify(eval_dirs)
30
 
 
31
  # Get examples for an evaluation
32
- @app.route('/api/eval/<eval_id>/examples')
33
  def get_examples(eval_id):
34
- base_dir = request.args.get('path', './eval_results')
35
  eval_path = os.path.join(base_dir, eval_id)
36
-
37
  # Try to read examples.json
38
- examples_json_path = os.path.join(eval_path, 'examples.json')
39
  examples = {}
40
-
41
  if os.path.exists(examples_json_path):
42
  try:
43
- with open(examples_json_path, 'r') as f:
44
  examples = json.load(f)
45
  except json.JSONDecodeError:
46
  app.logger.error(f"Error parsing examples.json at {examples_json_path}")
47
-
48
  # If examples.json doesn't exist or is empty, scan for example directories
49
  if not examples:
50
  for item in os.listdir(eval_path):
51
- if os.path.isdir(os.path.join(eval_path, item)) and item.startswith('example_'):
52
- example_id = item.replace('example_', '')
 
 
53
  example_dir = os.path.join(eval_path, item)
54
-
55
  # Find the first run and read task.txt
56
  run_dirs = []
57
  for run_item in os.listdir(example_dir):
58
  run_path = os.path.join(example_dir, run_item)
59
- if os.path.isdir(run_path) and run_item.startswith('run_'):
60
  run_dirs.append(run_item)
61
-
62
  if run_dirs:
63
- task_path = os.path.join(example_dir, run_dirs[0], 'task.txt')
64
  if os.path.exists(task_path):
65
- with open(task_path, 'r') as f:
66
  examples[example_id] = f.read().strip()
67
  else:
68
  # If no task.txt, try reading from metadata.json
69
- metadata_path = os.path.join(example_dir, run_dirs[0], 'metadata.json')
 
 
70
  if os.path.exists(metadata_path):
71
  try:
72
- with open(metadata_path, 'r') as f:
73
  metadata = json.load(f)
74
  # Look for task in summary[0].task
75
- if 'summary' in metadata and metadata['summary'] and 'task' in metadata['summary'][0]:
76
- examples[example_id] = metadata['summary'][0]['task']
 
 
 
 
 
 
77
  except:
78
  # Default to directory name if all else fails
79
  examples[example_id] = f"Task for {example_id}"
80
  else:
81
  examples[example_id] = f"Task for {example_id}"
82
-
83
  return jsonify(examples)
84
 
 
85
  # Get runs for an example
86
- @app.route('/api/eval/<eval_id>/example/<example_id>/runs')
87
  def get_runs(eval_id, example_id):
88
- base_dir = request.args.get('path', './eval_results')
89
- example_dir = os.path.join(base_dir, eval_id, f'example_{example_id}')
90
-
91
  if not os.path.exists(example_dir):
92
  return jsonify({"error": f"Example directory not found: {example_dir}"}), 404
93
-
94
  runs = []
95
  for item in os.listdir(example_dir):
96
  item_path = os.path.join(example_dir, item)
97
- if os.path.isdir(item_path) and item.startswith('run_'):
98
  run_id = item
99
-
100
  # Try to get status from metadata.json
101
- metadata_path = os.path.join(item_path, 'metadata.json')
102
- status = 'unknown'
103
-
104
  if os.path.exists(metadata_path):
105
  try:
106
- with open(metadata_path, 'r') as f:
107
  metadata = json.load(f)
108
- status = metadata.get('status', 'unknown')
109
  except Exception as e:
110
- app.logger.error(f"Error reading metadata.json for {run_id}: {str(e)}")
111
-
112
- runs.append({'id': run_id, 'status': status})
 
 
113
  app.logger.info(f"runs: {runs}")
114
 
115
  return jsonify(runs)
116
 
 
117
  # Get metadata for a run
118
- @app.route('/api/eval/<eval_id>/example/<example_id>/run/<run_id>/metadata')
119
  def get_metadata(eval_id, example_id, run_id):
120
- base_dir = request.args.get('path', './eval_results')
121
- run_dir = os.path.join(base_dir, eval_id, f'example_{example_id}', run_id)
122
- metadata_path = os.path.join(run_dir, 'metadata.json')
123
  app.logger.info(f"metadata: {metadata_path}")
124
 
125
  if not os.path.exists(metadata_path):
126
  return jsonify({"error": "Metadata not found", "path": metadata_path}), 404
127
-
128
  try:
129
- with open(metadata_path, 'r') as f:
130
  metadata_content = f.read()
131
  if not metadata_content.strip():
132
  return jsonify({"error": "Metadata file is empty"}), 404
133
-
134
  metadata = json.loads(metadata_content)
135
  return jsonify(metadata)
136
  except json.JSONDecodeError as e:
137
  error_info = {
138
  "error": "Invalid JSON in metadata file",
139
  "details": str(e),
140
- "path": metadata_path
141
  }
142
  app.logger.error(f"JSON error in {metadata_path}: {str(e)}")
143
  return jsonify(error_info), 400
@@ -146,54 +163,56 @@ def get_metadata(eval_id, example_id, run_id):
146
  "error": "Error reading metadata file",
147
  "details": str(e),
148
  "traceback": traceback.format_exc(),
149
- "path": metadata_path
150
  }
151
  app.logger.error(f"Error reading {metadata_path}: {str(e)}")
152
  return jsonify(error_info), 500
153
 
 
154
  # Get screenshots for a run
155
- @app.route('/api/eval/<eval_id>/example/<example_id>/run/<run_id>/screenshots')
156
  def get_screenshots(eval_id, example_id, run_id):
157
- base_dir = request.args.get('path', './eval_results')
158
- run_dir = os.path.join(base_dir, eval_id, f'example_{example_id}', run_id)
159
-
160
  if not os.path.exists(run_dir):
161
  return jsonify({"error": f"Run directory not found: {run_dir}"}), 404
162
-
163
  screenshots = []
164
- for ext in ['png', 'jpg', 'jpeg']:
165
- pattern = os.path.join(run_dir, f'*.{ext}')
166
  for file_path in glob.glob(pattern):
167
  filename = os.path.basename(file_path)
168
- screenshots.append({
169
- 'name': filename,
170
- 'path': f'/api/image?path={file_path}'
171
- })
172
-
173
  # Sort by filename
174
- screenshots.sort(key=lambda x: x['name'])
175
 
176
  app.logger.info(f"screenshots: {screenshots}")
177
-
178
  return jsonify(screenshots)
179
 
 
180
  # Serve an image file
181
- @app.route('/api/image')
182
  def get_image():
183
- path = request.args.get('path')
184
  if not path:
185
  return jsonify({"error": "No path provided"}), 400
186
-
187
  if not os.path.exists(path):
188
  return jsonify({"error": f"Image not found at path: {path}"}), 404
189
-
190
  try:
191
  return send_file(path)
192
  except Exception as e:
193
  return jsonify({"error": f"Error serving image: {str(e)}"}), 500
194
 
195
- if __name__ == '__main__':
 
196
  print("Evaluation Server is running at http://localhost:8000")
197
  print("Press Ctrl+C to stop the server")
198
-
199
- app.run(debug=True, port=8000)
 
8
  app = Flask(__name__)
9
  CORS(app) # Enable CORS for all routes
10
 
11
+
12
# Serve the HTML viewer
@app.route("/")
def index():
    """Render the single-page evaluation viewer."""
    return render_template("viewer.html")
+
17
 
18
# Get list of available evaluations
@app.route("/api/evals")
def list_evals():
    """Return the names of eval_* directories under the requested base path.

    Responds 404 when the base path does not exist.
    """
    base_dir = request.args.get("path", "./eval_results")
    if not os.path.exists(base_dir):
        return jsonify({"error": f"Path {base_dir} does not exist"}), 404

    eval_dirs = [
        entry
        for entry in os.listdir(base_dir)
        if entry.startswith("eval_") and os.path.isdir(os.path.join(base_dir, entry))
    ]

    return jsonify(eval_dirs)
 
33
+
34
# Get examples for an evaluation
@app.route("/api/eval/<eval_id>/examples")
def get_examples(eval_id):
    """Return {example_id: task description} for one evaluation.

    Prefers examples.json at the eval root; otherwise scans example_*
    directories and reads each example's first run's task.txt, falling back
    to metadata.json, then to a placeholder string.
    """
    base_dir = request.args.get("path", "./eval_results")
    eval_path = os.path.join(base_dir, eval_id)

    # Try to read examples.json
    examples_json_path = os.path.join(eval_path, "examples.json")
    examples = {}

    if os.path.exists(examples_json_path):
        try:
            with open(examples_json_path, "r") as f:
                examples = json.load(f)
        except json.JSONDecodeError:
            app.logger.error(f"Error parsing examples.json at {examples_json_path}")

    # If examples.json doesn't exist or is empty, scan for example directories
    if not examples:
        for item in os.listdir(eval_path):
            if os.path.isdir(os.path.join(eval_path, item)) and item.startswith(
                "example_"
            ):
                example_id = item.replace("example_", "")
                example_dir = os.path.join(eval_path, item)

                # Find the first run and read task.txt
                run_dirs = []
                for run_item in os.listdir(example_dir):
                    run_path = os.path.join(example_dir, run_item)
                    if os.path.isdir(run_path) and run_item.startswith("run_"):
                        run_dirs.append(run_item)
                # os.listdir order is arbitrary; sort so "first run" is deterministic.
                run_dirs.sort()

                if run_dirs:
                    task_path = os.path.join(example_dir, run_dirs[0], "task.txt")
                    if os.path.exists(task_path):
                        with open(task_path, "r") as f:
                            examples[example_id] = f.read().strip()
                    else:
                        # If no task.txt, try reading from metadata.json
                        metadata_path = os.path.join(
                            example_dir, run_dirs[0], "metadata.json"
                        )
                        if os.path.exists(metadata_path):
                            try:
                                with open(metadata_path, "r") as f:
                                    metadata = json.load(f)
                                # Look for task in summary[0].task
                                if (
                                    "summary" in metadata
                                    and metadata["summary"]
                                    and "task" in metadata["summary"][0]
                                ):
                                    examples[example_id] = metadata["summary"][0][
                                        "task"
                                    ]
                            # Was a bare `except:` — that also swallows SystemExit /
                            # KeyboardInterrupt; catch Exception and fall back.
                            except Exception:
                                # Default to directory name if all else fails
                                examples[example_id] = f"Task for {example_id}"
                        else:
                            examples[example_id] = f"Task for {example_id}"

    return jsonify(examples)
 
98
+
99
# Get runs for an example
@app.route("/api/eval/<eval_id>/example/<example_id>/runs")
def get_runs(eval_id, example_id):
    """List run_* directories of one example with each run's recorded status."""
    base_dir = request.args.get("path", "./eval_results")
    example_dir = os.path.join(base_dir, eval_id, f"example_{example_id}")

    if not os.path.exists(example_dir):
        return jsonify({"error": f"Example directory not found: {example_dir}"}), 404

    runs = []
    for entry in os.listdir(example_dir):
        entry_path = os.path.join(example_dir, entry)
        if not (os.path.isdir(entry_path) and entry.startswith("run_")):
            continue

        # Status comes from the run's metadata.json when readable.
        status = "unknown"
        metadata_file = os.path.join(entry_path, "metadata.json")
        if os.path.exists(metadata_file):
            try:
                with open(metadata_file, "r") as fh:
                    status = json.load(fh).get("status", "unknown")
            except Exception as exc:
                app.logger.error(
                    f"Error reading metadata.json for {entry}: {str(exc)}"
                )

        runs.append({"id": entry, "status": status})
    app.logger.info(f"runs: {runs}")

    return jsonify(runs)
 
133
+
134
  # Get metadata for a run
135
+ @app.route("/api/eval/<eval_id>/example/<example_id>/run/<run_id>/metadata")
136
  def get_metadata(eval_id, example_id, run_id):
137
+ base_dir = request.args.get("path", "./eval_results")
138
+ run_dir = os.path.join(base_dir, eval_id, f"example_{example_id}", run_id)
139
+ metadata_path = os.path.join(run_dir, "metadata.json")
140
  app.logger.info(f"metadata: {metadata_path}")
141
 
142
  if not os.path.exists(metadata_path):
143
  return jsonify({"error": "Metadata not found", "path": metadata_path}), 404
144
+
145
  try:
146
+ with open(metadata_path, "r") as f:
147
  metadata_content = f.read()
148
  if not metadata_content.strip():
149
  return jsonify({"error": "Metadata file is empty"}), 404
150
+
151
  metadata = json.loads(metadata_content)
152
  return jsonify(metadata)
153
  except json.JSONDecodeError as e:
154
  error_info = {
155
  "error": "Invalid JSON in metadata file",
156
  "details": str(e),
157
+ "path": metadata_path,
158
  }
159
  app.logger.error(f"JSON error in {metadata_path}: {str(e)}")
160
  return jsonify(error_info), 400
 
163
  "error": "Error reading metadata file",
164
  "details": str(e),
165
  "traceback": traceback.format_exc(),
166
+ "path": metadata_path,
167
  }
168
  app.logger.error(f"Error reading {metadata_path}: {str(e)}")
169
  return jsonify(error_info), 500
170
 
171
+
172
# Get screenshots for a run
@app.route("/api/eval/<eval_id>/example/<example_id>/run/<run_id>/screenshots")
def get_screenshots(eval_id, example_id, run_id):
    """List image files (png/jpg/jpeg) in a run directory, sorted by filename."""
    base_dir = request.args.get("path", "./eval_results")
    run_dir = os.path.join(base_dir, eval_id, f"example_{example_id}", run_id)

    if not os.path.exists(run_dir):
        return jsonify({"error": f"Run directory not found: {run_dir}"}), 404

    image_paths = []
    for ext in ["png", "jpg", "jpeg"]:
        image_paths.extend(glob.glob(os.path.join(run_dir, f"*.{ext}")))

    screenshots = [
        {"name": os.path.basename(file_path), "path": f"/api/image?path={file_path}"}
        for file_path in image_paths
    ]

    # Sort by filename
    screenshots.sort(key=lambda shot: shot["name"])

    app.logger.info(f"screenshots: {screenshots}")

    return jsonify(screenshots)
197
+
198
# Serve an image file
@app.route("/api/image")
def get_image():
    """Stream an image file identified by a filesystem path query parameter.

    Returns 400 when no path is supplied, 404 when the file is missing,
    500 when sending fails.
    """
    path = request.args.get("path")
    if not path:
        return jsonify({"error": "No path provided"}), 400

    # SECURITY: `path` is taken verbatim from the query string and handed to
    # send_file, so this endpoint can disclose any file the server process
    # can read (path traversal / arbitrary file read). Acceptable only for a
    # local debugging viewer; restrict paths to the eval-results root before
    # exposing this server beyond localhost.
    if not os.path.exists(path):
        return jsonify({"error": f"Image not found at path: {path}"}), 404

    try:
        return send_file(path)
    except Exception as e:
        return jsonify({"error": f"Error serving image: {str(e)}"}), 500
213
+
214
if __name__ == "__main__":
    # Local debug entry point for the evaluation viewer.
    print("Evaluation Server is running at http://localhost:8000")
    print("Press Ctrl+C to stop the server")

    app.run(debug=True, port=8000)