M-Rique commited on
Commit
8770774
·
1 Parent(s): 64b82de

Add initial images and format

Browse files
Files changed (5) hide show
  1. app.py +160 -100
  2. e2bqwen.py +53 -28
  3. eval.py +146 -84
  4. model_replay.py +16 -21
  5. show_eval.py +90 -71
app.py CHANGED
@@ -2,25 +2,23 @@ import gradio as gr
2
  import os
3
  import json
4
  import shutil
5
- import traceback
6
  import uuid
7
- from textwrap import dedent
8
  import time
9
  from threading import Timer
10
  from huggingface_hub import upload_folder, login
11
  from e2b_desktop import Sandbox
 
 
 
 
12
 
13
- from smolagents import CodeAgent, OpenAIServerModel
14
- from smolagents.monitoring import LogLevel
15
  from smolagents.gradio_ui import GradioUI, stream_to_gradio
16
- from model_replay import FakeModelReplayLog
17
- from gradio_modal import Modal
18
 
19
- from dotenv import load_dotenv
20
 
21
  load_dotenv(override=True)
22
 
23
- from e2bqwen import QwenVLAPIModel, E2BVisionAgent
24
 
25
  E2B_API_KEY = os.getenv("E2B_API_KEY")
26
  SANDBOXES = {}
@@ -28,11 +26,11 @@ SANDBOX_METADATA = {}
28
  SANDBOX_TIMEOUT = 600
29
  WIDTH = 1024
30
  HEIGHT = 768
31
- TMP_DIR = './tmp/'
32
  if not os.path.exists(TMP_DIR):
33
  os.makedirs(TMP_DIR)
34
 
35
- hf_token = os.getenv("HF_TOKEN")
36
  login(token=hf_token)
37
 
38
  custom_css = """
@@ -152,9 +150,9 @@ custom_css = """
152
  .logo-item:hover {
153
  color: #935f06!important;
154
  }
155
- """.replace("<<WIDTH>>", str(WIDTH+15)).replace("<<HEIGHT>>", str(HEIGHT+10))
156
 
157
- footer_html="""
158
  <h3 style="text-align: center; margin-top:50px;"><i>Powered by open source:</i></h2>
159
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
160
  <div class="logo-container">
@@ -182,7 +180,7 @@ sandbox_html_template = """
182
  <img src="https://huggingface.co/datasets/mfarre/servedfiles/resolve/main/blue_screen_of_death.gif" class="bsod-image" style="display: none;"/>
183
  <img src="https://huggingface.co/datasets/m-ric/images/resolve/main/HUD_thom.png" class="sandbox-frame" />
184
  </div>
185
- """.replace("<<WIDTH>>", str(WIDTH+15)).replace("<<HEIGHT>>", str(HEIGHT+10))
186
 
187
  custom_js = """function() {
188
  document.body.classList.add('dark');
@@ -301,13 +299,12 @@ custom_js = """function() {
301
  }
302
  """
303
 
304
-
305
- def upload_to_hf_and_remove(folder_path):
306
 
307
- repo_id = "smolagents/computer-agent-logs"
 
308
  try:
309
  folder_name = os.path.basename(os.path.normpath(folder_path))
310
-
311
  # Upload the folder to Huggingface
312
  print(f"Uploading {folder_path} to {repo_id}/{folder_name}...")
313
  url = upload_folder(
@@ -315,29 +312,30 @@ def upload_to_hf_and_remove(folder_path):
315
  repo_id=repo_id,
316
  repo_type="dataset",
317
  path_in_repo=folder_name,
318
- ignore_patterns=[".git/*", ".gitignore"]
319
  )
320
-
321
  # Remove the local folder after successful upload
322
  print(f"Upload complete. Removing local folder {folder_path}...")
323
  shutil.rmtree(folder_path)
324
  print("Local folder removed successfully.")
325
-
326
  return url
327
-
328
  except Exception as e:
329
  print(f"Error during upload or cleanup: {str(e)}")
330
  raise
331
 
 
332
  def cleanup_sandboxes():
333
  """Remove sandboxes that haven't been accessed for more than 5 minutes"""
334
  current_time = time.time()
335
  sandboxes_to_remove = []
336
-
337
  for session_id, metadata in SANDBOX_METADATA.items():
338
- if current_time - metadata['last_accessed'] > SANDBOX_TIMEOUT:
339
  sandboxes_to_remove.append(session_id)
340
-
341
  for session_id in sandboxes_to_remove:
342
  if session_id in SANDBOXES:
343
  try:
@@ -345,7 +343,7 @@ def cleanup_sandboxes():
345
  data_dir = os.path.join(TMP_DIR, session_id)
346
  if os.path.exists(data_dir):
347
  upload_to_hf_and_remove(data_dir)
348
-
349
  # Close the sandbox
350
  SANDBOXES[session_id].kill()
351
  del SANDBOXES[session_id]
@@ -354,14 +352,18 @@ def cleanup_sandboxes():
354
  except Exception as e:
355
  print(f"Error cleaning up sandbox {session_id}: {str(e)}")
356
 
 
357
  def get_or_create_sandbox(session_uuid):
358
  current_time = time.time()
359
 
360
- if (session_uuid in SANDBOXES and
361
- session_uuid in SANDBOX_METADATA and
362
- current_time - SANDBOX_METADATA[session_uuid]['created_at'] < SANDBOX_TIMEOUT):
 
 
 
363
  print(f"Reusing Sandbox for {session_uuid}")
364
- SANDBOX_METADATA[session_uuid]['last_accessed'] = current_time
365
  return SANDBOXES[session_uuid]
366
 
367
  if session_uuid in SANDBOXES:
@@ -372,27 +374,38 @@ def get_or_create_sandbox(session_uuid):
372
  print(f"Error closing expired sandbox: {str(e)}")
373
 
374
  print(f"Creating new sandbox for session {session_uuid}")
375
- desktop = Sandbox(api_key=E2B_API_KEY, resolution=(WIDTH, HEIGHT), dpi=96, timeout=SANDBOX_TIMEOUT)
 
 
 
 
 
 
376
  desktop.stream.start(require_auth=True)
377
  setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
378
  desktop.commands.run(setup_cmd)
379
 
380
  SANDBOXES[session_uuid] = desktop
381
  SANDBOX_METADATA[session_uuid] = {
382
- 'created_at': current_time,
383
- 'last_accessed': current_time
384
  }
385
  return desktop
386
 
 
387
  def update_html(interactive_mode: bool, session_uuid):
388
  desktop = get_or_create_sandbox(session_uuid)
389
  auth_key = desktop.stream.get_auth_key()
390
  base_url = desktop.stream.get_url(auth_key=auth_key)
391
  stream_url = base_url if interactive_mode else f"{base_url}&view_only=true"
392
-
393
  status_class = "status-interactive" if interactive_mode else "status-view-only"
394
  status_text = "Interactive" if interactive_mode else "Agent running..."
395
- creation_time = SANDBOX_METADATA[session_uuid]['created_at'] if session_uuid in SANDBOX_METADATA else time.time()
 
 
 
 
396
 
397
  sandbox_html_content = sandbox_html_template.format(
398
  stream_url=stream_url,
@@ -406,24 +419,27 @@ def update_html(interactive_mode: bool, session_uuid):
406
  def generate_interaction_id(session_uuid):
407
  return f"{session_uuid}_{int(time.time())}"
408
 
 
409
  def chat_message_to_json(obj):
410
  """Custom JSON serializer for ChatMessage and related objects"""
411
- if hasattr(obj, '__dict__'):
412
  # Create a copy of the object's __dict__ to avoid modifying the original
413
  result = obj.__dict__.copy()
414
-
415
  # Remove the 'raw' field which may contain non-serializable data
416
- if 'raw' in result:
417
- del result['raw']
418
-
419
  # Process the content or tool_calls if they exist
420
- if 'content' in result and result['content'] is not None:
421
- if hasattr(result['content'], '__dict__'):
422
- result['content'] = chat_message_to_json(result['content'])
423
-
424
- if 'tool_calls' in result and result['tool_calls'] is not None:
425
- result['tool_calls'] = [chat_message_to_json(tc) for tc in result['tool_calls']]
426
-
 
 
427
  return result
428
  elif isinstance(obj, (list, tuple)):
429
  return [chat_message_to_json(item) for item in obj]
@@ -431,16 +447,23 @@ def chat_message_to_json(obj):
431
  return obj
432
 
433
 
434
- def save_final_status(folder, status: str, summary, error_message = None) -> None:
435
  metadata_path = os.path.join(folder, "metadata.json")
436
  output_file = open(metadata_path, "w")
437
- output_file.write(json.dumps({"status":status, "summary":summary, "error_message": error_message}, default=chat_message_to_json))
 
 
 
 
 
438
  output_file.close()
439
 
 
440
  def extract_browser_uuid(js_uuid):
441
  print(f"[BROWSER] Got browser UUID from JS: {js_uuid}")
442
  return js_uuid
443
 
 
444
  def initialize_session(request: gr.Request, interactive_mode, browser_uuid):
445
  if not browser_uuid:
446
  new_uuid = str(uuid.uuid4())
@@ -454,7 +477,7 @@ def initialize_session(request: gr.Request, interactive_mode, browser_uuid):
454
  def create_agent(data_dir, desktop):
455
  model = QwenVLAPIModel(
456
  model_id="Qwen/Qwen2.5-VL-72B-Instruct",
457
- hf_token = hf_token,
458
  )
459
 
460
  # model = OpenAIServerModel(
@@ -467,15 +490,17 @@ def create_agent(data_dir, desktop):
467
  max_steps=200,
468
  verbosity_level=2,
469
  # planning_interval=10,
470
- use_v1_prompt=True
471
  )
472
 
 
473
  def get_agent_summary_erase_images(agent):
474
  for memory_step in agent.memory.steps:
475
  if getattr(memory_step, "observations_images", None):
476
  memory_step.observations_images = None
477
  return agent.memory.get_succinct_steps()
478
 
 
479
  class EnrichedGradioUI(GradioUI):
480
  def log_user_message(self, text_input):
481
  import gradio as gr
@@ -485,7 +510,15 @@ class EnrichedGradioUI(GradioUI):
485
  gr.Button(interactive=False),
486
  )
487
 
488
- def interact_with_agent(self, task_input, stored_messages, session_state, session_uuid, consent_storage, request: gr.Request):
 
 
 
 
 
 
 
 
489
  interaction_id = generate_interaction_id(session_uuid)
490
  desktop = get_or_create_sandbox(session_uuid)
491
 
@@ -502,12 +535,30 @@ class EnrichedGradioUI(GradioUI):
502
  stored_messages.append(gr.ChatMessage(role="user", content=task_input))
503
  yield stored_messages
504
 
505
- for msg in stream_to_gradio(session_state["agent"], task=task_input, reset_agent_memory=False):
506
- if hasattr(session_state["agent"], "last_marked_screenshot") and msg.content == "-----": # Append the last screenshot before the end of step
507
- stored_messages.append(gr.ChatMessage(
508
- role="assistant",
509
- content={"path": session_state["agent"].last_marked_screenshot.to_string(), "mime_type": "image/png"},
510
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
  stored_messages.append(msg)
512
  yield stored_messages
513
 
@@ -516,37 +567,44 @@ class EnrichedGradioUI(GradioUI):
516
  # summary = get_agent_summary_erase_images(session_state["agent"])
517
  # save_final_status(data_dir, "completed", summary = summary)
518
  yield stored_messages
519
-
520
  except Exception as e:
521
- error_message=f"Error in interaction: {str(e)}"
522
  raise e
523
  print(error_message)
524
- stored_messages.append(gr.ChatMessage(role="assistant", content="Run failed:\n" + error_message))
 
 
 
 
525
  if consent_storage:
526
  summary = get_agent_summary_erase_images(session_state["agent"])
527
- save_final_status(data_dir, "failed", summary=summary, error_message=error_message)
 
 
528
  yield stored_messages
529
  finally:
530
  if consent_storage:
531
  upload_to_hf_and_remove(data_dir)
532
 
533
- theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")
 
 
 
534
 
535
  # Create a Gradio app with Blocks
536
  with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
537
- #Storing session hash in a state variable
538
  session_uuid_state = gr.State(None)
539
 
540
-
541
-
542
  with gr.Row():
543
  sandbox_html = gr.HTML(
544
  value=sandbox_html_template.format(
545
  stream_url="",
546
  status_class="status-interactive",
547
- status_text="Interactive"
548
  ),
549
- label="Output"
550
  )
551
  with gr.Sidebar(position="left"):
552
  with Modal(visible=True) as modal:
@@ -560,7 +618,7 @@ _Please note that we store the task logs by default so **do not write any person
560
  task_input = gr.Textbox(
561
  value="Find me pictures of cute puppies",
562
  label="Enter your task below:",
563
- elem_classes="primary-color-label"
564
  )
565
 
566
  run_btn = gr.Button("Let's go!", variant="primary")
@@ -575,9 +633,9 @@ _Please note that we store the task logs by default so **do not write any person
575
  "Go on the Hugging Face Hub, find the space for FLUX1.dev, then generate a picture of the Golden Gate bridge",
576
  "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
577
  ],
578
- inputs = task_input,
579
- label= "Example Tasks",
580
- examples_per_page=4
581
  )
582
 
583
  session_state = gr.State({})
@@ -585,7 +643,9 @@ _Please note that we store the task logs by default so **do not write any person
585
 
586
  minimalist_toggle = gr.Checkbox(label="Innie/Outie", value=False)
587
 
588
- consent_storage = gr.Checkbox(label="Store task and agent trace?", value=True)
 
 
589
 
590
  def apply_theme(minimalist_mode: bool):
591
  if not minimalist_mode:
@@ -631,16 +691,10 @@ _Please note that we store the task logs by default so **do not write any person
631
  # Hidden HTML element to inject CSS dynamically
632
  theme_styles = gr.HTML(apply_theme(False), visible=False)
633
  minimalist_toggle.change(
634
- fn=apply_theme,
635
- inputs=[minimalist_toggle],
636
- outputs=[theme_styles]
637
- )
638
-
639
- footer = gr.HTML(
640
- value=footer_html,
641
- label="Header"
642
  )
643
 
 
644
 
645
  chatbot_display = gr.Chatbot(
646
  elem_id="chatbot",
@@ -653,7 +707,9 @@ _Please note that we store the task logs by default so **do not write any person
653
  resizable=True,
654
  )
655
 
656
- agent_ui = EnrichedGradioUI(CodeAgent(tools=[], model=None, name="ok", description="ok"))
 
 
657
 
658
  stop_btn = gr.Button("Stop the agent!", variant="huggingface")
659
 
@@ -664,9 +720,9 @@ _Please note that we store the task logs by default so **do not write any person
664
 
665
  if not os.path.exists(log_file):
666
  return "Waiting for machine from the future to boot..."
667
-
668
  try:
669
- with open(log_file, 'r') as f:
670
  lines = f.readlines()
671
  return "".join(lines[-tail:] if len(lines) > tail else lines)
672
  except Exception as e:
@@ -685,21 +741,25 @@ _Please note that we store the task logs by default so **do not write any person
685
  is_interactive = gr.Checkbox(value=True, visible=False)
686
 
687
  # Chain the events
688
- run_event = run_btn.click(
689
- fn=clear_and_set_view_only,
690
- inputs=[task_input, session_uuid_state],
691
- outputs=[sandbox_html]
692
- ).then(
693
- agent_ui.interact_with_agent,
694
- inputs=[task_input, stored_messages, session_state, session_uuid_state, consent_storage],
695
- outputs=[chatbot_display]
696
- ).then(
697
- fn=set_interactive,
698
- inputs=[session_uuid_state],
699
- outputs=[sandbox_html]
700
- ).then(
701
- fn=reactivate_stop_btn,
702
- outputs=[stop_btn]
 
 
 
 
703
  )
704
 
705
  def interrupt_agent(session_state):
@@ -716,7 +776,7 @@ _Please note that we store the task logs by default so **do not write any person
716
 
717
  # replay_btn.click(
718
  # fn=clear_and_set_view_only,
719
- # inputs=[task_input],
720
  # outputs=[sandbox_html]
721
  # ).then(
722
  # set_logs_source,
@@ -744,4 +804,4 @@ _Please note that we store the task logs by default so **do not write any person
744
  # Launch the app
745
  if __name__ == "__main__":
746
  Timer(60, cleanup_sandboxes).start() # Run every minute
747
- demo.launch()
 
2
  import os
3
  import json
4
  import shutil
 
5
  import uuid
 
6
  import time
7
  from threading import Timer
8
  from huggingface_hub import upload_folder, login
9
  from e2b_desktop import Sandbox
10
+ from gradio_modal import Modal
11
+ from io import BytesIO
12
+ from PIL import Image
13
+ from dotenv import load_dotenv
14
 
15
+ from smolagents import CodeAgent
 
16
  from smolagents.gradio_ui import GradioUI, stream_to_gradio
 
 
17
 
18
+ from e2bqwen import QwenVLAPIModel, E2BVisionAgent
19
 
20
  load_dotenv(override=True)
21
 
 
22
 
23
  E2B_API_KEY = os.getenv("E2B_API_KEY")
24
  SANDBOXES = {}
 
26
  SANDBOX_TIMEOUT = 600
27
  WIDTH = 1024
28
  HEIGHT = 768
29
+ TMP_DIR = "./tmp/"
30
  if not os.path.exists(TMP_DIR):
31
  os.makedirs(TMP_DIR)
32
 
33
+ hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
34
  login(token=hf_token)
35
 
36
  custom_css = """
 
150
  .logo-item:hover {
151
  color: #935f06!important;
152
  }
153
+ """.replace("<<WIDTH>>", str(WIDTH + 15)).replace("<<HEIGHT>>", str(HEIGHT + 10))
154
 
155
+ footer_html = """
156
  <h3 style="text-align: center; margin-top:50px;"><i>Powered by open source:</i></h2>
157
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
158
  <div class="logo-container">
 
180
  <img src="https://huggingface.co/datasets/mfarre/servedfiles/resolve/main/blue_screen_of_death.gif" class="bsod-image" style="display: none;"/>
181
  <img src="https://huggingface.co/datasets/m-ric/images/resolve/main/HUD_thom.png" class="sandbox-frame" />
182
  </div>
183
+ """.replace("<<WIDTH>>", str(WIDTH + 15)).replace("<<HEIGHT>>", str(HEIGHT + 10))
184
 
185
  custom_js = """function() {
186
  document.body.classList.add('dark');
 
299
  }
300
  """
301
 
 
 
302
 
303
+ def upload_to_hf_and_remove(folder_path):
304
+ repo_id = "smolagents/computer-agent-logs"
305
  try:
306
  folder_name = os.path.basename(os.path.normpath(folder_path))
307
+
308
  # Upload the folder to Huggingface
309
  print(f"Uploading {folder_path} to {repo_id}/{folder_name}...")
310
  url = upload_folder(
 
312
  repo_id=repo_id,
313
  repo_type="dataset",
314
  path_in_repo=folder_name,
315
+ ignore_patterns=[".git/*", ".gitignore"],
316
  )
317
+
318
  # Remove the local folder after successful upload
319
  print(f"Upload complete. Removing local folder {folder_path}...")
320
  shutil.rmtree(folder_path)
321
  print("Local folder removed successfully.")
322
+
323
  return url
324
+
325
  except Exception as e:
326
  print(f"Error during upload or cleanup: {str(e)}")
327
  raise
328
 
329
+
330
  def cleanup_sandboxes():
331
  """Remove sandboxes that haven't been accessed for more than 5 minutes"""
332
  current_time = time.time()
333
  sandboxes_to_remove = []
334
+
335
  for session_id, metadata in SANDBOX_METADATA.items():
336
+ if current_time - metadata["last_accessed"] > SANDBOX_TIMEOUT:
337
  sandboxes_to_remove.append(session_id)
338
+
339
  for session_id in sandboxes_to_remove:
340
  if session_id in SANDBOXES:
341
  try:
 
343
  data_dir = os.path.join(TMP_DIR, session_id)
344
  if os.path.exists(data_dir):
345
  upload_to_hf_and_remove(data_dir)
346
+
347
  # Close the sandbox
348
  SANDBOXES[session_id].kill()
349
  del SANDBOXES[session_id]
 
352
  except Exception as e:
353
  print(f"Error cleaning up sandbox {session_id}: {str(e)}")
354
 
355
+
356
  def get_or_create_sandbox(session_uuid):
357
  current_time = time.time()
358
 
359
+ if (
360
+ session_uuid in SANDBOXES
361
+ and session_uuid in SANDBOX_METADATA
362
+ and current_time - SANDBOX_METADATA[session_uuid]["created_at"]
363
+ < SANDBOX_TIMEOUT
364
+ ):
365
  print(f"Reusing Sandbox for {session_uuid}")
366
+ SANDBOX_METADATA[session_uuid]["last_accessed"] = current_time
367
  return SANDBOXES[session_uuid]
368
 
369
  if session_uuid in SANDBOXES:
 
374
  print(f"Error closing expired sandbox: {str(e)}")
375
 
376
  print(f"Creating new sandbox for session {session_uuid}")
377
+ desktop = Sandbox(
378
+ api_key=E2B_API_KEY,
379
+ resolution=(WIDTH, HEIGHT),
380
+ dpi=96,
381
+ timeout=SANDBOX_TIMEOUT,
382
+ template="k0wmnzir0zuzye6dndlw",
383
+ )
384
  desktop.stream.start(require_auth=True)
385
  setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
386
  desktop.commands.run(setup_cmd)
387
 
388
  SANDBOXES[session_uuid] = desktop
389
  SANDBOX_METADATA[session_uuid] = {
390
+ "created_at": current_time,
391
+ "last_accessed": current_time,
392
  }
393
  return desktop
394
 
395
+
396
  def update_html(interactive_mode: bool, session_uuid):
397
  desktop = get_or_create_sandbox(session_uuid)
398
  auth_key = desktop.stream.get_auth_key()
399
  base_url = desktop.stream.get_url(auth_key=auth_key)
400
  stream_url = base_url if interactive_mode else f"{base_url}&view_only=true"
401
+
402
  status_class = "status-interactive" if interactive_mode else "status-view-only"
403
  status_text = "Interactive" if interactive_mode else "Agent running..."
404
+ creation_time = (
405
+ SANDBOX_METADATA[session_uuid]["created_at"]
406
+ if session_uuid in SANDBOX_METADATA
407
+ else time.time()
408
+ )
409
 
410
  sandbox_html_content = sandbox_html_template.format(
411
  stream_url=stream_url,
 
419
  def generate_interaction_id(session_uuid):
420
  return f"{session_uuid}_{int(time.time())}"
421
 
422
+
423
  def chat_message_to_json(obj):
424
  """Custom JSON serializer for ChatMessage and related objects"""
425
+ if hasattr(obj, "__dict__"):
426
  # Create a copy of the object's __dict__ to avoid modifying the original
427
  result = obj.__dict__.copy()
428
+
429
  # Remove the 'raw' field which may contain non-serializable data
430
+ if "raw" in result:
431
+ del result["raw"]
432
+
433
  # Process the content or tool_calls if they exist
434
+ if "content" in result and result["content"] is not None:
435
+ if hasattr(result["content"], "__dict__"):
436
+ result["content"] = chat_message_to_json(result["content"])
437
+
438
+ if "tool_calls" in result and result["tool_calls"] is not None:
439
+ result["tool_calls"] = [
440
+ chat_message_to_json(tc) for tc in result["tool_calls"]
441
+ ]
442
+
443
  return result
444
  elif isinstance(obj, (list, tuple)):
445
  return [chat_message_to_json(item) for item in obj]
 
447
  return obj
448
 
449
 
450
+ def save_final_status(folder, status: str, summary, error_message=None) -> None:
451
  metadata_path = os.path.join(folder, "metadata.json")
452
  output_file = open(metadata_path, "w")
453
+ output_file.write(
454
+ json.dumps(
455
+ {"status": status, "summary": summary, "error_message": error_message},
456
+ default=chat_message_to_json,
457
+ )
458
+ )
459
  output_file.close()
460
 
461
+
462
  def extract_browser_uuid(js_uuid):
463
  print(f"[BROWSER] Got browser UUID from JS: {js_uuid}")
464
  return js_uuid
465
 
466
+
467
  def initialize_session(request: gr.Request, interactive_mode, browser_uuid):
468
  if not browser_uuid:
469
  new_uuid = str(uuid.uuid4())
 
477
  def create_agent(data_dir, desktop):
478
  model = QwenVLAPIModel(
479
  model_id="Qwen/Qwen2.5-VL-72B-Instruct",
480
+ hf_token=hf_token,
481
  )
482
 
483
  # model = OpenAIServerModel(
 
490
  max_steps=200,
491
  verbosity_level=2,
492
  # planning_interval=10,
493
+ use_v1_prompt=True,
494
  )
495
 
496
+
497
  def get_agent_summary_erase_images(agent):
498
  for memory_step in agent.memory.steps:
499
  if getattr(memory_step, "observations_images", None):
500
  memory_step.observations_images = None
501
  return agent.memory.get_succinct_steps()
502
 
503
+
504
  class EnrichedGradioUI(GradioUI):
505
  def log_user_message(self, text_input):
506
  import gradio as gr
 
510
  gr.Button(interactive=False),
511
  )
512
 
513
+ def interact_with_agent(
514
+ self,
515
+ task_input,
516
+ stored_messages,
517
+ session_state,
518
+ session_uuid,
519
+ consent_storage,
520
+ request: gr.Request,
521
+ ):
522
  interaction_id = generate_interaction_id(session_uuid)
523
  desktop = get_or_create_sandbox(session_uuid)
524
 
 
535
  stored_messages.append(gr.ChatMessage(role="user", content=task_input))
536
  yield stored_messages
537
 
538
+ screenshot_bytes = session_state["agent"].desktop.screenshot(format="bytes")
539
+ initial_screenshot = Image.open(BytesIO(screenshot_bytes))
540
+
541
+ for msg in stream_to_gradio(
542
+ session_state["agent"],
543
+ task=task_input,
544
+ task_images=[initial_screenshot],
545
+ reset_agent_memory=False,
546
+ ):
547
+ if (
548
+ hasattr(session_state["agent"], "last_marked_screenshot")
549
+ and msg.content == "-----"
550
+ ): # Append the last screenshot before the end of step
551
+ stored_messages.append(
552
+ gr.ChatMessage(
553
+ role="assistant",
554
+ content={
555
+ "path": session_state[
556
+ "agent"
557
+ ].last_marked_screenshot.to_string(),
558
+ "mime_type": "image/png",
559
+ },
560
+ )
561
+ )
562
  stored_messages.append(msg)
563
  yield stored_messages
564
 
 
567
  # summary = get_agent_summary_erase_images(session_state["agent"])
568
  # save_final_status(data_dir, "completed", summary = summary)
569
  yield stored_messages
570
+
571
  except Exception as e:
572
+ error_message = f"Error in interaction: {str(e)}"
573
  raise e
574
  print(error_message)
575
+ stored_messages.append(
576
+ gr.ChatMessage(
577
+ role="assistant", content="Run failed:\n" + error_message
578
+ )
579
+ )
580
  if consent_storage:
581
  summary = get_agent_summary_erase_images(session_state["agent"])
582
+ save_final_status(
583
+ data_dir, "failed", summary=summary, error_message=error_message
584
+ )
585
  yield stored_messages
586
  finally:
587
  if consent_storage:
588
  upload_to_hf_and_remove(data_dir)
589
 
590
+
591
+ theme = gr.themes.Default(
592
+ font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue"
593
+ )
594
 
595
  # Create a Gradio app with Blocks
596
  with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
597
+ # Storing session hash in a state variable
598
  session_uuid_state = gr.State(None)
599
 
 
 
600
  with gr.Row():
601
  sandbox_html = gr.HTML(
602
  value=sandbox_html_template.format(
603
  stream_url="",
604
  status_class="status-interactive",
605
+ status_text="Interactive",
606
  ),
607
+ label="Output",
608
  )
609
  with gr.Sidebar(position="left"):
610
  with Modal(visible=True) as modal:
 
618
  task_input = gr.Textbox(
619
  value="Find me pictures of cute puppies",
620
  label="Enter your task below:",
621
+ elem_classes="primary-color-label",
622
  )
623
 
624
  run_btn = gr.Button("Let's go!", variant="primary")
 
633
  "Go on the Hugging Face Hub, find the space for FLUX1.dev, then generate a picture of the Golden Gate bridge",
634
  "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
635
  ],
636
+ inputs=task_input,
637
+ label="Example Tasks",
638
+ examples_per_page=4,
639
  )
640
 
641
  session_state = gr.State({})
 
643
 
644
  minimalist_toggle = gr.Checkbox(label="Innie/Outie", value=False)
645
 
646
+ consent_storage = gr.Checkbox(
647
+ label="Store task and agent trace?", value=True
648
+ )
649
 
650
  def apply_theme(minimalist_mode: bool):
651
  if not minimalist_mode:
 
691
  # Hidden HTML element to inject CSS dynamically
692
  theme_styles = gr.HTML(apply_theme(False), visible=False)
693
  minimalist_toggle.change(
694
+ fn=apply_theme, inputs=[minimalist_toggle], outputs=[theme_styles]
 
 
 
 
 
 
 
695
  )
696
 
697
+ footer = gr.HTML(value=footer_html, label="Header")
698
 
699
  chatbot_display = gr.Chatbot(
700
  elem_id="chatbot",
 
707
  resizable=True,
708
  )
709
 
710
+ agent_ui = EnrichedGradioUI(
711
+ CodeAgent(tools=[], model=None, name="ok", description="ok")
712
+ )
713
 
714
  stop_btn = gr.Button("Stop the agent!", variant="huggingface")
715
 
 
720
 
721
  if not os.path.exists(log_file):
722
  return "Waiting for machine from the future to boot..."
723
+
724
  try:
725
+ with open(log_file, "r") as f:
726
  lines = f.readlines()
727
  return "".join(lines[-tail:] if len(lines) > tail else lines)
728
  except Exception as e:
 
741
  is_interactive = gr.Checkbox(value=True, visible=False)
742
 
743
  # Chain the events
744
+ run_event = (
745
+ run_btn.click(
746
+ fn=clear_and_set_view_only,
747
+ inputs=[task_input, session_uuid_state],
748
+ outputs=[sandbox_html],
749
+ )
750
+ .then(
751
+ agent_ui.interact_with_agent,
752
+ inputs=[
753
+ task_input,
754
+ stored_messages,
755
+ session_state,
756
+ session_uuid_state,
757
+ consent_storage,
758
+ ],
759
+ outputs=[chatbot_display],
760
+ )
761
+ .then(fn=set_interactive, inputs=[session_uuid_state], outputs=[sandbox_html])
762
+ .then(fn=reactivate_stop_btn, outputs=[stop_btn])
763
  )
764
 
765
  def interrupt_agent(session_state):
 
776
 
777
  # replay_btn.click(
778
  # fn=clear_and_set_view_only,
779
+ # inputs=[task_input],
780
  # outputs=[sandbox_html]
781
  # ).then(
782
  # set_logs_source,
 
804
  # Launch the app
805
  if __name__ == "__main__":
806
  Timer(60, cleanup_sandboxes).start() # Run every minute
807
+ demo.launch()
e2bqwen.py CHANGED
@@ -10,7 +10,7 @@ from PIL import Image
10
 
11
  # SmolaAgents imports
12
  from smolagents import CodeAgent, tool, HfApiModel
13
- from smolagents.memory import ActionStep
14
  from smolagents.models import ChatMessage, Model
15
  from smolagents.agents import populate_template
16
  from smolagents.monitoring import LogLevel
@@ -144,6 +144,7 @@ NEVER CLICK THE WEB BROWSER ICON TO OPEN THE WEB BROWSER: use open_url
144
  </general_guidelines>
145
  """
146
 
 
147
  def draw_marker_on_image(image_copy, click_coordinates):
148
  x, y = click_coordinates
149
  draw = ImageDraw.Draw(image_copy)
@@ -152,12 +153,22 @@ def draw_marker_on_image(image_copy, click_coordinates):
152
  draw.line((x - cross_size, y, x + cross_size, y), fill="green", width=linewidth)
153
  draw.line((x, y - cross_size, x, y + cross_size), fill="green", width=linewidth)
154
  # Add a circle around it for better visibility
155
- draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="green", width=linewidth)
 
 
 
 
 
 
 
 
 
156
  return image_copy
157
 
158
 
159
  class E2BVisionAgent(CodeAgent):
160
  """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
 
161
  def __init__(
162
  self,
163
  model: HfApiModel,
@@ -168,7 +179,7 @@ class E2BVisionAgent(CodeAgent):
168
  verbosity_level: LogLevel = 2,
169
  planning_interval: int = None,
170
  use_v1_prompt: bool = False,
171
- **kwargs
172
  ):
173
  self.desktop = desktop
174
  self.data_dir = data_dir
@@ -188,10 +199,12 @@ class E2BVisionAgent(CodeAgent):
188
  model=model,
189
  max_steps=max_steps,
190
  verbosity_level=verbosity_level,
191
- planning_interval = self.planning_interval,
192
- **kwargs
193
  )
194
- self.prompt_templates["system_prompt"] = E2B_SYSTEM_PROMPT_TEMPLATE.replace("<<resolution_x>>", str(self.width)).replace("<<resolution_y>>", str(self.height))
 
 
195
 
196
  # Add screen info to state
197
  self.state["screen_width"] = self.width
@@ -203,7 +216,7 @@ class E2BVisionAgent(CodeAgent):
203
  self.step_callbacks.append(self.take_screenshot_callback)
204
 
205
  def initialize_system_prompt(self) -> str:
206
- if True:
207
  return """You are a desktop automation assistant that can control a remote desktop environment.
208
  You only have access to the following tools to interact with the desktop, no additional ones:
209
  - click(x, y): Performs a left-click at the specified coordinates
@@ -282,11 +295,14 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
282
  ),
283
  },
284
  )
285
- assert system_prompt != self.prompt_templates["system_prompt"], "Populating prompt template failed"
 
 
286
  return system_prompt
287
 
288
  def _setup_desktop_tools(self):
289
  """Register all desktop tools"""
 
290
  @tool
291
  def click(x: int, y: int) -> str:
292
  """
@@ -342,7 +358,11 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
342
  return f"Moved mouse to coordinates ({x}, {y})"
343
 
344
  def normalize_text(text):
345
- return ''.join(c for c in unicodedata.normalize('NFD', text) if not unicodedata.combining(c))
 
 
 
 
346
 
347
  @tool
348
  def type_text(text: str) -> str:
@@ -469,7 +489,6 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
469
  self.tools["drag_and_drop"] = drag_and_drop
470
  self.tools["find_on_page_ctrl_f"] = find_on_page_ctrl_f
471
 
472
-
473
  def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
474
  """Callback that takes a screenshot + memory snapshot after a step completes"""
475
  self.logger.log("Analyzing screen content...")
@@ -493,21 +512,31 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
493
  self.last_marked_screenshot = AgentImage(screenshot_path)
494
  print(f"Saved screenshot for step {current_step} to {screenshot_path}")
495
 
496
- for (
497
- previous_memory_step
498
- ) in agent.memory.steps: # Remove previous screenshots from logs for lean processing
499
  if (
500
  isinstance(previous_memory_step, ActionStep)
501
  and previous_memory_step.step_number <= current_step - 1
502
  ):
503
  previous_memory_step.observations_images = None
 
 
504
 
505
  if (
506
  isinstance(previous_memory_step, ActionStep)
507
  and previous_memory_step.step_number == current_step - 1
508
  ):
509
- if previous_memory_step.tool_calls and getattr(previous_memory_step.tool_calls[0], "arguments", None) and memory_step.tool_calls and getattr(memory_step.tool_calls[0], "arguments", None):
510
- if previous_memory_step.tool_calls[0].arguments == memory_step.tool_calls[0].arguments:
 
 
 
 
 
 
 
 
511
  memory_step.observations += "\nWARNING: You've executed the same action several times in a row. MAKE SURE TO NOT UNNECESSARILY REPEAT ACTIONS."
512
 
513
  # Add the marker-edited image to the current memory step
@@ -515,8 +544,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
515
 
516
  # memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
517
 
518
- self.click_coordinates = None # Reset click marker
519
-
520
 
521
  def close(self):
522
  """Clean up resources"""
@@ -529,9 +557,9 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
529
 
530
  class QwenVLAPIModel(Model):
531
  """Model wrapper for Qwen2.5VL API with fallback mechanism"""
532
-
533
  def __init__(
534
- self,
535
  model_id: str = "Qwen/Qwen2.5-VL-72B-Instruct",
536
  hf_token: str = None,
537
  ):
@@ -548,25 +576,22 @@ class QwenVLAPIModel(Model):
548
  token=hf_token,
549
  max_tokens=4096,
550
  )
551
-
552
  def __call__(
553
- self,
554
- messages: List[Dict[str, Any]],
555
- stop_sequences: Optional[List[str]] = None,
556
- **kwargs
557
  ) -> ChatMessage:
558
-
559
  try:
560
  message = self.base_model(messages, stop_sequences, **kwargs)
561
  return message
562
  except Exception as e:
563
- raise e
564
  print(f"Base model failed with error: {e}. Calling fallback model.")
565
-
566
  # Continue to fallback
567
  try:
568
  message = self.fallback_model(messages, stop_sequences, **kwargs)
569
  return message
570
  except Exception as e:
571
- raise e
572
  raise Exception(f"Both endpoints failed. Last error: {e}")
 
10
 
11
  # SmolaAgents imports
12
  from smolagents import CodeAgent, tool, HfApiModel
13
+ from smolagents.memory import ActionStep, TaskStep
14
  from smolagents.models import ChatMessage, Model
15
  from smolagents.agents import populate_template
16
  from smolagents.monitoring import LogLevel
 
144
  </general_guidelines>
145
  """
146
 
147
+
148
  def draw_marker_on_image(image_copy, click_coordinates):
149
  x, y = click_coordinates
150
  draw = ImageDraw.Draw(image_copy)
 
153
  draw.line((x - cross_size, y, x + cross_size, y), fill="green", width=linewidth)
154
  draw.line((x, y - cross_size, x, y + cross_size), fill="green", width=linewidth)
155
  # Add a circle around it for better visibility
156
+ draw.ellipse(
157
+ (
158
+ x - cross_size * 2,
159
+ y - cross_size * 2,
160
+ x + cross_size * 2,
161
+ y + cross_size * 2,
162
+ ),
163
+ outline="green",
164
+ width=linewidth,
165
+ )
166
  return image_copy
167
 
168
 
169
  class E2BVisionAgent(CodeAgent):
170
  """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
171
+
172
  def __init__(
173
  self,
174
  model: HfApiModel,
 
179
  verbosity_level: LogLevel = 2,
180
  planning_interval: int = None,
181
  use_v1_prompt: bool = False,
182
+ **kwargs,
183
  ):
184
  self.desktop = desktop
185
  self.data_dir = data_dir
 
199
  model=model,
200
  max_steps=max_steps,
201
  verbosity_level=verbosity_level,
202
+ planning_interval=self.planning_interval,
203
+ **kwargs,
204
  )
205
+ self.prompt_templates["system_prompt"] = E2B_SYSTEM_PROMPT_TEMPLATE.replace(
206
+ "<<resolution_x>>", str(self.width)
207
+ ).replace("<<resolution_y>>", str(self.height))
208
 
209
  # Add screen info to state
210
  self.state["screen_width"] = self.width
 
216
  self.step_callbacks.append(self.take_screenshot_callback)
217
 
218
  def initialize_system_prompt(self) -> str:
219
+ if False:
220
  return """You are a desktop automation assistant that can control a remote desktop environment.
221
  You only have access to the following tools to interact with the desktop, no additional ones:
222
  - click(x, y): Performs a left-click at the specified coordinates
 
295
  ),
296
  },
297
  )
298
+ assert system_prompt != self.prompt_templates["system_prompt"], (
299
+ "Populating prompt template failed"
300
+ )
301
  return system_prompt
302
 
303
  def _setup_desktop_tools(self):
304
  """Register all desktop tools"""
305
+
306
  @tool
307
  def click(x: int, y: int) -> str:
308
  """
 
358
  return f"Moved mouse to coordinates ({x}, {y})"
359
 
360
  def normalize_text(text):
361
+ return "".join(
362
+ c
363
+ for c in unicodedata.normalize("NFD", text)
364
+ if not unicodedata.combining(c)
365
+ )
366
 
367
  @tool
368
  def type_text(text: str) -> str:
 
489
  self.tools["drag_and_drop"] = drag_and_drop
490
  self.tools["find_on_page_ctrl_f"] = find_on_page_ctrl_f
491
 
 
492
  def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
493
  """Callback that takes a screenshot + memory snapshot after a step completes"""
494
  self.logger.log("Analyzing screen content...")
 
512
  self.last_marked_screenshot = AgentImage(screenshot_path)
513
  print(f"Saved screenshot for step {current_step} to {screenshot_path}")
514
 
515
+ for previous_memory_step in (
516
+ agent.memory.steps
517
+ ): # Remove previous screenshots from logs for lean processing
518
  if (
519
  isinstance(previous_memory_step, ActionStep)
520
  and previous_memory_step.step_number <= current_step - 1
521
  ):
522
  previous_memory_step.observations_images = None
523
+ elif isinstance(previous_memory_step, TaskStep):
524
+ previous_memory_step.observations_images = None
525
 
526
  if (
527
  isinstance(previous_memory_step, ActionStep)
528
  and previous_memory_step.step_number == current_step - 1
529
  ):
530
+ if (
531
+ previous_memory_step.tool_calls
532
+ and getattr(previous_memory_step.tool_calls[0], "arguments", None)
533
+ and memory_step.tool_calls
534
+ and getattr(memory_step.tool_calls[0], "arguments", None)
535
+ ):
536
+ if (
537
+ previous_memory_step.tool_calls[0].arguments
538
+ == memory_step.tool_calls[0].arguments
539
+ ):
540
  memory_step.observations += "\nWARNING: You've executed the same action several times in a row. MAKE SURE TO NOT UNNECESSARILY REPEAT ACTIONS."
541
 
542
  # Add the marker-edited image to the current memory step
 
544
 
545
  # memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
546
 
547
+ self.click_coordinates = None # Reset click marker
 
548
 
549
  def close(self):
550
  """Clean up resources"""
 
557
 
558
  class QwenVLAPIModel(Model):
559
  """Model wrapper for Qwen2.5VL API with fallback mechanism"""
560
+
561
  def __init__(
562
+ self,
563
  model_id: str = "Qwen/Qwen2.5-VL-72B-Instruct",
564
  hf_token: str = None,
565
  ):
 
576
  token=hf_token,
577
  max_tokens=4096,
578
  )
579
+
580
  def __call__(
581
+ self,
582
+ messages: List[Dict[str, Any]],
583
+ stop_sequences: Optional[List[str]] = None,
584
+ **kwargs,
585
  ) -> ChatMessage:
 
586
  try:
587
  message = self.base_model(messages, stop_sequences, **kwargs)
588
  return message
589
  except Exception as e:
 
590
  print(f"Base model failed with error: {e}. Calling fallback model.")
591
+
592
  # Continue to fallback
593
  try:
594
  message = self.fallback_model(messages, stop_sequences, **kwargs)
595
  return message
596
  except Exception as e:
 
597
  raise Exception(f"Both endpoints failed. Last error: {e}")
eval.py CHANGED
@@ -1,19 +1,14 @@
1
  import os
2
  import json
3
- import shutil
4
- import time
5
  import argparse
6
  import subprocess
7
- import traceback
8
  import threading
9
  import concurrent.futures
10
  from datetime import datetime
11
- from threading import Timer
12
  from e2b_desktop import Sandbox
13
  from huggingface_hub import get_token
14
-
15
- from smolagents import CodeAgent, OpenAIServerModel
16
- from smolagents.monitoring import LogLevel
17
  from e2bqwen import QwenVLAPIModel, E2BVisionAgent
18
 
19
  from dotenv import load_dotenv
@@ -27,7 +22,9 @@ try:
27
  if not HUGGINGFACE_API_KEY:
28
  HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
29
  if not HUGGINGFACE_API_KEY:
30
- raise ValueError("No Hugging Face token found. Please login with `huggingface-cli login` or set HUGGINGFACE_API_KEY environment variable")
 
 
31
  except ImportError:
32
  # Fall back if huggingface_hub is old version without get_token
33
  HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
@@ -38,24 +35,29 @@ SANDBOX_TIMEOUT = 600 # 10 minutes
38
  # Thread lock for print statements to avoid garbled output
39
  print_lock = threading.Lock()
40
 
 
41
  def thread_safe_print(*args, **kwargs):
42
  """Thread-safe print function"""
43
  with print_lock:
44
  print(*args, **kwargs)
45
 
 
46
  # Get git hash for folder naming
47
  def get_git_hash():
48
  try:
49
- result = subprocess.run(['git', 'rev-parse', '--short', 'HEAD'],
50
- stdout=subprocess.PIPE,
51
- stderr=subprocess.PIPE,
52
- text=True)
 
 
53
  if result.returncode == 0:
54
  return result.stdout.strip()
55
  return "nogit"
56
  except:
57
  return "nogit"
58
 
 
59
  def create_agent(data_dir, desktop, max_steps: int):
60
  """Create an agent with the E2B desktop sandbox"""
61
  model = QwenVLAPIModel(
@@ -75,6 +77,7 @@ def create_agent(data_dir, desktop, max_steps: int):
75
  planning_interval=10,
76
  )
77
 
 
78
  def get_agent_summary_erase_images(agent):
79
  """Get agent summary and erase images to save space"""
80
  for memory_step in agent.memory.steps:
@@ -82,82 +85,104 @@ def get_agent_summary_erase_images(agent):
82
  memory_step.observations_images = None
83
  return agent.memory.get_succinct_steps()
84
 
 
85
  def chat_message_to_json(obj):
86
  """Custom JSON serializer for ChatMessage and related objects"""
87
- if hasattr(obj, '__dict__'):
88
  # Create a copy of the object's __dict__ to avoid modifying the original
89
  result = obj.__dict__.copy()
90
-
91
  # Remove the 'raw' field which may contain non-serializable data
92
- if 'raw' in result:
93
- del result['raw']
94
-
95
  # Process the content or tool_calls if they exist
96
- if 'content' in result and result['content'] is not None:
97
- if hasattr(result['content'], '__dict__'):
98
- result['content'] = chat_message_to_json(result['content'])
99
-
100
- if 'tool_calls' in result and result['tool_calls'] is not None:
101
- result['tool_calls'] = [chat_message_to_json(tc) for tc in result['tool_calls']]
102
-
 
 
103
  return result
104
  elif isinstance(obj, (list, tuple)):
105
  return [chat_message_to_json(item) for item in obj]
106
  else:
107
  return obj
108
 
 
109
  def save_final_status(folder, status: str, summary, error_message=None) -> None:
110
  """Save metadata about the run"""
111
  metadata_path = os.path.join(folder, "metadata.json")
112
  with open(metadata_path, "w") as output_file:
113
- output_file.write(json.dumps({
114
- "status": status,
115
- "summary": summary,
116
- "error_message": error_message
117
- }, default=chat_message_to_json))
 
 
118
 
119
  def run_example_once(example_name, example_text, run_index, example_dir, max_steps):
120
  """Run a single example once and return the result"""
121
  run_dir = os.path.join(example_dir, f"run_{run_index}")
122
  os.makedirs(run_dir, exist_ok=True)
123
-
124
  # Save the example text
125
  with open(os.path.join(run_dir, "task.txt"), "w") as f:
126
  f.write(example_text)
127
-
128
  thread_safe_print(f" Starting run {run_index} for example '{example_name}'")
129
-
130
  # Create a new sandbox for this run
131
  desktop = None
132
  try:
133
  desktop = Sandbox(
134
- api_key=E2B_API_KEY,
135
- resolution=(WIDTH, HEIGHT),
136
- dpi=96,
137
- timeout=SANDBOX_TIMEOUT
 
138
  )
139
-
140
  # Initialize the desktop environment
141
  setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
142
  desktop.commands.run(setup_cmd)
143
-
144
  # Create and run the agent
145
  agent = create_agent(data_dir=run_dir, desktop=desktop, max_steps=max_steps)
 
 
 
146
  try:
147
- agent.run(task=example_text)
148
  summary = get_agent_summary_erase_images(agent)
149
  save_final_status(run_dir, "completed", summary=summary)
150
- thread_safe_print(f" ✓ Example '{example_name}' run {run_index} completed successfully")
 
 
151
  result = {"status": "completed", "run_dir": run_dir}
152
  except Exception as e:
153
  error_message = f"Error in agent execution: {str(e)}"
154
- thread_safe_print(f" ✗ Example '{example_name}' run {run_index} failed: {error_message}")
155
- summary = get_agent_summary_erase_images(agent) if hasattr(agent, 'memory') else None
156
- save_final_status(run_dir, "failed", summary=summary, error_message=error_message)
 
 
 
 
 
 
 
 
157
  result = {"status": "failed", "run_dir": run_dir, "error": error_message}
158
  except Exception as e:
159
  error_message = f"Error setting up sandbox: {str(e)}"
160
- thread_safe_print(f" ✗ Example '{example_name}' run {run_index} failed: {error_message}")
 
 
161
  save_final_status(run_dir, "failed", summary=None, error_message=error_message)
162
  result = {"status": "failed", "run_dir": run_dir, "error": error_message}
163
  finally:
@@ -167,21 +192,24 @@ def run_example_once(example_name, example_text, run_index, example_dir, max_ste
167
  desktop.kill()
168
  except:
169
  pass
170
-
171
  return result
172
 
 
173
  def run_example(example_name, example_text, num_runs, example_dir, max_steps):
174
  """Run a single example multiple times using threads for each run"""
175
  thread_safe_print(f"\nRunning example '{example_name}': '{example_text[:50]}...'")
176
-
177
  results = []
178
  with concurrent.futures.ThreadPoolExecutor(max_workers=num_runs) as executor:
179
  # Submit all runs to the executor
180
  future_to_run = {
181
- executor.submit(run_example_once, example_name, example_text, j, example_dir, max_steps): j
 
 
182
  for j in range(num_runs)
183
  }
184
-
185
  # Collect results as they complete
186
  for future in concurrent.futures.as_completed(future_to_run):
187
  run_index = future_to_run[future]
@@ -189,31 +217,34 @@ def run_example(example_name, example_text, num_runs, example_dir, max_steps):
189
  result = future.result()
190
  results.append(result)
191
  except Exception as exc:
192
- thread_safe_print(f" ✗ Run {run_index} for '{example_name}' generated an exception: {exc}")
193
- results.append({
194
- "status": "error",
195
- "run_index": run_index,
196
- "error": str(exc)
197
- })
198
-
199
  return results
200
 
 
201
  def run_evaluation(examples, num_runs, output_dir, max_parallel, max_steps):
202
  """Run each example n times and save the results"""
203
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
204
  git_hash = get_git_hash()
205
  eval_dir = os.path.join(output_dir, f"eval_{timestamp}_{git_hash}")
206
  os.makedirs(eval_dir, exist_ok=True)
207
-
208
  thread_safe_print(f"Starting evaluation. Results will be saved to: {eval_dir}")
209
- thread_safe_print(f"Will run {len(examples)} examples, {num_runs} times each, with {max_parallel} parallel examples")
210
-
 
 
211
  # Save examples to the evaluation directory
212
  with open(os.path.join(eval_dir, "examples.json"), "w") as f:
213
  json.dump(examples, f, indent=2)
214
-
215
  all_results = {}
216
-
217
  # Run examples in parallel, but limit the number of parallel examples
218
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_parallel) as executor:
219
  # Prepare the example directories first
@@ -222,66 +253,94 @@ def run_evaluation(examples, num_runs, output_dir, max_parallel, max_steps):
222
  example_dir = os.path.join(eval_dir, f"example_{example_name}")
223
  os.makedirs(example_dir, exist_ok=True)
224
  example_dirs[example_name] = example_dir
225
-
226
  # Submit all examples to the executor
227
  future_to_example = {
228
- executor.submit(run_example, example_name, example_text, num_runs, example_dirs[example_name], max_steps): example_name
 
 
 
 
 
 
 
229
  for example_name, example_text in examples.items()
230
  }
231
-
232
  # Collect results as they complete
233
  for future in concurrent.futures.as_completed(future_to_example):
234
  example_name = future_to_example[future]
235
  try:
236
  results = future.result()
237
  all_results[example_name] = results
238
-
239
  # Calculate success rate for this example
240
  success_count = sum(1 for r in results if r["status"] == "completed")
241
- thread_safe_print(f"Example '{example_name}' complete: {success_count}/{num_runs} successful runs ({success_count/num_runs*100:.1f}%)")
 
 
242
  except Exception as exc:
243
- thread_safe_print(f"Example '{example_name}' generated an exception: {exc}")
 
 
244
  all_results[example_name] = [{"status": "error", "error": str(exc)}]
245
-
246
  # Calculate overall results and success rates
247
  success_counts = {
248
  example_name: sum(1 for r in results if r["status"] == "completed")
249
  for example_name, results in all_results.items()
250
  }
251
-
252
  total_runs = sum(len(results) for results in all_results.values())
253
  total_successes = sum(success_counts.values())
254
-
255
  # Save summary to evaluation directory
256
  summary = {
257
  "total_runs": total_runs,
258
  "total_successes": total_successes,
259
  "success_rate": total_successes / total_runs if total_runs > 0 else 0,
260
  "example_success_rates": {
261
- example_name: success_counts[example_name] / len(all_results[example_name])
262
  for example_name in examples
263
- }
264
  }
265
-
266
  with open(os.path.join(eval_dir, "summary.json"), "w") as f:
267
  json.dump(summary, f, indent=2)
268
-
269
  thread_safe_print(f"\nEvaluation complete. Results saved to: {eval_dir}")
270
- thread_safe_print(f"Overall success rate: {summary['success_rate']*100:.1f}% ({total_successes}/{total_runs})")
 
 
271
  for example_name in examples:
272
  success_rate = summary["example_success_rates"][example_name] * 100
273
  thread_safe_print(f"Example '{example_name}': {success_rate:.1f}% success")
274
-
275
  return eval_dir
276
 
 
277
  def main():
278
  parser = argparse.ArgumentParser(description="Evaluate computer agent on examples")
279
- parser.add_argument("--num-runs", type=int, default=3, help="Number of runs per example")
280
- parser.add_argument("--output-dir", type=str, default="./eval_results", help="Output directory for evaluation results")
281
- parser.add_argument("--max-parallel", type=int, default=2, help="Maximum number of examples to run in parallel")
282
- parser.add_argument("--max-steps", type=int, default=200, help="Maximum number of steps in each run")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  args = parser.parse_args()
284
-
285
  # Examples from the original code
286
  examples = {
287
  "puppies": "Find me pictures of cute puppies",
@@ -293,12 +352,15 @@ def main():
293
  "flux": "Go on the Hugging Face Hub, find a Space for FLUX1.dev, and generate a picture of the Golden Gate bridge.",
294
  "hf": "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
295
  }
296
-
297
  # Create output directory if it doesn't exist
298
  os.makedirs(args.output_dir, exist_ok=True)
299
-
300
  # Run the evaluation
301
- run_evaluation(examples, args.num_runs, args.output_dir, args.max_parallel, args.max_steps)
 
 
 
302
 
303
  if __name__ == "__main__":
304
- main()
 
1
  import os
2
  import json
 
 
3
  import argparse
4
  import subprocess
 
5
  import threading
6
  import concurrent.futures
7
  from datetime import datetime
 
8
  from e2b_desktop import Sandbox
9
  from huggingface_hub import get_token
10
+ from io import BytesIO
11
+ from PIL import Image
 
12
  from e2bqwen import QwenVLAPIModel, E2BVisionAgent
13
 
14
  from dotenv import load_dotenv
 
22
  if not HUGGINGFACE_API_KEY:
23
  HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
24
  if not HUGGINGFACE_API_KEY:
25
+ raise ValueError(
26
+ "No Hugging Face token found. Please login with `huggingface-cli login` or set HUGGINGFACE_API_KEY environment variable"
27
+ )
28
  except ImportError:
29
  # Fall back if huggingface_hub is old version without get_token
30
  HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
 
35
  # Thread lock for print statements to avoid garbled output
36
  print_lock = threading.Lock()
37
 
38
+
39
  def thread_safe_print(*args, **kwargs):
40
  """Thread-safe print function"""
41
  with print_lock:
42
  print(*args, **kwargs)
43
 
44
+
45
  # Get git hash for folder naming
46
  def get_git_hash():
47
  try:
48
+ result = subprocess.run(
49
+ ["git", "rev-parse", "--short", "HEAD"],
50
+ stdout=subprocess.PIPE,
51
+ stderr=subprocess.PIPE,
52
+ text=True,
53
+ )
54
  if result.returncode == 0:
55
  return result.stdout.strip()
56
  return "nogit"
57
  except:
58
  return "nogit"
59
 
60
+
61
  def create_agent(data_dir, desktop, max_steps: int):
62
  """Create an agent with the E2B desktop sandbox"""
63
  model = QwenVLAPIModel(
 
77
  planning_interval=10,
78
  )
79
 
80
+
81
  def get_agent_summary_erase_images(agent):
82
  """Get agent summary and erase images to save space"""
83
  for memory_step in agent.memory.steps:
 
85
  memory_step.observations_images = None
86
  return agent.memory.get_succinct_steps()
87
 
88
+
89
  def chat_message_to_json(obj):
90
  """Custom JSON serializer for ChatMessage and related objects"""
91
+ if hasattr(obj, "__dict__"):
92
  # Create a copy of the object's __dict__ to avoid modifying the original
93
  result = obj.__dict__.copy()
94
+
95
  # Remove the 'raw' field which may contain non-serializable data
96
+ if "raw" in result:
97
+ del result["raw"]
98
+
99
  # Process the content or tool_calls if they exist
100
+ if "content" in result and result["content"] is not None:
101
+ if hasattr(result["content"], "__dict__"):
102
+ result["content"] = chat_message_to_json(result["content"])
103
+
104
+ if "tool_calls" in result and result["tool_calls"] is not None:
105
+ result["tool_calls"] = [
106
+ chat_message_to_json(tc) for tc in result["tool_calls"]
107
+ ]
108
+
109
  return result
110
  elif isinstance(obj, (list, tuple)):
111
  return [chat_message_to_json(item) for item in obj]
112
  else:
113
  return obj
114
 
115
+
116
  def save_final_status(folder, status: str, summary, error_message=None) -> None:
117
  """Save metadata about the run"""
118
  metadata_path = os.path.join(folder, "metadata.json")
119
  with open(metadata_path, "w") as output_file:
120
+ output_file.write(
121
+ json.dumps(
122
+ {"status": status, "summary": summary, "error_message": error_message},
123
+ default=chat_message_to_json,
124
+ )
125
+ )
126
+
127
 
128
  def run_example_once(example_name, example_text, run_index, example_dir, max_steps):
129
  """Run a single example once and return the result"""
130
  run_dir = os.path.join(example_dir, f"run_{run_index}")
131
  os.makedirs(run_dir, exist_ok=True)
132
+
133
  # Save the example text
134
  with open(os.path.join(run_dir, "task.txt"), "w") as f:
135
  f.write(example_text)
136
+
137
  thread_safe_print(f" Starting run {run_index} for example '{example_name}'")
138
+
139
  # Create a new sandbox for this run
140
  desktop = None
141
  try:
142
  desktop = Sandbox(
143
+ api_key=E2B_API_KEY,
144
+ resolution=(WIDTH, HEIGHT),
145
+ dpi=96,
146
+ timeout=SANDBOX_TIMEOUT,
147
+ template="k0wmnzir0zuzye6dndlw",
148
  )
149
+
150
  # Initialize the desktop environment
151
  setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
152
  desktop.commands.run(setup_cmd)
153
+
154
  # Create and run the agent
155
  agent = create_agent(data_dir=run_dir, desktop=desktop, max_steps=max_steps)
156
+
157
+ screenshot_bytes = desktop.screenshot(format="bytes")
158
+ initial_screenshot = Image.open(BytesIO(screenshot_bytes))
159
  try:
160
+ agent.run(task=example_text, images=[initial_screenshot])
161
  summary = get_agent_summary_erase_images(agent)
162
  save_final_status(run_dir, "completed", summary=summary)
163
+ thread_safe_print(
164
+ f" ✓ Example '{example_name}' run {run_index} completed successfully"
165
+ )
166
  result = {"status": "completed", "run_dir": run_dir}
167
  except Exception as e:
168
  error_message = f"Error in agent execution: {str(e)}"
169
+ thread_safe_print(
170
+ f" ✗ Example '{example_name}' run {run_index} failed: {error_message}"
171
+ )
172
+ summary = (
173
+ get_agent_summary_erase_images(agent)
174
+ if hasattr(agent, "memory")
175
+ else None
176
+ )
177
+ save_final_status(
178
+ run_dir, "failed", summary=summary, error_message=error_message
179
+ )
180
  result = {"status": "failed", "run_dir": run_dir, "error": error_message}
181
  except Exception as e:
182
  error_message = f"Error setting up sandbox: {str(e)}"
183
+ thread_safe_print(
184
+ f" ✗ Example '{example_name}' run {run_index} failed: {error_message}"
185
+ )
186
  save_final_status(run_dir, "failed", summary=None, error_message=error_message)
187
  result = {"status": "failed", "run_dir": run_dir, "error": error_message}
188
  finally:
 
192
  desktop.kill()
193
  except:
194
  pass
195
+
196
  return result
197
 
198
+
199
  def run_example(example_name, example_text, num_runs, example_dir, max_steps):
200
  """Run a single example multiple times using threads for each run"""
201
  thread_safe_print(f"\nRunning example '{example_name}': '{example_text[:50]}...'")
202
+
203
  results = []
204
  with concurrent.futures.ThreadPoolExecutor(max_workers=num_runs) as executor:
205
  # Submit all runs to the executor
206
  future_to_run = {
207
+ executor.submit(
208
+ run_example_once, example_name, example_text, j, example_dir, max_steps
209
+ ): j
210
  for j in range(num_runs)
211
  }
212
+
213
  # Collect results as they complete
214
  for future in concurrent.futures.as_completed(future_to_run):
215
  run_index = future_to_run[future]
 
217
  result = future.result()
218
  results.append(result)
219
  except Exception as exc:
220
+ thread_safe_print(
221
+ f" ✗ Run {run_index} for '{example_name}' generated an exception: {exc}"
222
+ )
223
+ results.append(
224
+ {"status": "error", "run_index": run_index, "error": str(exc)}
225
+ )
226
+
227
  return results
228
 
229
+
230
  def run_evaluation(examples, num_runs, output_dir, max_parallel, max_steps):
231
  """Run each example n times and save the results"""
232
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
233
  git_hash = get_git_hash()
234
  eval_dir = os.path.join(output_dir, f"eval_{timestamp}_{git_hash}")
235
  os.makedirs(eval_dir, exist_ok=True)
236
+
237
  thread_safe_print(f"Starting evaluation. Results will be saved to: {eval_dir}")
238
+ thread_safe_print(
239
+ f"Will run {len(examples)} examples, {num_runs} times each, with {max_parallel} parallel examples"
240
+ )
241
+
242
  # Save examples to the evaluation directory
243
  with open(os.path.join(eval_dir, "examples.json"), "w") as f:
244
  json.dump(examples, f, indent=2)
245
+
246
  all_results = {}
247
+
248
  # Run examples in parallel, but limit the number of parallel examples
249
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_parallel) as executor:
250
  # Prepare the example directories first
 
253
  example_dir = os.path.join(eval_dir, f"example_{example_name}")
254
  os.makedirs(example_dir, exist_ok=True)
255
  example_dirs[example_name] = example_dir
256
+
257
  # Submit all examples to the executor
258
  future_to_example = {
259
+ executor.submit(
260
+ run_example,
261
+ example_name,
262
+ example_text,
263
+ num_runs,
264
+ example_dirs[example_name],
265
+ max_steps,
266
+ ): example_name
267
  for example_name, example_text in examples.items()
268
  }
269
+
270
  # Collect results as they complete
271
  for future in concurrent.futures.as_completed(future_to_example):
272
  example_name = future_to_example[future]
273
  try:
274
  results = future.result()
275
  all_results[example_name] = results
276
+
277
  # Calculate success rate for this example
278
  success_count = sum(1 for r in results if r["status"] == "completed")
279
+ thread_safe_print(
280
+ f"Example '{example_name}' complete: {success_count}/{num_runs} successful runs ({success_count / num_runs * 100:.1f}%)"
281
+ )
282
  except Exception as exc:
283
+ thread_safe_print(
284
+ f"Example '{example_name}' generated an exception: {exc}"
285
+ )
286
  all_results[example_name] = [{"status": "error", "error": str(exc)}]
287
+
288
  # Calculate overall results and success rates
289
  success_counts = {
290
  example_name: sum(1 for r in results if r["status"] == "completed")
291
  for example_name, results in all_results.items()
292
  }
293
+
294
  total_runs = sum(len(results) for results in all_results.values())
295
  total_successes = sum(success_counts.values())
296
+
297
  # Save summary to evaluation directory
298
  summary = {
299
  "total_runs": total_runs,
300
  "total_successes": total_successes,
301
  "success_rate": total_successes / total_runs if total_runs > 0 else 0,
302
  "example_success_rates": {
303
+ example_name: success_counts[example_name] / len(all_results[example_name])
304
  for example_name in examples
305
+ },
306
  }
307
+
308
  with open(os.path.join(eval_dir, "summary.json"), "w") as f:
309
  json.dump(summary, f, indent=2)
310
+
311
  thread_safe_print(f"\nEvaluation complete. Results saved to: {eval_dir}")
312
+ thread_safe_print(
313
+ f"Overall success rate: {summary['success_rate'] * 100:.1f}% ({total_successes}/{total_runs})"
314
+ )
315
  for example_name in examples:
316
  success_rate = summary["example_success_rates"][example_name] * 100
317
  thread_safe_print(f"Example '{example_name}': {success_rate:.1f}% success")
318
+
319
  return eval_dir
320
 
321
+
322
  def main():
323
  parser = argparse.ArgumentParser(description="Evaluate computer agent on examples")
324
+ parser.add_argument(
325
+ "--num-runs", type=int, default=3, help="Number of runs per example"
326
+ )
327
+ parser.add_argument(
328
+ "--output-dir",
329
+ type=str,
330
+ default="./eval_results",
331
+ help="Output directory for evaluation results",
332
+ )
333
+ parser.add_argument(
334
+ "--max-parallel",
335
+ type=int,
336
+ default=2,
337
+ help="Maximum number of examples to run in parallel",
338
+ )
339
+ parser.add_argument(
340
+ "--max-steps", type=int, default=200, help="Maximum number of steps in each run"
341
+ )
342
  args = parser.parse_args()
343
+
344
  # Examples from the original code
345
  examples = {
346
  "puppies": "Find me pictures of cute puppies",
 
352
  "flux": "Go on the Hugging Face Hub, find a Space for FLUX1.dev, and generate a picture of the Golden Gate bridge.",
353
  "hf": "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
354
  }
355
+
356
  # Create output directory if it doesn't exist
357
  os.makedirs(args.output_dir, exist_ok=True)
358
+
359
  # Run the evaluation
360
+ run_evaluation(
361
+ examples, args.num_runs, args.output_dir, args.max_parallel, args.max_steps
362
+ )
363
+
364
 
365
  if __name__ == "__main__":
366
+ main()
model_replay.py CHANGED
@@ -7,43 +7,39 @@ import json
7
 
8
  class FakeModelReplayLog(Model):
9
  """A model class that returns pre-recorded responses from a log file.
10
-
11
  This class is useful for testing and debugging purposes, as it doesn't make
12
  actual API calls but instead returns responses from a pre-recorded log file.
13
-
14
  Parameters:
15
- log_url (str, optional):
16
  URL to the log file. Defaults to the smolagents example log.
17
  **kwargs: Additional keyword arguments passed to the Model base class.
18
  """
19
-
20
- def __init__(
21
- self,
22
- log_folder: str,
23
- **kwargs
24
- ):
25
  super().__init__(**kwargs)
26
  self.dataset_name = "smolagents/computer-agent-logs"
27
  self.log_folder = log_folder
28
  self.call_counter = 0
29
  self.model_outputs = self._load_model_outputs()
30
-
31
  def _load_model_outputs(self) -> List[str]:
32
  """Load model outputs from the log file using HuggingFace datasets library."""
33
  # Download the file from Hugging Face Hub
34
  file_path = hf_hub_download(
35
  repo_id=self.dataset_name,
36
  filename=self.log_folder + "/metadata.json",
37
- repo_type="dataset"
38
  )
39
-
40
  # Load and parse the JSON data
41
- with open(file_path, 'r') as f:
42
  log_data = json.load(f)
43
-
44
  # Extract only the model_output from each step in tool_calls
45
  model_outputs = []
46
-
47
  for step in log_data["summary"][1:]:
48
  model_outputs.append(step["model_output_message"]["content"])
49
 
@@ -56,17 +52,17 @@ class FakeModelReplayLog(Model):
56
  stop_sequences: Optional[List[str]] = None,
57
  grammar: Optional[str] = None,
58
  tools_to_call_from: Optional[List[Tool]] = None,
59
- **kwargs
60
  ) -> ChatMessage:
61
  """Return the next pre-recorded response from the log file.
62
-
63
  Parameters:
64
  messages: List of input messages (ignored).
65
  stop_sequences: Optional list of stop sequences (ignored).
66
  grammar: Optional grammar specification (ignored).
67
  tools_to_call_from: Optional list of tools (ignored).
68
  **kwargs: Additional keyword arguments (ignored).
69
-
70
  Returns:
71
  ChatMessage: The next pre-recorded response.
72
  """
@@ -82,12 +78,11 @@ class FakeModelReplayLog(Model):
82
  # Token counts are simulated
83
  self.last_input_token_count = len(str(messages)) // 4 # Rough approximation
84
  self.last_output_token_count = len(content) // 4 # Rough approximation
85
-
86
  # Create and return a ChatMessage
87
  return ChatMessage(
88
  role=MessageRole.ASSISTANT,
89
  content=content,
90
  tool_calls=None,
91
- raw={"source": "pre-recorded log", "call_number": self.call_counter}
92
  )
93
-
 
7
 
8
  class FakeModelReplayLog(Model):
9
  """A model class that returns pre-recorded responses from a log file.
10
+
11
  This class is useful for testing and debugging purposes, as it doesn't make
12
  actual API calls but instead returns responses from a pre-recorded log file.
13
+
14
  Parameters:
15
+ log_url (str, optional):
16
  URL to the log file. Defaults to the smolagents example log.
17
  **kwargs: Additional keyword arguments passed to the Model base class.
18
  """
19
+
20
def __init__(self, log_folder: str, **kwargs):
    """Initialize the replay model.

    Parameters:
        log_folder: Folder inside the logs dataset that holds metadata.json.
        **kwargs: Forwarded unchanged to the Model base class.
    """
    super().__init__(**kwargs)
    self.log_folder = log_folder
    self.dataset_name = "smolagents/computer-agent-logs"
    self.call_counter = 0
    # Eagerly fetch every recorded model output so replay needs no further I/O.
    self.model_outputs = self._load_model_outputs()
26
+
27
  def _load_model_outputs(self) -> List[str]:
28
  """Load model outputs from the log file using HuggingFace datasets library."""
29
  # Download the file from Hugging Face Hub
30
  file_path = hf_hub_download(
31
  repo_id=self.dataset_name,
32
  filename=self.log_folder + "/metadata.json",
33
+ repo_type="dataset",
34
  )
35
+
36
  # Load and parse the JSON data
37
+ with open(file_path, "r") as f:
38
  log_data = json.load(f)
39
+
40
  # Extract only the model_output from each step in tool_calls
41
  model_outputs = []
42
+
43
  for step in log_data["summary"][1:]:
44
  model_outputs.append(step["model_output_message"]["content"])
45
 
 
52
  stop_sequences: Optional[List[str]] = None,
53
  grammar: Optional[str] = None,
54
  tools_to_call_from: Optional[List[Tool]] = None,
55
+ **kwargs,
56
  ) -> ChatMessage:
57
  """Return the next pre-recorded response from the log file.
58
+
59
  Parameters:
60
  messages: List of input messages (ignored).
61
  stop_sequences: Optional list of stop sequences (ignored).
62
  grammar: Optional grammar specification (ignored).
63
  tools_to_call_from: Optional list of tools (ignored).
64
  **kwargs: Additional keyword arguments (ignored).
65
+
66
  Returns:
67
  ChatMessage: The next pre-recorded response.
68
  """
 
78
  # Token counts are simulated
79
  self.last_input_token_count = len(str(messages)) // 4 # Rough approximation
80
  self.last_output_token_count = len(content) // 4 # Rough approximation
81
+
82
  # Create and return a ChatMessage
83
  return ChatMessage(
84
  role=MessageRole.ASSISTANT,
85
  content=content,
86
  tool_calls=None,
87
+ raw={"source": "pre-recorded log", "call_number": self.call_counter},
88
  )
 
show_eval.py CHANGED
@@ -8,136 +8,153 @@ from flask_cors import CORS
8
  app = Flask(__name__)
9
  CORS(app) # Enable CORS for all routes
10
 
 
11
  # Serve the HTML viewer
12
- @app.route('/')
13
  def index():
14
- return render_template('viewer.html')
 
15
 
16
  # Get list of available evaluations
17
- @app.route('/api/evals')
18
  def list_evals():
19
- base_dir = request.args.get('path', './eval_results')
20
  if not os.path.exists(base_dir):
21
  return jsonify({"error": f"Path {base_dir} does not exist"}), 404
22
-
23
  eval_dirs = []
24
  for item in os.listdir(base_dir):
25
  full_path = os.path.join(base_dir, item)
26
- if os.path.isdir(full_path) and item.startswith('eval_'):
27
  eval_dirs.append(item)
28
-
29
  return jsonify(eval_dirs)
30
 
 
31
  # Get examples for an evaluation
32
- @app.route('/api/eval/<eval_id>/examples')
33
  def get_examples(eval_id):
34
- base_dir = request.args.get('path', './eval_results')
35
  eval_path = os.path.join(base_dir, eval_id)
36
-
37
  # Try to read examples.json
38
- examples_json_path = os.path.join(eval_path, 'examples.json')
39
  examples = {}
40
-
41
  if os.path.exists(examples_json_path):
42
  try:
43
- with open(examples_json_path, 'r') as f:
44
  examples = json.load(f)
45
  except json.JSONDecodeError:
46
  app.logger.error(f"Error parsing examples.json at {examples_json_path}")
47
-
48
  # If examples.json doesn't exist or is empty, scan for example directories
49
  if not examples:
50
  for item in os.listdir(eval_path):
51
- if os.path.isdir(os.path.join(eval_path, item)) and item.startswith('example_'):
52
- example_id = item.replace('example_', '')
 
 
53
  example_dir = os.path.join(eval_path, item)
54
-
55
  # Find the first run and read task.txt
56
  run_dirs = []
57
  for run_item in os.listdir(example_dir):
58
  run_path = os.path.join(example_dir, run_item)
59
- if os.path.isdir(run_path) and run_item.startswith('run_'):
60
  run_dirs.append(run_item)
61
-
62
  if run_dirs:
63
- task_path = os.path.join(example_dir, run_dirs[0], 'task.txt')
64
  if os.path.exists(task_path):
65
- with open(task_path, 'r') as f:
66
  examples[example_id] = f.read().strip()
67
  else:
68
  # If no task.txt, try reading from metadata.json
69
- metadata_path = os.path.join(example_dir, run_dirs[0], 'metadata.json')
 
 
70
  if os.path.exists(metadata_path):
71
  try:
72
- with open(metadata_path, 'r') as f:
73
  metadata = json.load(f)
74
  # Look for task in summary[0].task
75
- if 'summary' in metadata and metadata['summary'] and 'task' in metadata['summary'][0]:
76
- examples[example_id] = metadata['summary'][0]['task']
 
 
 
 
 
 
77
  except:
78
  # Default to directory name if all else fails
79
  examples[example_id] = f"Task for {example_id}"
80
  else:
81
  examples[example_id] = f"Task for {example_id}"
82
-
83
  return jsonify(examples)
84
 
 
85
  # Get runs for an example
86
- @app.route('/api/eval/<eval_id>/example/<example_id>/runs')
87
  def get_runs(eval_id, example_id):
88
- base_dir = request.args.get('path', './eval_results')
89
- example_dir = os.path.join(base_dir, eval_id, f'example_{example_id}')
90
-
91
  if not os.path.exists(example_dir):
92
  return jsonify({"error": f"Example directory not found: {example_dir}"}), 404
93
-
94
  runs = []
95
  for item in os.listdir(example_dir):
96
  item_path = os.path.join(example_dir, item)
97
- if os.path.isdir(item_path) and item.startswith('run_'):
98
  run_id = item
99
-
100
  # Try to get status from metadata.json
101
- metadata_path = os.path.join(item_path, 'metadata.json')
102
- status = 'unknown'
103
-
104
  if os.path.exists(metadata_path):
105
  try:
106
- with open(metadata_path, 'r') as f:
107
  metadata = json.load(f)
108
- status = metadata.get('status', 'unknown')
109
  except Exception as e:
110
- app.logger.error(f"Error reading metadata.json for {run_id}: {str(e)}")
111
-
112
- runs.append({'id': run_id, 'status': status})
 
 
113
  app.logger.info(f"runs: {runs}")
114
 
115
  return jsonify(runs)
116
 
 
117
  # Get metadata for a run
118
- @app.route('/api/eval/<eval_id>/example/<example_id>/run/<run_id>/metadata')
119
  def get_metadata(eval_id, example_id, run_id):
120
- base_dir = request.args.get('path', './eval_results')
121
- run_dir = os.path.join(base_dir, eval_id, f'example_{example_id}', run_id)
122
- metadata_path = os.path.join(run_dir, 'metadata.json')
123
  app.logger.info(f"metadata: {metadata_path}")
124
 
125
  if not os.path.exists(metadata_path):
126
  return jsonify({"error": "Metadata not found", "path": metadata_path}), 404
127
-
128
  try:
129
- with open(metadata_path, 'r') as f:
130
  metadata_content = f.read()
131
  if not metadata_content.strip():
132
  return jsonify({"error": "Metadata file is empty"}), 404
133
-
134
  metadata = json.loads(metadata_content)
135
  return jsonify(metadata)
136
  except json.JSONDecodeError as e:
137
  error_info = {
138
  "error": "Invalid JSON in metadata file",
139
  "details": str(e),
140
- "path": metadata_path
141
  }
142
  app.logger.error(f"JSON error in {metadata_path}: {str(e)}")
143
  return jsonify(error_info), 400
@@ -146,54 +163,56 @@ def get_metadata(eval_id, example_id, run_id):
146
  "error": "Error reading metadata file",
147
  "details": str(e),
148
  "traceback": traceback.format_exc(),
149
- "path": metadata_path
150
  }
151
  app.logger.error(f"Error reading {metadata_path}: {str(e)}")
152
  return jsonify(error_info), 500
153
 
 
154
  # Get screenshots for a run
155
- @app.route('/api/eval/<eval_id>/example/<example_id>/run/<run_id>/screenshots')
156
  def get_screenshots(eval_id, example_id, run_id):
157
- base_dir = request.args.get('path', './eval_results')
158
- run_dir = os.path.join(base_dir, eval_id, f'example_{example_id}', run_id)
159
-
160
  if not os.path.exists(run_dir):
161
  return jsonify({"error": f"Run directory not found: {run_dir}"}), 404
162
-
163
  screenshots = []
164
- for ext in ['png', 'jpg', 'jpeg']:
165
- pattern = os.path.join(run_dir, f'*.{ext}')
166
  for file_path in glob.glob(pattern):
167
  filename = os.path.basename(file_path)
168
- screenshots.append({
169
- 'name': filename,
170
- 'path': f'/api/image?path={file_path}'
171
- })
172
-
173
  # Sort by filename
174
- screenshots.sort(key=lambda x: x['name'])
175
 
176
  app.logger.info(f"screenshots: {screenshots}")
177
-
178
  return jsonify(screenshots)
179
 
 
180
  # Serve an image file
181
- @app.route('/api/image')
182
  def get_image():
183
- path = request.args.get('path')
184
  if not path:
185
  return jsonify({"error": "No path provided"}), 400
186
-
187
  if not os.path.exists(path):
188
  return jsonify({"error": f"Image not found at path: {path}"}), 404
189
-
190
  try:
191
  return send_file(path)
192
  except Exception as e:
193
  return jsonify({"error": f"Error serving image: {str(e)}"}), 500
194
 
195
- if __name__ == '__main__':
 
196
  print("Evaluation Server is running at http://localhost:8000")
197
  print("Press Ctrl+C to stop the server")
198
-
199
- app.run(debug=True, port=8000)
 
8
  app = Flask(__name__)
9
  CORS(app) # Enable CORS for all routes
10
 
11
+
12
# Serve the HTML viewer
@app.route("/")
def index():
    """Render the single-page evaluation viewer."""
    return render_template("viewer.html")
+
17
 
18
# Get list of available evaluations
@app.route("/api/evals")
def list_evals():
    """Return the names of eval_* directories under the requested base path.

    Responds 404 when the base path does not exist.
    """
    base_dir = request.args.get("path", "./eval_results")
    if not os.path.exists(base_dir):
        return jsonify({"error": f"Path {base_dir} does not exist"}), 404

    eval_dirs = [
        entry
        for entry in os.listdir(base_dir)
        if entry.startswith("eval_") and os.path.isdir(os.path.join(base_dir, entry))
    ]

    return jsonify(eval_dirs)
 
33
+
34
# Get examples for an evaluation
@app.route("/api/eval/<eval_id>/examples")
def get_examples(eval_id):
    """Return {example_id: task description} for one evaluation.

    Prefers examples.json at the eval root; otherwise scans example_*
    directories and reads each example's first run's task.txt, falling back
    to metadata.json, then to a placeholder string.
    """
    base_dir = request.args.get("path", "./eval_results")
    eval_path = os.path.join(base_dir, eval_id)

    # Try to read examples.json
    examples_json_path = os.path.join(eval_path, "examples.json")
    examples = {}

    if os.path.exists(examples_json_path):
        try:
            with open(examples_json_path, "r") as f:
                examples = json.load(f)
        except json.JSONDecodeError:
            app.logger.error(f"Error parsing examples.json at {examples_json_path}")

    # If examples.json doesn't exist or is empty, scan for example directories
    if not examples:
        for item in os.listdir(eval_path):
            if os.path.isdir(os.path.join(eval_path, item)) and item.startswith(
                "example_"
            ):
                example_id = item.replace("example_", "")
                example_dir = os.path.join(eval_path, item)

                # Find the first run and read task.txt
                run_dirs = []
                for run_item in os.listdir(example_dir):
                    run_path = os.path.join(example_dir, run_item)
                    if os.path.isdir(run_path) and run_item.startswith("run_"):
                        run_dirs.append(run_item)
                # os.listdir order is arbitrary; sort so "first run" is deterministic.
                run_dirs.sort()

                if run_dirs:
                    task_path = os.path.join(example_dir, run_dirs[0], "task.txt")
                    if os.path.exists(task_path):
                        with open(task_path, "r") as f:
                            examples[example_id] = f.read().strip()
                    else:
                        # If no task.txt, try reading from metadata.json
                        metadata_path = os.path.join(
                            example_dir, run_dirs[0], "metadata.json"
                        )
                        if os.path.exists(metadata_path):
                            try:
                                with open(metadata_path, "r") as f:
                                    metadata = json.load(f)
                                # Look for task in summary[0].task
                                if (
                                    "summary" in metadata
                                    and metadata["summary"]
                                    and "task" in metadata["summary"][0]
                                ):
                                    examples[example_id] = metadata["summary"][0][
                                        "task"
                                    ]
                            # Was a bare `except:` — that also swallows SystemExit /
                            # KeyboardInterrupt; catch Exception and fall back.
                            except Exception:
                                # Default to directory name if all else fails
                                examples[example_id] = f"Task for {example_id}"
                        else:
                            examples[example_id] = f"Task for {example_id}"

    return jsonify(examples)
 
98
+
99
# Get runs for an example
@app.route("/api/eval/<eval_id>/example/<example_id>/runs")
def get_runs(eval_id, example_id):
    """List run_* directories of one example with each run's recorded status."""
    base_dir = request.args.get("path", "./eval_results")
    example_dir = os.path.join(base_dir, eval_id, f"example_{example_id}")

    if not os.path.exists(example_dir):
        return jsonify({"error": f"Example directory not found: {example_dir}"}), 404

    runs = []
    for entry in os.listdir(example_dir):
        entry_path = os.path.join(example_dir, entry)
        if not (os.path.isdir(entry_path) and entry.startswith("run_")):
            continue

        # Status comes from the run's metadata.json when readable.
        status = "unknown"
        metadata_file = os.path.join(entry_path, "metadata.json")
        if os.path.exists(metadata_file):
            try:
                with open(metadata_file, "r") as fh:
                    status = json.load(fh).get("status", "unknown")
            except Exception as exc:
                app.logger.error(
                    f"Error reading metadata.json for {entry}: {str(exc)}"
                )

        runs.append({"id": entry, "status": status})
    app.logger.info(f"runs: {runs}")

    return jsonify(runs)
 
133
+
134
  # Get metadata for a run
135
+ @app.route("/api/eval/<eval_id>/example/<example_id>/run/<run_id>/metadata")
136
  def get_metadata(eval_id, example_id, run_id):
137
+ base_dir = request.args.get("path", "./eval_results")
138
+ run_dir = os.path.join(base_dir, eval_id, f"example_{example_id}", run_id)
139
+ metadata_path = os.path.join(run_dir, "metadata.json")
140
  app.logger.info(f"metadata: {metadata_path}")
141
 
142
  if not os.path.exists(metadata_path):
143
  return jsonify({"error": "Metadata not found", "path": metadata_path}), 404
144
+
145
  try:
146
+ with open(metadata_path, "r") as f:
147
  metadata_content = f.read()
148
  if not metadata_content.strip():
149
  return jsonify({"error": "Metadata file is empty"}), 404
150
+
151
  metadata = json.loads(metadata_content)
152
  return jsonify(metadata)
153
  except json.JSONDecodeError as e:
154
  error_info = {
155
  "error": "Invalid JSON in metadata file",
156
  "details": str(e),
157
+ "path": metadata_path,
158
  }
159
  app.logger.error(f"JSON error in {metadata_path}: {str(e)}")
160
  return jsonify(error_info), 400
 
163
  "error": "Error reading metadata file",
164
  "details": str(e),
165
  "traceback": traceback.format_exc(),
166
+ "path": metadata_path,
167
  }
168
  app.logger.error(f"Error reading {metadata_path}: {str(e)}")
169
  return jsonify(error_info), 500
170
 
171
+
172
# Get screenshots for a run
@app.route("/api/eval/<eval_id>/example/<example_id>/run/<run_id>/screenshots")
def get_screenshots(eval_id, example_id, run_id):
    """List image files (png/jpg/jpeg) in a run directory, sorted by filename."""
    base_dir = request.args.get("path", "./eval_results")
    run_dir = os.path.join(base_dir, eval_id, f"example_{example_id}", run_id)

    if not os.path.exists(run_dir):
        return jsonify({"error": f"Run directory not found: {run_dir}"}), 404

    image_paths = []
    for ext in ["png", "jpg", "jpeg"]:
        image_paths.extend(glob.glob(os.path.join(run_dir, f"*.{ext}")))

    screenshots = [
        {"name": os.path.basename(file_path), "path": f"/api/image?path={file_path}"}
        for file_path in image_paths
    ]

    # Sort by filename
    screenshots.sort(key=lambda shot: shot["name"])

    app.logger.info(f"screenshots: {screenshots}")

    return jsonify(screenshots)
197
+
198
# Serve an image file
@app.route("/api/image")
def get_image():
    """Stream an image file identified by a filesystem path query parameter.

    Returns 400 when no path is supplied, 404 when the file is missing,
    500 when sending fails.
    """
    path = request.args.get("path")
    if not path:
        return jsonify({"error": "No path provided"}), 400

    # SECURITY: `path` is taken verbatim from the query string and handed to
    # send_file, so this endpoint can disclose any file the server process
    # can read (path traversal / arbitrary file read). Acceptable only for a
    # local debugging viewer; restrict paths to the eval-results root before
    # exposing this server beyond localhost.
    if not os.path.exists(path):
        return jsonify({"error": f"Image not found at path: {path}"}), 404

    try:
        return send_file(path)
    except Exception as e:
        return jsonify({"error": f"Error serving image: {str(e)}"}), 500
213
+
214
if __name__ == "__main__":
    # Local debug entry point for the evaluation viewer.
    print("Evaluation Server is running at http://localhost:8000")
    print("Press Ctrl+C to stop the server")

    app.run(debug=True, port=8000)