lvwerra (HF Staff) committed
Commit c3fdac2 · Parent: 5504eb2

add eval script

Files changed (1): eval.py (+296, -0)
eval.py ADDED
@@ -0,0 +1,296 @@
+ import os
+ import json
+ import argparse
+ import subprocess
+ import threading
+ import concurrent.futures
+ from datetime import datetime
+ from e2b_desktop import Sandbox
+
+ from e2bqwen import QwenVLAPIModel, E2BVisionAgent
+
+ # Environment variables and constants
+ E2B_API_KEY = os.getenv("E2B_API_KEY")
+ # Try to get the token dynamically, falling back to the environment variable.
+ # The import lives inside the try block so that old huggingface_hub versions
+ # without get_token actually trigger the ImportError fallback.
+ try:
+     from huggingface_hub import get_token
+     HUGGINGFACE_API_KEY = get_token()
+ except ImportError:
+     HUGGINGFACE_API_KEY = None
+ if not HUGGINGFACE_API_KEY:
+     HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
+ if not HUGGINGFACE_API_KEY:
+     raise ValueError(
+         "No Hugging Face token found. Please login with `huggingface-cli login` "
+         "or set the HUGGINGFACE_API_KEY environment variable"
+     )
+ WIDTH = 1024
+ HEIGHT = 768
+ SANDBOX_TIMEOUT = 600  # 10 minutes
+
+ # Thread lock for print statements to avoid garbled output
+ print_lock = threading.Lock()
+
+ def thread_safe_print(*args, **kwargs):
+     """Thread-safe print function"""
+     with print_lock:
+         print(*args, **kwargs)
+
+ # Get the short git hash for folder naming
+ def get_git_hash():
+     try:
+         result = subprocess.run(
+             ["git", "rev-parse", "--short", "HEAD"],
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             text=True,
+         )
+         if result.returncode == 0:
+             return result.stdout.strip()
+         return "nogit"
+     except Exception:
+         return "nogit"
+
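+ # Note on get_git_hash (sketch): inside a git checkout it returns a short hash
+ # such as "c3fdac2"; without git installed, or outside a repo, it falls back
+ # to "nogit".
+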
+ def create_agent(data_dir, desktop):
+     """Create an agent with the E2B desktop sandbox"""
+     model = QwenVLAPIModel(
+         model_id="Qwen/Qwen2.5-VL-72B-Instruct",
+         hf_token=HUGGINGFACE_API_KEY,
+     )
+     return E2BVisionAgent(
+         model=model,
+         data_dir=data_dir,
+         desktop=desktop,
+         max_steps=200,
+         verbosity_level=2,
+         planning_interval=10,
+     )
+
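+ # Note: QwenVLAPIModel and E2BVisionAgent are project-local classes from
+ # e2bqwen, so their exact signatures are assumed from this call site;
+ # planning_interval=10 makes the agent refresh its plan every ten steps.
+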
+ def get_agent_summary_erase_images(agent):
+     """Get agent summary and erase images to save space"""
+     for memory_step in agent.memory.steps:
+         if getattr(memory_step, "observations_images", None):
+             memory_step.observations_images = None
+     return agent.memory.get_succinct_steps()
+
+ def chat_message_to_json(obj):
+     """Custom JSON serializer for ChatMessage and related objects"""
+     if hasattr(obj, "__dict__"):
+         # Copy the object's __dict__ to avoid modifying the original
+         result = obj.__dict__.copy()
+
+         # Remove the 'raw' field, which may contain non-serializable data
+         if "raw" in result:
+             del result["raw"]
+
+         # Recurse into content and tool_calls if they exist
+         if "content" in result and result["content"] is not None:
+             if hasattr(result["content"], "__dict__"):
+                 result["content"] = chat_message_to_json(result["content"])
+
+         if "tool_calls" in result and result["tool_calls"] is not None:
+             result["tool_calls"] = [chat_message_to_json(tc) for tc in result["tool_calls"]]
+
+         return result
+     elif isinstance(obj, (list, tuple)):
+         return [chat_message_to_json(item) for item in obj]
+     else:
+         return obj
+
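+ # Usage sketch: passed as the `default` hook to json.dumps (see
+ # save_final_status below), chat_message_to_json converts nested ChatMessage
+ # objects into plain dicts, dropping the non-serializable 'raw' payload.
+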
+ def save_final_status(folder, status: str, summary, error_message=None) -> None:
+     """Save metadata about the run"""
+     metadata_path = os.path.join(folder, "metadata.json")
+     with open(metadata_path, "w") as output_file:
+         output_file.write(json.dumps({
+             "status": status,
+             "summary": summary,
+             "error_message": error_message,
+         }, default=chat_message_to_json))
+
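+ # Resulting metadata.json shape (sketch):
+ #   {"status": "completed" | "failed", "summary": [...], "error_message": null | "..."}
+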
+ def run_example_once(example_name, example_text, run_index, example_dir):
+     """Run a single example once and return the result"""
+     run_dir = os.path.join(example_dir, f"run_{run_index}")
+     os.makedirs(run_dir, exist_ok=True)
+
+     # Save the example text
+     with open(os.path.join(run_dir, "task.txt"), "w") as f:
+         f.write(example_text)
+
+     thread_safe_print(f"  Starting run {run_index} for example '{example_name}'")
+
+     # Create a new sandbox for this run
+     desktop = None
+     try:
+         desktop = Sandbox(
+             api_key=E2B_API_KEY,
+             resolution=(WIDTH, HEIGHT),
+             dpi=96,
+             timeout=SANDBOX_TIMEOUT,
+         )
+
+         # Initialize the desktop environment: suppress Firefox's first-run pages
+         setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
+         desktop.commands.run(setup_cmd)
+
+         # Create and run the agent
+         agent = create_agent(data_dir=run_dir, desktop=desktop)
+         try:
+             agent.run(task=example_text)
+             summary = get_agent_summary_erase_images(agent)
+             save_final_status(run_dir, "completed", summary=summary)
+             thread_safe_print(f"  ✓ Example '{example_name}' run {run_index} completed successfully")
+             result = {"status": "completed", "run_dir": run_dir}
+         except Exception as e:
+             error_message = f"Error in agent execution: {str(e)}"
+             thread_safe_print(f"  ✗ Example '{example_name}' run {run_index} failed: {error_message}")
+             summary = get_agent_summary_erase_images(agent) if hasattr(agent, "memory") else None
+             save_final_status(run_dir, "failed", summary=summary, error_message=error_message)
+             result = {"status": "failed", "run_dir": run_dir, "error": error_message}
+     except Exception as e:
+         error_message = f"Error setting up sandbox: {str(e)}"
+         thread_safe_print(f"  ✗ Example '{example_name}' run {run_index} failed: {error_message}")
+         save_final_status(run_dir, "failed", summary=None, error_message=error_message)
+         result = {"status": "failed", "run_dir": run_dir, "error": error_message}
+     finally:
+         # Always clean up the sandbox
+         if desktop:
+             try:
+                 desktop.kill()
+             except Exception:
+                 pass
+
+     return result
+
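+ # Each run_<i> directory ends up holding task.txt, metadata.json, and whatever
+ # the agent writes under its data_dir (the exact artifacts depend on
+ # E2BVisionAgent in e2bqwen).
+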
+ def run_example(example_name, example_text, num_runs, example_dir):
+     """Run a single example multiple times, using one thread per run"""
+     thread_safe_print(f"\nRunning example '{example_name}': '{example_text[:50]}...'")
+
+     results = []
+     with concurrent.futures.ThreadPoolExecutor(max_workers=num_runs) as executor:
+         # Submit all runs to the executor
+         future_to_run = {
+             executor.submit(run_example_once, example_name, example_text, j, example_dir): j
+             for j in range(num_runs)
+         }
+
+         # Collect results as they complete
+         for future in concurrent.futures.as_completed(future_to_run):
+             run_index = future_to_run[future]
+             try:
+                 result = future.result()
+                 results.append(result)
+             except Exception as exc:
+                 thread_safe_print(f"  ✗ Run {run_index} for '{example_name}' generated an exception: {exc}")
+                 results.append({
+                     "status": "error",
+                     "run_index": run_index,
+                     "error": str(exc),
+                 })
+
+     return results
+
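+ # Note: runs of a single example are parallelized above, and run_evaluation
+ # below parallelizes across examples, so up to max_parallel * num_runs
+ # sandboxes can be alive at once; budget E2B concurrency limits accordingly.
+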
+ def run_evaluation(examples, num_runs, output_dir, max_parallel):
+     """Run each example num_runs times and save the results"""
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     git_hash = get_git_hash()
+     eval_dir = os.path.join(output_dir, f"eval_{timestamp}_{git_hash}")
+     os.makedirs(eval_dir, exist_ok=True)
+
+     thread_safe_print(f"Starting evaluation. Results will be saved to: {eval_dir}")
+     thread_safe_print(f"Will run {len(examples)} examples, {num_runs} times each, with {max_parallel} examples in parallel")
+
+     # Save the examples to the evaluation directory
+     with open(os.path.join(eval_dir, "examples.json"), "w") as f:
+         json.dump(examples, f, indent=2)
+
+     all_results = {}
+
+     # Run examples in parallel, limiting the number of examples in flight
+     with concurrent.futures.ThreadPoolExecutor(max_workers=max_parallel) as executor:
+         # Prepare the example directories first
+         example_dirs = {}
+         for example_name in examples:
+             example_dir = os.path.join(eval_dir, f"example_{example_name}")
+             os.makedirs(example_dir, exist_ok=True)
+             example_dirs[example_name] = example_dir
+
+         # Submit all examples to the executor
+         future_to_example = {
+             executor.submit(run_example, example_name, example_text, num_runs, example_dirs[example_name]): example_name
+             for example_name, example_text in examples.items()
+         }
+
+         # Collect results as they complete
+         for future in concurrent.futures.as_completed(future_to_example):
+             example_name = future_to_example[future]
+             try:
+                 results = future.result()
+                 all_results[example_name] = results
+
+                 # Calculate the success rate for this example
+                 success_count = sum(1 for r in results if r["status"] == "completed")
+                 thread_safe_print(f"Example '{example_name}' complete: {success_count}/{num_runs} successful runs ({success_count/num_runs*100:.1f}%)")
+             except Exception as exc:
+                 thread_safe_print(f"Example '{example_name}' generated an exception: {exc}")
+                 all_results[example_name] = [{"status": "error", "error": str(exc)}]
+
+     # Calculate overall results and success rates
+     success_counts = {
+         example_name: sum(1 for r in results if r["status"] == "completed")
+         for example_name, results in all_results.items()
+     }
+
+     total_runs = sum(len(results) for results in all_results.values())
+     total_successes = sum(success_counts.values())
+
+     # Save the summary to the evaluation directory
+     summary = {
+         "total_runs": total_runs,
+         "total_successes": total_successes,
+         "success_rate": total_successes / total_runs if total_runs > 0 else 0,
+         "example_success_rates": {
+             example_name: success_counts[example_name] / len(all_results[example_name])
+             for example_name in examples
+         },
+     }
+
+     with open(os.path.join(eval_dir, "summary.json"), "w") as f:
+         json.dump(summary, f, indent=2)
+
+     thread_safe_print(f"\nEvaluation complete. Results saved to: {eval_dir}")
+     thread_safe_print(f"Overall success rate: {summary['success_rate']*100:.1f}% ({total_successes}/{total_runs})")
+     for example_name in examples:
+         success_rate = summary["example_success_rates"][example_name] * 100
+         thread_safe_print(f"Example '{example_name}': {success_rate:.1f}% success")
+
+     return eval_dir
+
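+ # Resulting directory layout (sketch):
+ #   eval_<timestamp>_<githash>/
+ #     examples.json
+ #     summary.json
+ #     example_<name>/
+ #       run_<i>/
+ #         task.txt
+ #         metadata.json
+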
+ def main():
+     parser = argparse.ArgumentParser(description="Evaluate the computer agent on examples")
+     parser.add_argument("--num-runs", type=int, default=3, help="Number of runs per example")
+     parser.add_argument("--output-dir", type=str, default="./eval_results", help="Output directory for evaluation results")
+     parser.add_argument("--max-parallel", type=int, default=2, help="Maximum number of examples to run in parallel")
+     args = parser.parse_args()
+
+     # Examples taken from the original app code
+     examples = {
+         "puppies": "Find me pictures of cute puppies",
+         "commute": "Check the commuting time between Bern and Zurich on Google maps",
+         "hello": "Write 'Hello World' in a text editor",
+         "wiki": "When was Temple Grandin introduced to the American Academy of Arts and Sciences, according to Wikipedia?",
+         "flight": "Search a flight Rome - Berlin for tomorrow",
+         "pond": "What's the name of the pond just south of Château de Fontainebleau in Google maps?",
+         "flux": "Go generate a picture of the Golden Gate bridge on a FLUX1.dev space",
+         "hf": "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
+     }
+
+     # Create the output directory if it doesn't exist
+     os.makedirs(args.output_dir, exist_ok=True)
+
+     # Run the evaluation
+     run_evaluation(examples, args.num_runs, args.output_dir, args.max_parallel)
+
+ if __name__ == "__main__":
+     main()
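+
+ # Usage sketch (flags per the argparse definition above; values shown are the defaults):
+ #   python eval.py --num-runs 3 --output-dir ./eval_results --max-parallel 2
+ # Requires E2B_API_KEY in the environment and a Hugging Face token via
+ # `huggingface-cli login` or HUGGINGFACE_API_KEY.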