Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 13,138 Bytes
c3fdac2 8770774 ee08a04 c3fdac2 5cc8c9a 64b82de c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 5cc8c9a c3fdac2 5cc8c9a c3fdac2 5cc8c9a c3fdac2 528c15b c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 5cc8c9a c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 5cc8c9a 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 ee08a04 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 ee08a04 8770774 5cc8c9a c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 ee08a04 8770774 ee08a04 8770774 c3fdac2 8770774 5cc8c9a c3fdac2 8770774 e73b85c c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 e73b85c c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 ee08a04 528c15b ee08a04 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 c3fdac2 8770774 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 |
import argparse
import concurrent.futures
import json
import os
import subprocess
import threading
import traceback
from datetime import datetime
from io import BytesIO

from dotenv import load_dotenv
from e2b_desktop import Sandbox
from huggingface_hub import get_token
from PIL import Image

from e2bqwen import QwenVLAPIModel, E2BVisionAgent, get_agent_summary_erase_images
# Load .env values, letting them override any already-set environment variables
load_dotenv(override=True)
# Environment variables and constants
E2B_API_KEY = os.getenv("E2B_API_KEY")
# Try to get token dynamically, fall back to environment variable
try:
    HUGGINGFACE_API_KEY = get_token()
    if not HUGGINGFACE_API_KEY:
        HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
    if not HUGGINGFACE_API_KEY:
        raise ValueError(
            "No Hugging Face token found. Please login with `huggingface-cli login` or set HUGGINGFACE_API_KEY environment variable"
        )
except ImportError:
    # Fall back if huggingface_hub is old version without get_token
    # NOTE(review): get_token is imported unconditionally at the top of the file,
    # so an old hub version would raise ImportError at import time, not here —
    # this fallback looks unreachable; verify before relying on it.
    HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
# Sandbox screen resolution in pixels
WIDTH = 1024
HEIGHT = 768
SANDBOX_TIMEOUT = 600  # sandbox lifetime in seconds (10 minutes)
# Serialize print() calls so output from worker threads is not interleaved
print_lock = threading.Lock()


def thread_safe_print(*args, **kwargs):
    """Print while holding the shared lock so concurrent threads can't garble output."""
    print_lock.acquire()
    try:
        print(*args, **kwargs)
    finally:
        print_lock.release()
# Get git hash for folder naming
def get_git_hash() -> str:
    """Return the short git hash of HEAD, or "nogit" if unavailable.

    Never raises: any failure (git binary missing, not a repository, etc.)
    falls back to the "nogit" sentinel so folder naming always succeeds.
    Previously used a bare `except:`, which also swallowed SystemExit and
    KeyboardInterrupt; narrowed to the exceptions subprocess can raise.
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except (OSError, subprocess.SubprocessError):
        # git could not be executed at all
        return "nogit"
    # Non-zero return code => git ran but failed (e.g. not a git repo)
    return result.stdout.strip() if result.returncode == 0 else "nogit"
def create_agent(data_dir, desktop, max_steps: int):
    """Build an E2BVisionAgent wired to the Qwen VL model and the given sandbox."""
    vision_model = QwenVLAPIModel(
        model_id="Qwen/Qwen2.5-VL-72B-Instruct",
        hf_token=HUGGINGFACE_API_KEY,
    )
    agent = E2BVisionAgent(
        model=vision_model,
        data_dir=data_dir,
        desktop=desktop,
        max_steps=max_steps,
        verbosity_level=2,
    )
    return agent
def chat_message_to_json(obj):
    """Custom JSON serializer for ChatMessage and related objects.

    Recursively converts objects carrying a __dict__ into plain dicts,
    stripping the non-serializable 'raw' field; lists/tuples are converted
    element-wise; anything else passes through unchanged.
    """
    # Sequences first: lists/tuples never carry a __dict__, so order is safe
    if isinstance(obj, (list, tuple)):
        return [chat_message_to_json(item) for item in obj]
    if not hasattr(obj, "__dict__"):
        return obj
    # Shallow-copy so the original object is never mutated
    serialized = dict(obj.__dict__)
    # 'raw' may hold non-serializable provider payloads — drop it
    serialized.pop("raw", None)
    content = serialized.get("content")
    if content is not None and hasattr(content, "__dict__"):
        serialized["content"] = chat_message_to_json(content)
    if serialized.get("tool_calls") is not None:
        serialized["tool_calls"] = [
            chat_message_to_json(tc) for tc in serialized["tool_calls"]
        ]
    return serialized
def save_final_status(folder, status: str, summary, error_message=None) -> None:
    """Write metadata.json for a run: its status, summary, and any error message."""
    payload = {"status": status, "summary": summary, "error_message": error_message}
    metadata_path = os.path.join(folder, "metadata.json")
    # chat_message_to_json handles the agent's non-JSON-native message objects
    with open(metadata_path, "w") as output_file:
        json.dump(payload, output_file, default=chat_message_to_json)
def run_example_once(example_name, example_text, run_index, example_dir, max_steps):
    """Run a single example once in a fresh sandbox and return a result dict.

    Returns {"status": "completed"|"failed", "run_dir": ..., ["error": ...]}.
    The sandbox is always killed in the finally block, success or failure.
    """
    run_dir = os.path.join(example_dir, f"run_{run_index}")
    os.makedirs(run_dir, exist_ok=True)
    # Save the example text so each run directory is self-describing
    with open(os.path.join(run_dir, "task.txt"), "w") as f:
        f.write(example_text)
    thread_safe_print(f" Starting run {run_index} for example '{example_name}'")
    # Create a new sandbox for this run
    desktop = None
    try:
        desktop = Sandbox(
            api_key=E2B_API_KEY,
            resolution=(WIDTH, HEIGHT),
            dpi=96,
            timeout=SANDBOX_TIMEOUT,
            template="k0wmnzir0zuzye6dndlw",
        )
        # Initialize the desktop environment: suppress Firefox first-run pages
        # so screenshots start from a clean desktop
        setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
        desktop.commands.run(setup_cmd)
        # Create and run the agent, seeding it with an initial screenshot
        agent = create_agent(data_dir=run_dir, desktop=desktop, max_steps=max_steps)
        screenshot_bytes = desktop.screenshot(format="bytes")
        initial_screenshot = Image.open(BytesIO(screenshot_bytes))
        try:
            agent.run(task=example_text, images=[initial_screenshot])
            summary = get_agent_summary_erase_images(agent)
            save_final_status(run_dir, "completed", summary=summary)
            thread_safe_print(
                f" ✓ Example '{example_name}' run {run_index} completed successfully"
            )
            result = {"status": "completed", "run_dir": run_dir}
        except Exception as e:
            error_message = f"Error in agent execution: {str(e)}"
            thread_safe_print(
                f" ✗ Example '{example_name}' run {run_index} failed: {error_message}"
            )
            # Salvage a summary if the agent got far enough to build up memory
            summary = (
                get_agent_summary_erase_images(agent)
                if hasattr(agent, "memory")
                else None
            )
            save_final_status(
                run_dir, "failed", summary=summary, error_message=error_message
            )
            result = {"status": "failed", "run_dir": run_dir, "error": error_message}
    except Exception as e:
        # FIX: a stray `raise e` here previously made this recovery path
        # unreachable dead code — sandbox-setup failures re-raised instead of
        # writing metadata.json and returning a "failed" result like the
        # agent-execution path does.
        error_message = f"Error setting up sandbox: {str(e)}"
        thread_safe_print(
            f" ✗ Example '{example_name}' run {run_index} failed: {error_message}"
        )
        save_final_status(run_dir, "failed", summary=None, error_message=error_message)
        result = {"status": "failed", "run_dir": run_dir, "error": error_message}
    finally:
        # Always clean up the sandbox
        if desktop:
            try:
                desktop.kill()
            except Exception:
                # Best-effort cleanup; an unkilled sandbox times out on its own
                pass
    return result
def run_example(example_name, example_text, num_runs, example_dir, max_steps):
    """Run a single example multiple times using threads for each run.

    Returns a list of per-run result dicts from run_example_once, plus
    {"status": "error", ...} entries for any run whose future raised.
    (The mid-file `import traceback` that used to precede this function
    has been moved to the top-of-file import block.)
    """
    thread_safe_print(f"\nRunning example '{example_name}': '{example_text[:50]}...'")
    results = []
    # One worker per run so all runs of this example proceed concurrently
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_runs) as executor:
        # Submit all runs to the executor
        future_to_run = {
            executor.submit(
                run_example_once, example_name, example_text, j, example_dir, max_steps
            ): j
            for j in range(num_runs)
        }
        # Collect results as they complete
        for future in concurrent.futures.as_completed(future_to_run):
            run_index = future_to_run[future]
            try:
                result = future.result()
                results.append(result)
            except Exception as exc:
                # Keep the full traceback — it's the only diagnostic for
                # failures that escaped run_example_once's own handling
                error_traceback = traceback.format_exc()
                thread_safe_print(
                    f" ✗ Run {run_index} for '{example_name}' generated an exception:\n{error_traceback}"
                )
                results.append(
                    {"status": "error", "run_index": run_index, "error": str(exc)}
                )
    return results
def run_evaluation(examples, num_runs, output_dir, max_parallel, max_steps):
    """Run each example n times and save the results.

    Creates a timestamped eval directory under output_dir, fans the examples
    out across a thread pool (at most max_parallel at once, each example then
    running its own num_runs in parallel inside run_example), aggregates
    success counts, writes summary.json, and returns the eval directory path.
    """
    # Unique folder name: timestamp + short git hash of the current checkout
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    git_hash = get_git_hash()
    eval_dir = os.path.join(output_dir, f"eval_{timestamp}_{git_hash}")
    os.makedirs(eval_dir, exist_ok=True)
    start_time = datetime.now()
    thread_safe_print(f"Starting evaluation. Results will be saved to: {eval_dir}")
    thread_safe_print(
        f"Will run {len(examples)} examples, {num_runs} times each, with {max_parallel} parallel examples"
    )
    # Save examples to the evaluation directory so the run is reproducible
    with open(os.path.join(eval_dir, "examples.json"), "w") as f:
        json.dump(examples, f, indent=2)
    all_results = {}
    # Run examples in parallel, but limit the number of parallel examples
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_parallel) as executor:
        # Prepare the example directories first (before any worker needs them)
        example_dirs = {}
        for example_name in examples:
            example_dir = os.path.join(eval_dir, f"example_{example_name}")
            os.makedirs(example_dir, exist_ok=True)
            example_dirs[example_name] = example_dir
        # Submit all examples to the executor
        future_to_example = {
            executor.submit(
                run_example,
                example_name,
                example_text,
                num_runs,
                example_dirs[example_name],
                max_steps,
            ): example_name
            for example_name, example_text in examples.items()
        }
        # Collect results as they complete
        for future in concurrent.futures.as_completed(future_to_example):
            example_name = future_to_example[future]
            try:
                results = future.result()
                all_results[example_name] = results
                # Calculate success rate for this example
                success_count = sum(1 for r in results if r["status"] == "completed")
                thread_safe_print(
                    f"Example '{example_name}' complete: {success_count}/{num_runs} successful runs ({success_count / num_runs * 100:.1f}%)"
                )
            except Exception as exc:
                # Reaching here means run_example itself blew up; individual run
                # failures are already captured inside its returned result list
                thread_safe_print(
                    f"Example '{example_name}' generated an exception: {exc}"
                )
                all_results[example_name] = [{"status": "error", "error": str(exc)}]
    # Calculate overall results and success rates
    success_counts = {
        example_name: sum(1 for r in results if r["status"] == "completed")
        for example_name, results in all_results.items()
    }
    total_runs = sum(len(results) for results in all_results.values())
    total_successes = sum(success_counts.values())
    # Save summary to evaluation directory
    summary = {
        "total_runs": total_runs,
        "total_successes": total_successes,
        "success_rate": total_successes / total_runs if total_runs > 0 else 0,
        "example_success_rates": {
            example_name: success_counts[example_name] / len(all_results[example_name])
            for example_name in examples
        },
    }
    with open(os.path.join(eval_dir, "summary.json"), "w") as f:
        json.dump(summary, f, indent=2)
    thread_safe_print(f"\nEvaluation complete. Results saved to: {eval_dir}")
    thread_safe_print(
        f"Overall success rate: {summary['success_rate'] * 100:.1f}% ({total_successes}/{total_runs})"
    )
    for example_name in examples:
        success_rate = summary["example_success_rates"][example_name] * 100
        thread_safe_print(f"Example '{example_name}': {success_rate:.1f}% success")
    print("Total duration:", datetime.now() - start_time)
    return eval_dir
def main():
    """Parse command-line options and launch the evaluation."""
    parser = argparse.ArgumentParser(description="Evaluate computer agent on examples")
    parser.add_argument("--num-runs", type=int, default=3, help="Number of runs per example")
    parser.add_argument("--output-dir", type=str, default="./eval_results", help="Output directory for evaluation results")
    parser.add_argument("--max-parallel", type=int, default=2, help="Maximum number of examples to run in parallel")
    parser.add_argument("--max-steps", type=int, default=200, help="Maximum number of steps in each run")
    opts = parser.parse_args()

    # Ensure the top-level output directory exists before any runs start
    os.makedirs(opts.output_dir, exist_ok=True)

    # Task definitions: name -> natural-language instruction for the agent
    examples = {
        "puppies": "Find me pictures of cute puppies",
        "gmaps": "Use Google Maps to find the Hugging Face HQ in Paris",
        "wiki": "Go to Wikipedia and find what happend on April 4th",
        "commute": "Find out the travel time by train from Bern to Basel on Google Maps",
        "hf_space": "Go to Hugging Face Spaces and then find the Space flux.1 schnell. Use the space to generate an image of a GPU",
    }

    # Run the evaluation
    run_evaluation(
        examples, opts.num_runs, opts.output_dir, opts.max_parallel, opts.max_steps
    )


if __name__ == "__main__":
    main()
|