m-ric HF Staff committed on
Commit
8772a92
·
1 Parent(s): 771c16d

Add last click marker on screenshots

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. e2bqwen.py +20 -7
app.py CHANGED
@@ -580,7 +580,7 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js, fill_width=True) as de
580
 
581
  gr.Examples(
582
  examples=[
583
- "Check the commuting time between Bern and Zurich",
584
  "Write 'Hello World' in a text editor",
585
  "Search a flight Paris - Berlin for tomorrow",
586
  "Could you head to Fontainebleau (France) in Google Maps then drag and drop to position the castle of Fontainebleau exactly in the center?",
 
580
 
581
  gr.Examples(
582
  examples=[
583
+ "Check the commuting time between Bern and Zurich on Google maps",
584
  "Write 'Hello World' in a text editor",
585
  "Search a flight Paris - Berlin for tomorrow",
586
  "Could you head to Fontainebleau (France) in Google Maps then drag and drop to position the castle of Fontainebleau exactly in the center?",
e2bqwen.py CHANGED
@@ -19,7 +19,7 @@ from smolagents.memory import ActionStep
19
  from smolagents.models import ChatMessage, MessageRole, Model
20
  from smolagents.monitoring import LogLevel
21
  from smolagents.agent_types import AgentImage
22
-
23
 
24
  E2B_SYSTEM_PROMPT_TEMPLATE = """You are a desktop automation assistant that can control a remote desktop environment.
25
  On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
@@ -35,7 +35,7 @@ IMPORTANT:
35
  - Remember the tools that you have as those can save you time, for example open_url to enter a website rather than searching for the browser in the OS.
36
  - Whenever you click, MAKE SURE to click in the middle of the button, text, link or any other clickable element. Not under, not on the side. IN THE MIDDLE. In menus it is always better to click in the middle of the text rather than in the tiny icon. Calculate extremelly well the coordinates. A mistake here can make the full task fail.
37
  - To navigate the desktop you should open menus and click. Menus usually expand with more options, the tiny triangle next to some text in a menu means that menu expands. For example in Office in the Applications menu expands showing presentation or writing applications.
38
- - Always analyze the latest screenshot carefully before performing actions. If you clicked somewhere in the previous action and in the screenshot nothing happened, make sure the mouse is where it should be. Otherwise you can see that the coordinates were wrong.
39
 
40
  You must proceed step by step:
41
  1. Understand the task thoroughly
@@ -81,7 +81,6 @@ final_answer("Done")
81
  ```<end_code>
82
 
83
  Remember to:
84
-
85
  Always wait for appropriate loading times
86
  Use precise coordinates based on the current screenshot
87
  Execute one action at a time
@@ -149,6 +148,7 @@ class E2BVisionAgent(CodeAgent):
149
  self.desktop.move_mouse(x, y)
150
  self.desktop.left_click()
151
  self.logger.log(f"Clicked at coordinates ({x}, {y})")
 
152
  return f"Clicked at coordinates ({x}, {y})"
153
 
154
  @tool
@@ -162,6 +162,7 @@ class E2BVisionAgent(CodeAgent):
162
  self.desktop.move_mouse(x, y)
163
  self.desktop.right_click()
164
  self.logger.log(f"Right-clicked at coordinates ({x}, {y})")
 
165
  return f"Right-clicked at coordinates ({x}, {y})"
166
 
167
  @tool
@@ -175,6 +176,7 @@ class E2BVisionAgent(CodeAgent):
175
  self.desktop.move_mouse(x, y)
176
  self.desktop.double_click()
177
  self.logger.log(f"Double-clicked at coordinates ({x}, {y})")
 
178
  return f"Double-clicked at coordinates ({x}, {y})"
179
 
180
  @tool
@@ -204,12 +206,10 @@ class E2BVisionAgent(CodeAgent):
204
  @tool
205
  def press_key(key: str) -> str:
206
  """
207
- Presses a keyboard key (e.g., "Return", "tab", "ctrl+c")
208
  Args:
209
- key: The key to press (e.g., "Return", "tab", "ctrl+c")
210
  """
211
- if key == "enter":
212
- key = "Return"
213
  self.desktop.press(key)
214
  self.logger.log(f"Pressed key: {key}")
215
  return f"Pressed key: {key}"
@@ -304,6 +304,17 @@ class E2BVisionAgent(CodeAgent):
304
  screenshot_bytes = self.desktop.screenshot()
305
  image = Image.open(BytesIO(screenshot_bytes))
306
 
 
 
 
 
 
 
 
 
 
 
 
307
  # Create a filename with step number
308
  screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
309
  image.save(screenshot_path)
@@ -324,6 +335,8 @@ class E2BVisionAgent(CodeAgent):
324
 
325
  # memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
326
 
 
 
327
 
328
  def close(self):
329
  """Clean up resources"""
 
19
  from smolagents.models import ChatMessage, MessageRole, Model
20
  from smolagents.monitoring import LogLevel
21
  from smolagents.agent_types import AgentImage
22
+ from PIL import ImageDraw
23
 
24
  E2B_SYSTEM_PROMPT_TEMPLATE = """You are a desktop automation assistant that can control a remote desktop environment.
25
  On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
 
35
  - Remember the tools that you have as those can save you time, for example open_url to enter a website rather than searching for the browser in the OS.
36
  - Whenever you click, MAKE SURE to click in the middle of the button, text, link or any other clickable element. Not under, not on the side. IN THE MIDDLE. In menus it is always better to click in the middle of the text rather than in the tiny icon. Calculate extremelly well the coordinates. A mistake here can make the full task fail.
37
  - To navigate the desktop you should open menus and click. Menus usually expand with more options, the tiny triangle next to some text in a menu means that menu expands. For example in Office in the Applications menu expands showing presentation or writing applications.
38
+ - Always analyze the latest screenshot carefully before performing actions. If you clicked somewhere in the previous action, a red crosshair will appear at the exact click location: if nothing happened, check that this location is exactly where you intended to click. Otherwise correct the click coordinates.
39
 
40
  You must proceed step by step:
41
  1. Understand the task thoroughly
 
81
  ```<end_code>
82
 
83
  Remember to:
 
84
  Always wait for appropriate loading times
85
  Use precise coordinates based on the current screenshot
86
  Execute one action at a time
 
148
  self.desktop.move_mouse(x, y)
149
  self.desktop.left_click()
150
  self.logger.log(f"Clicked at coordinates ({x}, {y})")
151
+ self.click_coordinates = [x, y]
152
  return f"Clicked at coordinates ({x}, {y})"
153
 
154
  @tool
 
162
  self.desktop.move_mouse(x, y)
163
  self.desktop.right_click()
164
  self.logger.log(f"Right-clicked at coordinates ({x}, {y})")
165
+ self.click_coordinates = [x, y]
166
  return f"Right-clicked at coordinates ({x}, {y})"
167
 
168
  @tool
 
176
  self.desktop.move_mouse(x, y)
177
  self.desktop.double_click()
178
  self.logger.log(f"Double-clicked at coordinates ({x}, {y})")
179
+ self.click_coordinates = [x, y]
180
  return f"Double-clicked at coordinates ({x}, {y})"
181
 
182
  @tool
 
206
  @tool
207
  def press_key(key: str) -> str:
208
  """
209
+ Presses a keyboard key
210
  Args:
211
+ key: The key to press (e.g. "enter", "space", "backspace", etc.).
212
  """
 
 
213
  self.desktop.press(key)
214
  self.logger.log(f"Pressed key: {key}")
215
  return f"Pressed key: {key}"
 
304
  screenshot_bytes = self.desktop.screenshot()
305
  image = Image.open(BytesIO(screenshot_bytes))
306
 
307
+ if getattr(self, "click_coordinates", None):
308
+ # If a click was performed in the last action, mark it on the image
309
+ x, y = self.click_coordinates
310
+ draw = ImageDraw.Draw(image)
311
+ cross_size, linewidth = 10, 3
312
+ # Draw red cross lines
313
+ draw.line((x - cross_size, y, x + cross_size, y), fill="red", width=linewidth)
314
+ draw.line((x, y - cross_size, x, y + cross_size), fill="red", width=linewidth)
315
+ # Add a circle around it for better visibility
316
+ draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="red", width=linewidth)
317
+
318
  # Create a filename with step number
319
  screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
320
  image.save(screenshot_path)
 
335
 
336
  # memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
337
 
338
+ self.click_coordinates = None
339
+
340
 
341
  def close(self):
342
  """Clean up resources"""