Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,19 +1,17 @@
|
|
1 |
import os
|
2 |
import gradio as gr
|
3 |
-
from smolagents import CodeAgent, LiteLLMModel, tool
|
4 |
from smolagents.agents import ActionStep
|
5 |
import helium
|
6 |
from selenium import webdriver
|
7 |
from selenium.webdriver.common.by import By
|
8 |
from selenium.webdriver.common.keys import Keys
|
9 |
from io import BytesIO
|
10 |
-
from google import genai
|
11 |
from PIL import Image
|
12 |
from datetime import datetime
|
13 |
from dotenv import load_dotenv
|
14 |
from huggingface_hub import login
|
15 |
import tempfile
|
16 |
-
from google.genai import types
|
17 |
import logging
|
18 |
|
19 |
# Set up logging
|
@@ -33,7 +31,7 @@ if not gemini_api_key:
|
|
33 |
login(hf_token, add_to_git_credential=False)
|
34 |
|
35 |
# Debug ChromeDriver path
|
36 |
-
chromedriver_path = '/usr/bin/chromedriver'
|
37 |
logger.info(f"Checking ChromeDriver at: {chromedriver_path}")
|
38 |
logger.info(f"ChromeDriver exists: {os.path.exists(chromedriver_path)}")
|
39 |
logger.info(f"ChromeDriver executable: {os.access(chromedriver_path, os.X_OK)}")
|
@@ -94,20 +92,26 @@ def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> Image.Image:
|
|
94 |
driver = helium.get_driver()
|
95 |
current_step = memory_step.step_number
|
96 |
if driver is not None:
|
|
|
97 |
for previous_memory_step in agent.memory.steps:
|
98 |
-
if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number
|
99 |
previous_memory_step.observations_images = None
|
|
|
100 |
png_bytes = driver.get_screenshot_as_png()
|
101 |
image = Image.open(BytesIO(png_bytes))
|
102 |
-
screenshot_dir = tempfile.gettempdir()
|
|
|
103 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
104 |
-
|
|
|
105 |
image.save(screenshot_path)
|
106 |
logger.info(f"Saved screenshot to: {screenshot_path}")
|
|
|
107 |
url_info = f"Current url: {driver.current_url}"
|
108 |
memory_step.observations = (
|
109 |
url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
|
110 |
)
|
|
|
111 |
return image
|
112 |
|
113 |
# Initialize model and agent
|
@@ -143,9 +147,9 @@ In general stop your action after each button click to see what happens on your
|
|
143 |
Never try to login in a page.
|
144 |
To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
|
145 |
Code:
|
146 |
-
scroll_down(num_pixels=1200)
|
147 |
```<end_code>
|
148 |
-
When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element
|
149 |
Just use your built-in tool `close_popups` to close them:
|
150 |
Code:
|
151 |
close_popups()
|
@@ -157,17 +161,33 @@ if Text('Accept cookies?').exists():
|
|
157 |
```<end_code>
|
158 |
"""
|
159 |
|
160 |
-
#
|
161 |
-
def
|
162 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
search_request = f"Please go to {url}. {request}"
|
164 |
agent_output = agent.run(search_request + helium_instructions)
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
except Exception as e:
|
172 |
logger.error(f"Agent execution failed: {str(e)}")
|
173 |
return f"Error: {str(e)}", None
|
@@ -178,19 +198,6 @@ def run_agent(url: str, request: str):
|
|
178 |
except:
|
179 |
logger.warning("Failed to close Chrome driver.")
|
180 |
|
181 |
-
#
|
182 |
-
with gr.Blocks() as demo:
|
183 |
-
gr.Markdown("# Web Navigation Agent")
|
184 |
-
url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com")
|
185 |
-
request_input = gr.Textbox(label="Enter Request", placeholder="Describe what to do on the website")
|
186 |
-
submit_button = gr.Button("Run Agent")
|
187 |
-
output_text = gr.Textbox(label="Agent Output")
|
188 |
-
output_image = gr.Image(label="Screenshot")
|
189 |
-
submit_button.click(
|
190 |
-
fn=run_agent,
|
191 |
-
inputs=[url_input, request_input],
|
192 |
-
outputs=[output_text, output_image]
|
193 |
-
)
|
194 |
-
|
195 |
if __name__ == "__main__":
|
196 |
-
|
|
|
1 |
import os
|
2 |
import gradio as gr
|
3 |
+
from smolagents import CodeAgent, LiteLLMModel, tool, GradioUI
|
4 |
from smolagents.agents import ActionStep
|
5 |
import helium
|
6 |
from selenium import webdriver
|
7 |
from selenium.webdriver.common.by import By
|
8 |
from selenium.webdriver.common.keys import Keys
|
9 |
from io import BytesIO
|
|
|
10 |
from PIL import Image
|
11 |
from datetime import datetime
|
12 |
from dotenv import load_dotenv
|
13 |
from huggingface_hub import login
|
14 |
import tempfile
|
|
|
15 |
import logging
|
16 |
|
17 |
# Set up logging
|
|
|
31 |
login(hf_token, add_to_git_credential=False)
|
32 |
|
33 |
# Debug ChromeDriver path
|
34 |
+
chromedriver_path = '/usr/bin/chromedriver'
|
35 |
logger.info(f"Checking ChromeDriver at: {chromedriver_path}")
|
36 |
logger.info(f"ChromeDriver exists: {os.path.exists(chromedriver_path)}")
|
37 |
logger.info(f"ChromeDriver executable: {os.access(chromedriver_path, os.X_OK)}")
|
|
|
92 |
driver = helium.get_driver()
|
93 |
current_step = memory_step.step_number
|
94 |
if driver is not None:
|
95 |
+
# Clear old screenshots from earlier steps
|
96 |
for previous_memory_step in agent.memory.steps:
|
97 |
+
if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number < current_step:
|
98 |
previous_memory_step.observations_images = None
|
99 |
+
# Save new screenshot
|
100 |
png_bytes = driver.get_screenshot_as_png()
|
101 |
image = Image.open(BytesIO(png_bytes))
|
102 |
+
screenshot_dir = os.path.join(tempfile.gettempdir(), "web_agent_screenshots")
|
103 |
+
os.makedirs(screenshot_dir, exist_ok=True)
|
104 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
105 |
+
screenshot_filename = f"screenshot_step_{current_step}_{timestamp}.png"
|
106 |
+
screenshot_path = os.path.join(screenshot_dir, screenshot_filename)
|
107 |
image.save(screenshot_path)
|
108 |
logger.info(f"Saved screenshot to: {screenshot_path}")
|
109 |
+
# Update observations
|
110 |
url_info = f"Current url: {driver.current_url}"
|
111 |
memory_step.observations = (
|
112 |
url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
|
113 |
)
|
114 |
+
memory_step.observations_images = screenshot_path
|
115 |
return image
|
116 |
|
117 |
# Initialize model and agent
|
|
|
147 |
Never try to login in a page.
|
148 |
To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
|
149 |
Code:
|
150 |
+
scroll_down(num_pixels=1200)
|
151 |
```<end_code>
|
152 |
+
When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element.
|
153 |
Just use your built-in tool `close_popups` to close them:
|
154 |
Code:
|
155 |
close_popups()
|
|
|
161 |
```<end_code>
|
162 |
"""
|
163 |
|
164 |
+
# Chatbot interface function
|
165 |
+
def run_agent_chat(user_input: str, history: list):
|
166 |
try:
|
167 |
+
# Extract URL and request from user input or history
|
168 |
+
if "http" in user_input:
|
169 |
+
url = user_input.split()[0] if user_input.startswith("http") else next((w for w in user_input.split() if w.startswith("http")), "")
|
170 |
+
request = user_input.replace(url, "").strip() or "Navigate to the URL and describe the page."
|
171 |
+
else:
|
172 |
+
url = "https://example.com" # Default URL if none provided
|
173 |
+
request = user_input
|
174 |
+
|
175 |
search_request = f"Please go to {url}. {request}"
|
176 |
agent_output = agent.run(search_request + helium_instructions)
|
177 |
+
|
178 |
+
# Collect the latest screenshot
|
179 |
+
latest_screenshot = None
|
180 |
+
for step in reversed(agent.memory.steps):
|
181 |
+
if isinstance(step, ActionStep) and step.observations_images:
|
182 |
+
latest_screenshot = step.observations_images
|
183 |
+
break
|
184 |
+
|
185 |
+
# Format output for chatbot
|
186 |
+
output = f"**Agent Output:**\n{agent_output}"
|
187 |
+
if latest_screenshot:
|
188 |
+
output += f"\n\n**Latest Screenshot:**"
|
189 |
+
|
190 |
+
return output, latest_screenshot
|
191 |
except Exception as e:
|
192 |
logger.error(f"Agent execution failed: {str(e)}")
|
193 |
return f"Error: {str(e)}", None
|
|
|
198 |
except:
|
199 |
logger.warning("Failed to close Chrome driver.")
|
200 |
|
201 |
+
# Launch GradioUI
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
202 |
if __name__ == "__main__":
|
203 |
+
GradioUI(agent, run_agent_chat).launch()
|