Firoj112 committed on
Commit
2c4cfd9
·
verified ·
1 Parent(s): 6c6b900

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -32
app.py CHANGED
@@ -1,19 +1,17 @@
1
  import os
2
  import gradio as gr
3
- from smolagents import CodeAgent, LiteLLMModel, tool
4
  from smolagents.agents import ActionStep
5
  import helium
6
  from selenium import webdriver
7
  from selenium.webdriver.common.by import By
8
  from selenium.webdriver.common.keys import Keys
9
  from io import BytesIO
10
- from google import genai
11
  from PIL import Image
12
  from datetime import datetime
13
  from dotenv import load_dotenv
14
  from huggingface_hub import login
15
  import tempfile
16
- from google.genai import types
17
  import logging
18
 
19
  # Set up logging
@@ -33,7 +31,7 @@ if not gemini_api_key:
33
  login(hf_token, add_to_git_credential=False)
34
 
35
  # Debug ChromeDriver path
36
- chromedriver_path = '/usr/bin/chromedriver' # Expected path for chromium-driver
37
  logger.info(f"Checking ChromeDriver at: {chromedriver_path}")
38
  logger.info(f"ChromeDriver exists: {os.path.exists(chromedriver_path)}")
39
  logger.info(f"ChromeDriver executable: {os.access(chromedriver_path, os.X_OK)}")
@@ -94,20 +92,26 @@ def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> Image.Image:
94
  driver = helium.get_driver()
95
  current_step = memory_step.step_number
96
  if driver is not None:
 
97
  for previous_memory_step in agent.memory.steps:
98
- if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
99
  previous_memory_step.observations_images = None
 
100
  png_bytes = driver.get_screenshot_as_png()
101
  image = Image.open(BytesIO(png_bytes))
102
- screenshot_dir = tempfile.gettempdir()
 
103
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
104
- screenshot_path = f"{screenshot_dir}/screenshot_step_{current_step}_{timestamp}.png"
 
105
  image.save(screenshot_path)
106
  logger.info(f"Saved screenshot to: {screenshot_path}")
 
107
  url_info = f"Current url: {driver.current_url}"
108
  memory_step.observations = (
109
  url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
110
  )
 
111
  return image
112
 
113
  # Initialize model and agent
@@ -143,9 +147,9 @@ In general stop your action after each button click to see what happens on your
143
  Never try to login in a page.
144
  To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
145
  Code:
146
- scroll_down(num_pixels=1200) # This will scroll one viewport down
147
  ```<end_code>
148
- When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
149
  Just use your built-in tool `close_popups` to close them:
150
  Code:
151
  close_popups()
@@ -157,17 +161,33 @@ if Text('Accept cookies?').exists():
157
  ```<end_code>
158
  """
159
 
160
- # Gradio interface function
161
- def run_agent(url: str, request: str):
162
  try:
 
 
 
 
 
 
 
 
163
  search_request = f"Please go to {url}. {request}"
164
  agent_output = agent.run(search_request + helium_instructions)
165
- screenshot_path = next(
166
- (f"{tempfile.gettempdir()}/screenshot_step_{step.step_number}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
167
- for step in agent.memory.steps if isinstance(step, ActionStep) and step.observations_images),
168
- None
169
- )
170
- return agent_output, screenshot_path
 
 
 
 
 
 
 
 
171
  except Exception as e:
172
  logger.error(f"Agent execution failed: {str(e)}")
173
  return f"Error: {str(e)}", None
@@ -178,19 +198,6 @@ def run_agent(url: str, request: str):
178
  except:
179
  logger.warning("Failed to close Chrome driver.")
180
 
181
- # Gradio interface
182
- with gr.Blocks() as demo:
183
- gr.Markdown("# Web Navigation Agent")
184
- url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com")
185
- request_input = gr.Textbox(label="Enter Request", placeholder="Describe what to do on the website")
186
- submit_button = gr.Button("Run Agent")
187
- output_text = gr.Textbox(label="Agent Output")
188
- output_image = gr.Image(label="Screenshot")
189
- submit_button.click(
190
- fn=run_agent,
191
- inputs=[url_input, request_input],
192
- outputs=[output_text, output_image]
193
- )
194
-
195
  if __name__ == "__main__":
196
- demo.launch()
 
1
  import os
2
  import gradio as gr
3
+ from smolagents import CodeAgent, LiteLLMModel, tool, GradioUI
4
  from smolagents.agents import ActionStep
5
  import helium
6
  from selenium import webdriver
7
  from selenium.webdriver.common.by import By
8
  from selenium.webdriver.common.keys import Keys
9
  from io import BytesIO
 
10
  from PIL import Image
11
  from datetime import datetime
12
  from dotenv import load_dotenv
13
  from huggingface_hub import login
14
  import tempfile
 
15
  import logging
16
 
17
  # Set up logging
 
31
  login(hf_token, add_to_git_credential=False)
32
 
33
  # Debug ChromeDriver path
34
+ chromedriver_path = '/usr/bin/chromedriver'
35
  logger.info(f"Checking ChromeDriver at: {chromedriver_path}")
36
  logger.info(f"ChromeDriver exists: {os.path.exists(chromedriver_path)}")
37
  logger.info(f"ChromeDriver executable: {os.access(chromedriver_path, os.X_OK)}")
 
92
  driver = helium.get_driver()
93
  current_step = memory_step.step_number
94
  if driver is not None:
95
+ # Clear old screenshots from earlier steps
96
  for previous_memory_step in agent.memory.steps:
97
+ if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number < current_step:
98
  previous_memory_step.observations_images = None
99
+ # Save new screenshot
100
  png_bytes = driver.get_screenshot_as_png()
101
  image = Image.open(BytesIO(png_bytes))
102
+ screenshot_dir = os.path.join(tempfile.gettempdir(), "web_agent_screenshots")
103
+ os.makedirs(screenshot_dir, exist_ok=True)
104
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
105
+ screenshot_filename = f"screenshot_step_{current_step}_{timestamp}.png"
106
+ screenshot_path = os.path.join(screenshot_dir, screenshot_filename)
107
  image.save(screenshot_path)
108
  logger.info(f"Saved screenshot to: {screenshot_path}")
109
+ # Update observations
110
  url_info = f"Current url: {driver.current_url}"
111
  memory_step.observations = (
112
  url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
113
  )
114
+ memory_step.observations_images = screenshot_path
115
  return image
116
 
117
  # Initialize model and agent
 
147
  Never try to login in a page.
148
  To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
149
  Code:
150
+ scroll_down(num_pixels=1200)
151
  ```<end_code>
152
+ When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element.
153
  Just use your built-in tool `close_popups` to close them:
154
  Code:
155
  close_popups()
 
161
  ```<end_code>
162
  """
163
 
164
+ # Chatbot interface function
165
+ def run_agent_chat(user_input: str, history: list):
166
  try:
167
+ # Extract URL and request from user input or history
168
+ if "http" in user_input:
169
+ url = user_input.split()[0] if user_input.startswith("http") else next((w for w in user_input.split() if w.startswith("http")), "")
170
+ request = user_input.replace(url, "").strip() or "Navigate to the URL and describe the page."
171
+ else:
172
+ url = "https://example.com" # Default URL if none provided
173
+ request = user_input
174
+
175
  search_request = f"Please go to {url}. {request}"
176
  agent_output = agent.run(search_request + helium_instructions)
177
+
178
+ # Collect the latest screenshot
179
+ latest_screenshot = None
180
+ for step in reversed(agent.memory.steps):
181
+ if isinstance(step, ActionStep) and step.observations_images:
182
+ latest_screenshot = step.observations_images
183
+ break
184
+
185
+ # Format output for chatbot
186
+ output = f"**Agent Output:**\n{agent_output}"
187
+ if latest_screenshot:
188
+ output += f"\n\n**Latest Screenshot:**"
189
+
190
+ return output, latest_screenshot
191
  except Exception as e:
192
  logger.error(f"Agent execution failed: {str(e)}")
193
  return f"Error: {str(e)}", None
 
198
  except:
199
  logger.warning("Failed to close Chrome driver.")
200
 
201
+ # Launch GradioUI
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  if __name__ == "__main__":
203
+ GradioUI(agent, run_agent_chat).launch()