Firoj112 committed on
Commit
5c0ab7d
·
verified ·
1 Parent(s): 69dbdbd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -143
app.py CHANGED
@@ -1,18 +1,24 @@
1
  import os
2
- import gradio as gr
3
- from smolagents import CodeAgent, LiteLLMModel, tool
4
- from smolagents.agents import ActionStep
5
- import helium
 
6
  from selenium import webdriver
7
  from selenium.webdriver.common.by import By
8
  from selenium.webdriver.common.keys import Keys
9
  from io import BytesIO
10
  from PIL import Image
11
  from datetime import datetime
12
- from dotenv import load_dotenv
13
- from huggingface_hub import login
14
  import tempfile
15
- import logging
 
 
 
 
 
 
 
16
 
17
  # Set up logging
18
  logging.basicConfig(level=logging.INFO)
@@ -30,44 +36,6 @@ if not gemini_api_key:
30
 
31
  login(hf_token, add_to_git_credential=False)
32
 
33
- # Debug ChromeDriver path
34
- chromedriver_path = '/usr/bin/chromedriver'
35
- logger.info(f"Checking ChromeDriver at: {chromedriver_path}")
36
- logger.info(f"ChromeDriver exists: {os.path.exists(chromedriver_path)}")
37
- logger.info(f"ChromeDriver executable: {os.access(chromedriver_path, os.X_OK)}")
38
- logger.info(f"System PATH: {os.environ.get('PATH')}")
39
-
40
- # Define tools
41
- @tool
42
- def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
43
- """
44
- Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
45
- Args:
46
- text: The text to search for
47
- nth_result: Which occurrence to jump to (default: 1)
48
- """
49
- elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
50
- if nth_result > len(elements):
51
- raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
52
- result = f"Found {len(elements)} matches for '{text}'."
53
- elem = elements[nth_result - 1]
54
- driver.execute_script("arguments[0].scrollIntoView(true);", elem)
55
- result += f"Focused on element {nth_result} of {len(elements)}"
56
- return result
57
-
58
- @tool
59
- def go_back() -> None:
60
- """Goes back to previous page."""
61
- driver.back()
62
-
63
- @tool
64
- def close_popups() -> str:
65
- """
66
- Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows!
67
- This does not work on cookie consent banners.
68
- """
69
- webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
70
-
71
  # Initialize Chrome driver
72
  try:
73
  chrome_options = webdriver.ChromeOptions()
@@ -113,108 +81,53 @@ def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> str:
113
  )
114
  return screenshot_path
115
 
116
- # Initialize model and agent
117
- model = LiteLLMModel("gemini/gemini-2.0-flash")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  agent = CodeAgent(
119
- tools=[go_back, close_popups, search_item_ctrl_f],
120
  model=model,
121
- additional_authorized_imports=["helium"],
122
- step_callbacks=[save_screenshot],
123
  max_steps=20,
124
  verbosity_level=2,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  )
126
  agent.python_executor("from helium import *")
127
 
128
- # Helium instructions
129
- helium_instructions = """
130
- You can use helium to access websites. Don't bother about the helium driver, it's already managed.
131
- We've already ran "from helium import *"
132
- Then you can go to pages!
133
- Code:
134
- go_to('github.com/trending')
135
- ```<end_code>
136
- You can directly click clickable elements by inputting the text that appears on them.
137
- Code:
138
- click("Top products")
139
- ```<end_code>
140
- If it's a link:
141
- Code:
142
- click(Link("Top products"))
143
- ```<end_code>
144
- If you try to interact with an element and it's not found, you'll get a LookupError.
145
- In general stop your action after each button click to see what happens on your screenshot.
146
- Never try to login in a page.
147
- To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
148
- Code:
149
- scroll_down(num_pixels=1200)
150
- ```<end_code>
151
- When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element.
152
- Just use your built-in tool `close_popups` to close them:
153
- Code:
154
- close_popups()
155
- ```<end_code>
156
- You can use .exists() to check for the existence of an element. For example:
157
- Code:
158
- if Text('Accept cookies?').exists():
159
- click('I accept')
160
- ```<end_code>
161
- """
162
-
163
- # Chatbot interface function
164
- def run_agent_chat(user_input: str, history: list):
165
- try:
166
- # Extract URL and request from user input or history
167
- if "http" in user_input:
168
- url = user_input.split()[0] if user_input.startswith("http") else next((w for w in user_input.split() if w.startswith("http")), "")
169
- request = user_input.replace(url, "").strip() or "Navigate to the URL and describe the page."
170
- else:
171
- url = "[invalid url, do not cite]"
172
- request = user_input
173
-
174
- search_request = f"Please go to {url}. {request}"
175
- agent_output = agent.run(search_request + helium_instructions)
176
-
177
- # Collect the latest screenshot path from observations
178
- latest_screenshot = None
179
- for step in reversed(agent.memory.steps):
180
- if isinstance(step, ActionStep) and step.observations:
181
- # Extract screenshot path from observations
182
- for line in step.observations.split("\n"):
183
- if line.startswith("Screenshot saved at:"):
184
- latest_screenshot = line.replace("Screenshot saved at: ", "").strip()
185
- break
186
- if latest_screenshot:
187
- break
188
-
189
- # Format output for chatbot
190
- output = f"**Agent Output:**\n{agent_output}"
191
- if latest_screenshot:
192
- output += f"\n\n**Latest Screenshot:**"
193
-
194
- return output, latest_screenshot
195
- except Exception as e:
196
- logger.error(f"Agent execution failed: {str(e)}")
197
- return f"Error: {str(e)}", None
198
-
199
- # Custom Gradio interface
200
- def process_input(user_input, history):
201
- if not user_input.strip():
202
- return history, None
203
- output, latest_screenshot = run_agent_chat(user_input, history)
204
- new_history = history + [[user_input, output]]
205
- return new_history, latest_screenshot
206
-
207
- if __name__ == "__main__":
208
- with gr.Blocks() as demo:
209
- gr.Markdown("# Web Navigation Agent")
210
- chatbot = gr.Chatbot(label="Chat")
211
- msg = gr.Textbox(placeholder="Enter URL and request (e.g., [invalid url, do not cite] Click on Developers)")
212
- btn = gr.Button("Send")
213
- image = gr.Image(label="Latest Screenshot")
214
- btn.click(process_input, inputs=[msg, chatbot], outputs=[chatbot, image])
215
- msg.submit(process_input, inputs=[msg, chatbot], outputs=[chatbot, image])
216
- try:
217
- demo.launch()
218
- except KeyboardInterrupt:
219
- driver.quit()
220
- logger.info("Chrome driver closed on exit.")
 
1
  import os
2
+ import json
3
+ import yaml
4
+ import logging
5
+ from dotenv import load_dotenv
6
+ from huggingface_hub import login
7
  from selenium import webdriver
8
  from selenium.webdriver.common.by import By
9
  from selenium.webdriver.common.keys import Keys
10
  from io import BytesIO
11
  from PIL import Image
12
  from datetime import datetime
 
 
13
  import tempfile
14
+ import helium
15
+ from smolagents import CodeAgent, LiteLLMModel
16
+ from smolagents.agents import ActionStep
17
+ from tools.search_item_ctrl_f import SearchItemCtrlFTool
18
+ from tools.go_back import GoBackTool
19
+ from tools.close_popups import ClosePopupsTool
20
+ from tools.final_answer import FinalAnswerTool
21
+ from GRADIO_UI import GradioUI
22
 
23
  # Set up logging
24
  logging.basicConfig(level=logging.INFO)
 
36
 
37
  login(hf_token, add_to_git_credential=False)
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  # Initialize Chrome driver
40
  try:
41
  chrome_options = webdriver.ChromeOptions()
 
81
  )
82
  return screenshot_path
83
 
84
+ # Load prompt templates
85
+ try:
86
+ with open("prompts.yaml", 'r') as stream:
87
+ prompt_templates = yaml.safe_load(stream)
88
+ except FileNotFoundError:
89
+ prompt_templates = {}
90
+
91
+ # Initialize tools
92
+ tools = [
93
+ SearchItemCtrlFTool(driver=driver),
94
+ GoBackTool(driver=driver),
95
+ ClosePopupsTool(driver=driver),
96
+ FinalAnswerTool()
97
+ ]
98
+
99
+ # Initialize model
100
+ model = LiteLLMModel(model_name="gemini/gemini-2.0-flash", api_key=gemini_api_key, max_tokens=2096, temperature=0.5)
101
+
102
+ # Initialize agent
103
  agent = CodeAgent(
 
104
  model=model,
105
+ tools=tools,
 
106
  max_steps=20,
107
  verbosity_level=2,
108
+ prompt_templates=prompt_templates,
109
+ step_callbacks=[save_screenshot],
110
+ authorized_imports=[
111
+ "helium",
112
+ "unicodedata",
113
+ "stat",
114
+ "datetime",
115
+ "random",
116
+ "pandas",
117
+ "itertools",
118
+ "math",
119
+ "statistics",
120
+ "queue",
121
+ "time",
122
+ "collections",
123
+ "re"
124
+ ]
125
  )
126
  agent.python_executor("from helium import *")
127
 
128
+ # Launch Gradio UI
129
+ try:
130
+ GradioUI(agent, file_upload_folder=os.path.join(tempfile.gettempdir(), "uploads")).launch()
131
+ except KeyboardInterrupt:
132
+ driver.quit()
133
+ logger.info("Chrome driver closed on exit.")