Firoj112 committed on
Commit
0a194bd
·
verified ·
1 Parent(s): d7878bd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -13
app.py CHANGED
@@ -18,6 +18,10 @@ from smolagents.agents import ActionStep
18
  from tools.search_item_ctrl_f import SearchItemCtrlFTool
19
  from tools.go_back import GoBackTool
20
  from tools.close_popups import ClosePopupsTool
 
 
 
 
21
  from tools.final_answer import FinalAnswerTool
22
  from GRADIO_UI import GradioUI
23
 
@@ -31,18 +35,12 @@ logger.debug("Configuring litellm for gemini/gemini-2.0-flash")
31
  # Load environment variables
32
  load_dotenv()
33
  hf_token = os.getenv("HF_TOKEN")
34
- gemini_api_key = os.getenv("GOOGLE_API_KEY")
35
-
36
- # Verify Gemini API key
37
- if not gemini_api_key:
38
- raise ValueError("GEMINI_API_KEY not found in environment variables. Please set it in .env.")
39
- logger.debug(f"GEMINI_API_KEY found: {bool(os.getenv('GEMINI_API_KEY'))}")
40
 
41
  # Warn about Anthropic key
42
  if os.getenv("ANTHROPIC_API_KEY"):
43
  logger.warning("ANTHROPIC_API_KEY found in environment. This may cause conflicts.")
44
 
45
-
46
  login(hf_token, add_to_git_credential=False)
47
 
48
  # Initialize Chrome driver
@@ -56,6 +54,7 @@ try:
56
  chrome_options.add_argument("--window-position=0,0")
57
  chrome_options.add_argument("--headless=new")
58
  driver = webdriver.Chrome(options=chrome_options)
 
59
  helium.set_driver(driver)
60
  logger.info("Chrome driver initialized successfully.")
61
  except Exception as e:
@@ -69,11 +68,12 @@ def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> str:
69
  driver = helium.get_driver()
70
  current_step = memory_step.step_number
71
  if driver is not None:
72
- # Clear old screenshots from earlier steps
73
  for previous_memory_step in agent.memory.steps:
74
  if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number < current_step:
75
  previous_memory_step.observations_images = None
76
- # Save new screenshot
 
 
77
  png_bytes = driver.get_screenshot_as_png()
78
  image = Image.open(BytesIO(png_bytes))
79
  screenshot_dir = os.path.join(tempfile.gettempdir(), "web_agent_screenshots")
@@ -82,8 +82,8 @@ def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> str:
82
  screenshot_filename = f"screenshot_step_{current_step}_{timestamp}.png"
83
  screenshot_path = os.path.join(screenshot_dir, screenshot_filename)
84
  image.save(screenshot_path)
 
85
  logger.info(f"Saved screenshot to: {screenshot_path}")
86
- # Update observations
87
  url_info = f"Current url: {driver.current_url}\nScreenshot saved at: {screenshot_path}"
88
  memory_step.observations = (
89
  url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
@@ -102,11 +102,15 @@ tools = [
102
  SearchItemCtrlFTool(driver=driver),
103
  GoBackTool(driver=driver),
104
  ClosePopupsTool(driver=driver),
 
 
 
 
105
  FinalAnswerTool()
106
  ]
107
 
108
  # Initialize model
109
- model = LiteLLMModel("gemini/gemini-2.0-flash")
110
 
111
  # Initialize agent
112
  agent = CodeAgent(
@@ -129,7 +133,9 @@ agent = CodeAgent(
129
  "queue",
130
  "time",
131
  "collections",
132
- "re"
 
 
133
  ]
134
  )
135
  agent.python_executor("from helium import *")
@@ -139,4 +145,4 @@ try:
139
  GradioUI(agent).launch()
140
  except KeyboardInterrupt:
141
  driver.quit()
142
- logger.info("Chrome driver closed on exit.")
 
18
  from tools.search_item_ctrl_f import SearchItemCtrlFTool
19
  from tools.go_back import GoBackTool
20
  from tools.close_popups import ClosePopupsTool
21
+ from tools.scroll_page import tool as ScrollPageTool
22
+ from tools.scrape_text import tool as ScrapeTextTool
23
+ from tools.interact_element import tool as InteractElementTool
24
+ from tools.detect_elements import tool as DetectElementsTool
25
  from tools.final_answer import FinalAnswerTool
26
  from GRADIO_UI import GradioUI
27
 
 
35
  # Load environment variables
36
  load_dotenv()
37
  hf_token = os.getenv("HF_TOKEN")
38
+ default_gemini_api_key = os.getenv("GOOGLE_API_KEY")
 
 
 
 
 
39
 
40
  # Warn about Anthropic key
41
  if os.getenv("ANTHROPIC_API_KEY"):
42
  logger.warning("ANTHROPIC_API_KEY found in environment. This may cause conflicts.")
43
 
 
44
  login(hf_token, add_to_git_credential=False)
45
 
46
  # Initialize Chrome driver
 
54
  chrome_options.add_argument("--window-position=0,0")
55
  chrome_options.add_argument("--headless=new")
56
  driver = webdriver.Chrome(options=chrome_options)
57
+ driver.implicitly_wait(5)
58
  helium.set_driver(driver)
59
  logger.info("Chrome driver initialized successfully.")
60
  except Exception as e:
 
68
  driver = helium.get_driver()
69
  current_step = memory_step.step_number
70
  if driver is not None:
 
71
  for previous_memory_step in agent.memory.steps:
72
  if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number < current_step:
73
  previous_memory_step.observations_images = None
74
+ original_size = driver.get_window_size()
75
+ total_height = driver.execute_script("return document.body.scrollHeight")
76
+ driver.set_window_size(original_size['width'], total_height)
77
  png_bytes = driver.get_screenshot_as_png()
78
  image = Image.open(BytesIO(png_bytes))
79
  screenshot_dir = os.path.join(tempfile.gettempdir(), "web_agent_screenshots")
 
82
  screenshot_filename = f"screenshot_step_{current_step}_{timestamp}.png"
83
  screenshot_path = os.path.join(screenshot_dir, screenshot_filename)
84
  image.save(screenshot_path)
85
+ driver.set_window_size(original_size['width'], original_size['height'])
86
  logger.info(f"Saved screenshot to: {screenshot_path}")
 
87
  url_info = f"Current url: {driver.current_url}\nScreenshot saved at: {screenshot_path}"
88
  memory_step.observations = (
89
  url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
 
102
  SearchItemCtrlFTool(driver=driver),
103
  GoBackTool(driver=driver),
104
  ClosePopupsTool(driver=driver),
105
+ ScrollPageTool,
106
+ ScrapeTextTool,
107
+ InteractElementTool,
108
+ DetectElementsTool,
109
  FinalAnswerTool()
110
  ]
111
 
112
  # Initialize model
113
+ model = LiteLLMModel("gemini/gemini-2.0-flash", api_key=default_gemini_api_key)
114
 
115
  # Initialize agent
116
  agent = CodeAgent(
 
133
  "queue",
134
  "time",
135
  "collections",
136
+ "re",
137
+ "cv2",
138
+ "numpy"
139
  ]
140
  )
141
  agent.python_executor("from helium import *")
 
145
  GradioUI(agent).launch()
146
  except KeyboardInterrupt:
147
  driver.quit()
148
+ logger.info("Chrome driver closed on exit.")