Spaces:

Firoj112
/

WebAgents_

Running

App Files Community

Firoj112 committed 9 days ago

Commit

0a194bd

verified ·

1 Parent(s): d7878bd

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -13

app.py CHANGED Viewed

@@ -18,6 +18,10 @@ from smolagents.agents import ActionStep
 from tools.search_item_ctrl_f import SearchItemCtrlFTool
 from tools.go_back import GoBackTool
 from tools.close_popups import ClosePopupsTool
 from tools.final_answer import FinalAnswerTool
 from GRADIO_UI import GradioUI
@@ -31,18 +35,12 @@ logger.debug("Configuring litellm for gemini/gemini-2.0-flash")
 # Load environment variables
 load_dotenv()
 hf_token = os.getenv("HF_TOKEN")
-gemini_api_key = os.getenv("GOOGLE_API_KEY")
-# Verify Gemini API key
-if not gemini_api_key:
-    raise ValueError("GEMINI_API_KEY not found in environment variables. Please set it in .env.")
-logger.debug(f"GEMINI_API_KEY found: {bool(os.getenv('GEMINI_API_KEY'))}")
 # Warn about Anthropic key
 if os.getenv("ANTHROPIC_API_KEY"):
     logger.warning("ANTHROPIC_API_KEY found in environment. This may cause conflicts.")
 login(hf_token, add_to_git_credential=False)
 # Initialize Chrome driver
@@ -56,6 +54,7 @@ try:
     chrome_options.add_argument("--window-position=0,0")
     chrome_options.add_argument("--headless=new")
     driver = webdriver.Chrome(options=chrome_options)
     helium.set_driver(driver)
     logger.info("Chrome driver initialized successfully.")
 except Exception as e:
@@ -69,11 +68,12 @@ def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> str:
     driver = helium.get_driver()
     current_step = memory_step.step_number
     if driver is not None:
-        # Clear old screenshots from earlier steps
         for previous_memory_step in agent.memory.steps:
             if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number < current_step:
                 previous_memory_step.observations_images = None
-        # Save new screenshot
         png_bytes = driver.get_screenshot_as_png()
         image = Image.open(BytesIO(png_bytes))
         screenshot_dir = os.path.join(tempfile.gettempdir(), "web_agent_screenshots")
@@ -82,8 +82,8 @@ def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> str:
         screenshot_filename = f"screenshot_step_{current_step}_{timestamp}.png"
         screenshot_path = os.path.join(screenshot_dir, screenshot_filename)
         image.save(screenshot_path)
         logger.info(f"Saved screenshot to: {screenshot_path}")
-        # Update observations
         url_info = f"Current url: {driver.current_url}\nScreenshot saved at: {screenshot_path}"
         memory_step.observations = (
             url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
@@ -102,11 +102,15 @@ tools = [
     SearchItemCtrlFTool(driver=driver),
     GoBackTool(driver=driver),
     ClosePopupsTool(driver=driver),
     FinalAnswerTool()
 ]
 # Initialize model
-model = LiteLLMModel("gemini/gemini-2.0-flash")
 # Initialize agent
 agent = CodeAgent(
@@ -129,7 +133,9 @@ agent = CodeAgent(
         "queue",
         "time",
         "collections",
-        "re"
     ]
 )
 agent.python_executor("from helium import *")
@@ -139,4 +145,4 @@ try:
     GradioUI(agent).launch()
 except KeyboardInterrupt:
     driver.quit()
-    logger.info("Chrome driver closed on exit.")

 from tools.search_item_ctrl_f import SearchItemCtrlFTool
 from tools.go_back import GoBackTool
 from tools.close_popups import ClosePopupsTool
+from tools.scroll_page import tool as ScrollPageTool
+from tools.scrape_text import tool as ScrapeTextTool
+from tools.interact_element import tool as InteractElementTool
+from tools.detect_elements import tool as DetectElementsTool
 from tools.final_answer import FinalAnswerTool
 from GRADIO_UI import GradioUI
 # Load environment variables
 load_dotenv()
 hf_token = os.getenv("HF_TOKEN")
+default_gemini_api_key = os.getenv("GOOGLE_API_KEY")
 # Warn about Anthropic key
 if os.getenv("ANTHROPIC_API_KEY"):
     logger.warning("ANTHROPIC_API_KEY found in environment. This may cause conflicts.")
 login(hf_token, add_to_git_credential=False)
 # Initialize Chrome driver
     chrome_options.add_argument("--window-position=0,0")
     chrome_options.add_argument("--headless=new")
     driver = webdriver.Chrome(options=chrome_options)
+    driver.implicitly_wait(5)
     helium.set_driver(driver)
     logger.info("Chrome driver initialized successfully.")
 except Exception as e:
     driver = helium.get_driver()
     current_step = memory_step.step_number
     if driver is not None:
         for previous_memory_step in agent.memory.steps:
             if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number < current_step:
                 previous_memory_step.observations_images = None
+        original_size = driver.get_window_size()
+        total_height = driver.execute_script("return document.body.scrollHeight")
+        driver.set_window_size(original_size['width'], total_height)
         png_bytes = driver.get_screenshot_as_png()
         image = Image.open(BytesIO(png_bytes))
         screenshot_dir = os.path.join(tempfile.gettempdir(), "web_agent_screenshots")
         screenshot_filename = f"screenshot_step_{current_step}_{timestamp}.png"
         screenshot_path = os.path.join(screenshot_dir, screenshot_filename)
         image.save(screenshot_path)
+        driver.set_window_size(original_size['width'], original_size['height'])
         logger.info(f"Saved screenshot to: {screenshot_path}")
         url_info = f"Current url: {driver.current_url}\nScreenshot saved at: {screenshot_path}"
         memory_step.observations = (
             url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
     SearchItemCtrlFTool(driver=driver),
     GoBackTool(driver=driver),
     ClosePopupsTool(driver=driver),
+    ScrollPageTool,
+    ScrapeTextTool,
+    InteractElementTool,
+    DetectElementsTool,
     FinalAnswerTool()
 ]
 # Initialize model
+model = LiteLLMModel("gemini/gemini-2.0-flash", api_key=default_gemini_api_key)
 # Initialize agent
 agent = CodeAgent(
         "queue",
         "time",
         "collections",
+        "re",
+        "cv2",
+        "numpy"
     ]
 )
 agent.python_executor("from helium import *")
     GradioUI(agent).launch()
 except KeyboardInterrupt:
     driver.quit()
+    logger.info("Chrome driver closed on exit.")