Spaces:

Firoj112
/

WebAgents_

Running

App Files Community

Firoj112 committed 13 days ago

Commit

2c4cfd9

verified ·

1 Parent(s): 6c6b900

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -32

app.py CHANGED Viewed

@@ -1,19 +1,17 @@
 import os
 import gradio as gr
-from smolagents import CodeAgent, LiteLLMModel, tool
 from smolagents.agents import ActionStep
 import helium
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.keys import Keys
 from io import BytesIO
-from google import genai
 from PIL import Image
 from datetime import datetime
 from dotenv import load_dotenv
 from huggingface_hub import login
 import tempfile
-from google.genai import types
 import logging
 # Set up logging
@@ -33,7 +31,7 @@ if not gemini_api_key:
 login(hf_token, add_to_git_credential=False)
 # Debug ChromeDriver path
-chromedriver_path = '/usr/bin/chromedriver'  # Expected path for chromium-driver
 logger.info(f"Checking ChromeDriver at: {chromedriver_path}")
 logger.info(f"ChromeDriver exists: {os.path.exists(chromedriver_path)}")
 logger.info(f"ChromeDriver executable: {os.access(chromedriver_path, os.X_OK)}")
@@ -94,20 +92,26 @@ def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> Image.Image:
     driver = helium.get_driver()
     current_step = memory_step.step_number
     if driver is not None:
         for previous_memory_step in agent.memory.steps:
-            if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
                 previous_memory_step.observations_images = None
         png_bytes = driver.get_screenshot_as_png()
         image = Image.open(BytesIO(png_bytes))
-        screenshot_dir = tempfile.gettempdir()
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        screenshot_path = f"{screenshot_dir}/screenshot_step_{current_step}_{timestamp}.png"
         image.save(screenshot_path)
         logger.info(f"Saved screenshot to: {screenshot_path}")
         url_info = f"Current url: {driver.current_url}"
         memory_step.observations = (
             url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
         )
         return image
 # Initialize model and agent
@@ -143,9 +147,9 @@ In general stop your action after each button click to see what happens on your
 Never try to login in a page.
 To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
 Code:
-scroll_down(num_pixels=1200) # This will scroll one viewport down
 ```<end_code>
-When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
 Just use your built-in tool `close_popups` to close them:
 Code:
 close_popups()
@@ -157,17 +161,33 @@ if Text('Accept cookies?').exists():
 ```<end_code>
 """
-# Gradio interface function
-def run_agent(url: str, request: str):
     try:
         search_request = f"Please go to {url}. {request}"
         agent_output = agent.run(search_request + helium_instructions)
-        screenshot_path = next(
-            (f"{tempfile.gettempdir()}/screenshot_step_{step.step_number}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
-             for step in agent.memory.steps if isinstance(step, ActionStep) and step.observations_images),
-            None
-        )
-        return agent_output, screenshot_path
     except Exception as e:
         logger.error(f"Agent execution failed: {str(e)}")
         return f"Error: {str(e)}", None
@@ -178,19 +198,6 @@ def run_agent(url: str, request: str):
         except:
             logger.warning("Failed to close Chrome driver.")
-# Gradio interface
-with gr.Blocks() as demo:
-    gr.Markdown("# Web Navigation Agent")
-    url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com")
-    request_input = gr.Textbox(label="Enter Request", placeholder="Describe what to do on the website")
-    submit_button = gr.Button("Run Agent")
-    output_text = gr.Textbox(label="Agent Output")
-    output_image = gr.Image(label="Screenshot")
-    submit_button.click(
-        fn=run_agent,
-        inputs=[url_input, request_input],
-        outputs=[output_text, output_image]
-    )
 if __name__ == "__main__":
-    demo.launch()

 import os
 import gradio as gr
+from smolagents import CodeAgent, LiteLLMModel, tool, GradioUI
 from smolagents.agents import ActionStep
 import helium
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.keys import Keys
 from io import BytesIO
 from PIL import Image
 from datetime import datetime
 from dotenv import load_dotenv
 from huggingface_hub import login
 import tempfile
 import logging
 # Set up logging
 login(hf_token, add_to_git_credential=False)
 # Debug ChromeDriver path
+chromedriver_path = '/usr/bin/chromedriver'
 logger.info(f"Checking ChromeDriver at: {chromedriver_path}")
 logger.info(f"ChromeDriver exists: {os.path.exists(chromedriver_path)}")
 logger.info(f"ChromeDriver executable: {os.access(chromedriver_path, os.X_OK)}")
     driver = helium.get_driver()
     current_step = memory_step.step_number
     if driver is not None:
+        # Clear old screenshots from earlier steps
         for previous_memory_step in agent.memory.steps:
+            if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number < current_step:
                 previous_memory_step.observations_images = None
+        # Save new screenshot
         png_bytes = driver.get_screenshot_as_png()
         image = Image.open(BytesIO(png_bytes))
+        screenshot_dir = os.path.join(tempfile.gettempdir(), "web_agent_screenshots")
+        os.makedirs(screenshot_dir, exist_ok=True)
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        screenshot_filename = f"screenshot_step_{current_step}_{timestamp}.png"
+        screenshot_path = os.path.join(screenshot_dir, screenshot_filename)
         image.save(screenshot_path)
         logger.info(f"Saved screenshot to: {screenshot_path}")
+        # Update observations
         url_info = f"Current url: {driver.current_url}"
         memory_step.observations = (
             url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
         )
+        memory_step.observations_images = screenshot_path
         return image
 # Initialize model and agent
 Never try to login in a page.
 To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
 Code:
+scroll_down(num_pixels=1200)
 ```<end_code>
+When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element.
 Just use your built-in tool `close_popups` to close them:
 Code:
 close_popups()
 ```<end_code>
 """
+# Chatbot interface function
+def run_agent_chat(user_input: str, history: list):
     try:
+        # Extract URL and request from user input or history
+        if "http" in user_input:
+            url = user_input.split()[0] if user_input.startswith("http") else next((w for w in user_input.split() if w.startswith("http")), "")
+            request = user_input.replace(url, "").strip() or "Navigate to the URL and describe the page."
+        else:
+            url = "https://example.com"  # Default URL if none provided
+            request = user_input
         search_request = f"Please go to {url}. {request}"
         agent_output = agent.run(search_request + helium_instructions)
+        # Collect the latest screenshot
+        latest_screenshot = None
+        for step in reversed(agent.memory.steps):
+            if isinstance(step, ActionStep) and step.observations_images:
+                latest_screenshot = step.observations_images
+                break
+        # Format output for chatbot
+        output = f"**Agent Output:**\n{agent_output}"
+        if latest_screenshot:
+            output += f"\n\n**Latest Screenshot:**"
+        return output, latest_screenshot
     except Exception as e:
         logger.error(f"Agent execution failed: {str(e)}")
         return f"Error: {str(e)}", None
         except:
             logger.warning("Failed to close Chrome driver.")
+# Launch GradioUI
 if __name__ == "__main__":
+    GradioUI(agent, run_agent_chat).launch()