Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -18,6 +18,10 @@ from smolagents.agents import ActionStep
|
|
18 |
from tools.search_item_ctrl_f import SearchItemCtrlFTool
|
19 |
from tools.go_back import GoBackTool
|
20 |
from tools.close_popups import ClosePopupsTool
|
|
|
|
|
|
|
|
|
21 |
from tools.final_answer import FinalAnswerTool
|
22 |
from GRADIO_UI import GradioUI
|
23 |
|
@@ -31,18 +35,12 @@ logger.debug("Configuring litellm for gemini/gemini-2.0-flash")
|
|
31 |
# Load environment variables
|
32 |
load_dotenv()
|
33 |
hf_token = os.getenv("HF_TOKEN")
|
34 |
-
|
35 |
-
|
36 |
-
# Verify Gemini API key
|
37 |
-
if not gemini_api_key:
|
38 |
-
raise ValueError("GEMINI_API_KEY not found in environment variables. Please set it in .env.")
|
39 |
-
logger.debug(f"GEMINI_API_KEY found: {bool(os.getenv('GEMINI_API_KEY'))}")
|
40 |
|
41 |
# Warn about Anthropic key
|
42 |
if os.getenv("ANTHROPIC_API_KEY"):
|
43 |
logger.warning("ANTHROPIC_API_KEY found in environment. This may cause conflicts.")
|
44 |
|
45 |
-
|
46 |
login(hf_token, add_to_git_credential=False)
|
47 |
|
48 |
# Initialize Chrome driver
|
@@ -56,6 +54,7 @@ try:
|
|
56 |
chrome_options.add_argument("--window-position=0,0")
|
57 |
chrome_options.add_argument("--headless=new")
|
58 |
driver = webdriver.Chrome(options=chrome_options)
|
|
|
59 |
helium.set_driver(driver)
|
60 |
logger.info("Chrome driver initialized successfully.")
|
61 |
except Exception as e:
|
@@ -69,11 +68,12 @@ def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> str:
|
|
69 |
driver = helium.get_driver()
|
70 |
current_step = memory_step.step_number
|
71 |
if driver is not None:
|
72 |
-
# Clear old screenshots from earlier steps
|
73 |
for previous_memory_step in agent.memory.steps:
|
74 |
if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number < current_step:
|
75 |
previous_memory_step.observations_images = None
|
76 |
-
|
|
|
|
|
77 |
png_bytes = driver.get_screenshot_as_png()
|
78 |
image = Image.open(BytesIO(png_bytes))
|
79 |
screenshot_dir = os.path.join(tempfile.gettempdir(), "web_agent_screenshots")
|
@@ -82,8 +82,8 @@ def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> str:
|
|
82 |
screenshot_filename = f"screenshot_step_{current_step}_{timestamp}.png"
|
83 |
screenshot_path = os.path.join(screenshot_dir, screenshot_filename)
|
84 |
image.save(screenshot_path)
|
|
|
85 |
logger.info(f"Saved screenshot to: {screenshot_path}")
|
86 |
-
# Update observations
|
87 |
url_info = f"Current url: {driver.current_url}\nScreenshot saved at: {screenshot_path}"
|
88 |
memory_step.observations = (
|
89 |
url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
|
@@ -102,11 +102,15 @@ tools = [
|
|
102 |
SearchItemCtrlFTool(driver=driver),
|
103 |
GoBackTool(driver=driver),
|
104 |
ClosePopupsTool(driver=driver),
|
|
|
|
|
|
|
|
|
105 |
FinalAnswerTool()
|
106 |
]
|
107 |
|
108 |
# Initialize model
|
109 |
-
model = LiteLLMModel("gemini/gemini-2.0-flash")
|
110 |
|
111 |
# Initialize agent
|
112 |
agent = CodeAgent(
|
@@ -129,7 +133,9 @@ agent = CodeAgent(
|
|
129 |
"queue",
|
130 |
"time",
|
131 |
"collections",
|
132 |
-
"re"
|
|
|
|
|
133 |
]
|
134 |
)
|
135 |
agent.python_executor("from helium import *")
|
@@ -139,4 +145,4 @@ try:
|
|
139 |
GradioUI(agent).launch()
|
140 |
except KeyboardInterrupt:
|
141 |
driver.quit()
|
142 |
-
logger.info("Chrome driver closed on exit.")
|
|
|
18 |
from tools.search_item_ctrl_f import SearchItemCtrlFTool
|
19 |
from tools.go_back import GoBackTool
|
20 |
from tools.close_popups import ClosePopupsTool
|
21 |
+
from tools.scroll_page import tool as ScrollPageTool
|
22 |
+
from tools.scrape_text import tool as ScrapeTextTool
|
23 |
+
from tools.interact_element import tool as InteractElementTool
|
24 |
+
from tools.detect_elements import tool as DetectElementsTool
|
25 |
from tools.final_answer import FinalAnswerTool
|
26 |
from GRADIO_UI import GradioUI
|
27 |
|
|
|
35 |
# Load environment variables
|
36 |
load_dotenv()
|
37 |
hf_token = os.getenv("HF_TOKEN")
|
38 |
+
default_gemini_api_key = os.getenv("GOOGLE_API_KEY")
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
# Warn about Anthropic key
|
41 |
if os.getenv("ANTHROPIC_API_KEY"):
|
42 |
logger.warning("ANTHROPIC_API_KEY found in environment. This may cause conflicts.")
|
43 |
|
|
|
44 |
login(hf_token, add_to_git_credential=False)
|
45 |
|
46 |
# Initialize Chrome driver
|
|
|
54 |
chrome_options.add_argument("--window-position=0,0")
|
55 |
chrome_options.add_argument("--headless=new")
|
56 |
driver = webdriver.Chrome(options=chrome_options)
|
57 |
+
driver.implicitly_wait(5)
|
58 |
helium.set_driver(driver)
|
59 |
logger.info("Chrome driver initialized successfully.")
|
60 |
except Exception as e:
|
|
|
68 |
driver = helium.get_driver()
|
69 |
current_step = memory_step.step_number
|
70 |
if driver is not None:
|
|
|
71 |
for previous_memory_step in agent.memory.steps:
|
72 |
if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number < current_step:
|
73 |
previous_memory_step.observations_images = None
|
74 |
+
original_size = driver.get_window_size()
|
75 |
+
total_height = driver.execute_script("return document.body.scrollHeight")
|
76 |
+
driver.set_window_size(original_size['width'], total_height)
|
77 |
png_bytes = driver.get_screenshot_as_png()
|
78 |
image = Image.open(BytesIO(png_bytes))
|
79 |
screenshot_dir = os.path.join(tempfile.gettempdir(), "web_agent_screenshots")
|
|
|
82 |
screenshot_filename = f"screenshot_step_{current_step}_{timestamp}.png"
|
83 |
screenshot_path = os.path.join(screenshot_dir, screenshot_filename)
|
84 |
image.save(screenshot_path)
|
85 |
+
driver.set_window_size(original_size['width'], original_size['height'])
|
86 |
logger.info(f"Saved screenshot to: {screenshot_path}")
|
|
|
87 |
url_info = f"Current url: {driver.current_url}\nScreenshot saved at: {screenshot_path}"
|
88 |
memory_step.observations = (
|
89 |
url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
|
|
|
102 |
SearchItemCtrlFTool(driver=driver),
|
103 |
GoBackTool(driver=driver),
|
104 |
ClosePopupsTool(driver=driver),
|
105 |
+
ScrollPageTool,
|
106 |
+
ScrapeTextTool,
|
107 |
+
InteractElementTool,
|
108 |
+
DetectElementsTool,
|
109 |
FinalAnswerTool()
|
110 |
]
|
111 |
|
112 |
# Initialize model
|
113 |
+
model = LiteLLMModel("gemini/gemini-2.0-flash", api_key=default_gemini_api_key)
|
114 |
|
115 |
# Initialize agent
|
116 |
agent = CodeAgent(
|
|
|
133 |
"queue",
|
134 |
"time",
|
135 |
"collections",
|
136 |
+
"re",
|
137 |
+
"cv2",
|
138 |
+
"numpy"
|
139 |
]
|
140 |
)
|
141 |
agent.python_executor("from helium import *")
|
|
|
145 |
GradioUI(agent).launch()
|
146 |
except KeyboardInterrupt:
|
147 |
driver.quit()
|
148 |
+
logger.info("Chrome driver closed on exit.")
|