# File: WebAgents_/tools/scrape_text.py
# Last commit: b26ce75 (verified) by Firoj112 - "Update tools/scrape_text.py"
from smolagents.tools import Tool
from helium import S
from selenium.webdriver.common.by import By
import json
import logging
logger = logging.getLogger(__name__)
class ScrapeTextTool(Tool):
    """Scrape text or table data from the page currently loaded in a driver.

    Elements are located with a CSS selector through the driver object
    passed to the constructor. ``forward`` reports problems as message
    strings instead of raising, so every return value is a string.
    """

    name = "scrape_text"
    description = "Scrapes text or table data from elements matching a CSS selector on the current page."
    inputs = {
        "selector": {"type": "string", "default": "p", "nullable": False, "description": "CSS selector to target elements"},
        "extract_table": {"type": "boolean", "default": False, "nullable": False, "description": "If True, extract table data as JSON"}
    }
    output_type = "string"

    def __init__(self, driver):
        """Store ``driver`` and record whether one was actually supplied.

        Args:
            driver: Object exposing ``find_elements(by, value)``; presumably a
                Selenium WebDriver -- confirm against the caller.
        """
        super().__init__()
        self.driver = driver
        self.is_initialized = self.driver is not None
        logger.debug(f"ScrapeTextTool initialized: is_initialized={self.is_initialized}")

    def forward(self, selector="p", extract_table=False):
        """Return scraped content for ``selector`` as a string.

        Args:
            selector: CSS selector of the elements (or tables) to read.
            extract_table: When true, treat each match as a table and return
                its rows as JSON; otherwise return the matches' text joined
                with newlines.

        Returns:
            The scraped text or JSON, or a human-readable message when
            nothing matched, the tool has no driver, or scraping failed.
        """
        # NOTE(review): the flag alone may not be reliable if the base Tool
        # (or any other code) overwrites ``is_initialized``; checking the
        # driver itself keeps this guard meaningful -- confirm against
        # the smolagents Tool implementation.
        if not self.is_initialized or self.driver is None:
            return "Error: ScrapeTextTool is not initialized"
        try:
            if extract_table:
                return self._scrape_tables(selector)
            return self._scrape_text(selector)
        except Exception as e:
            # Keep the string-returning contract, but record the traceback
            # so failures are diagnosable from the logs.
            logger.exception("Failed to scrape with selector %s", selector)
            return f"Failed to scrape with selector {selector}: {str(e)}"

    def _scrape_text(self, selector):
        """Return the non-empty stripped text of every match, one per line."""
        elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
        # Read ``.text`` once per element: each access is a separate driver
        # call, so reading it twice is slower and the two reads can differ.
        stripped = (element.text.strip() for element in elements)
        text_list = [text for text in stripped if text]
        return "\n".join(text_list) if text_list else "No text found for selector"

    def _scrape_tables(self, selector):
        """Return matched tables as a JSON list of tables -> rows -> cells."""
        tables = self.driver.find_elements(By.CSS_SELECTOR, selector)
        if not tables:
            return "No tables found for selector"
        table_data = []
        for table in tables:
            table_rows = []
            for row in table.find_elements(By.TAG_NAME, "tr"):
                # Select header and data cells together so rows mixing
                # <th> and <td> keep every cell.
                cells = row.find_elements(By.CSS_SELECTOR, "th, td")
                # Keep empty cells so columns stay aligned across rows;
                # only rows with no text at all are dropped.
                row_data = [cell.text.strip() for cell in cells]
                if any(row_data):
                    table_rows.append(row_data)
            if table_rows:
                table_data.append(table_rows)
        return json.dumps(table_data) if table_data else "No table data found"