# WebAgents_/tools/scrape_text.py
# Uploaded by Firoj112 - commit e9ed5be (verified): "Create scrape_text.py"
# Hosting-page residue: raw / history / blame links, file size 2.21 kB
from smolagents.tools import Tool
from helium import S
from selenium.webdriver.common.by import By
import json
def scrape_text(driver, selector="p", extract_table=False):
    """
    Scrape text or table data from elements matching a CSS selector on the current page.

    Args:
        driver: Selenium WebDriver instance (any object exposing
            ``find_elements(by, value)`` works).
        selector (str): CSS selector to target elements (default: 'p' for
            paragraphs). When ``extract_table`` is True the selector should
            match the table elements themselves, because rows (``tr``) are
            looked up inside each match.
        extract_table (bool): If True, extract table data as JSON
            (default: False).

    Returns:
        str: Always a string, one of:
            * text mode: the non-empty, stripped texts of the matched
              elements joined by newlines, or "No text found for selector";
            * table mode: a JSON array with one entry per non-empty table,
              each a list of rows, each row a list of stripped cell strings
              (empty cells are kept as "" so columns stay aligned), or
              "No tables found for selector" / "No table data found";
            * on any exception: "Failed to scrape with selector ...: <error>".
    """
    try:
        elements = driver.find_elements(By.CSS_SELECTOR, selector)

        if not extract_table:
            # Read .text once per element: every access is a WebDriver round trip.
            stripped = (element.text.strip() for element in elements)
            text_list = [text for text in stripped if text]
            return "\n".join(text_list) if text_list else "No text found for selector"

        if not elements:
            return "No tables found for selector"

        table_data = []
        for table in elements:
            table_rows = []
            for row in table.find_elements(By.TAG_NAME, "tr"):
                # "th, td" returns header and data cells together in document
                # order, so rows that mix both (e.g. a row-header column) keep
                # every cell.
                cells = row.find_elements(By.CSS_SELECTOR, "th, td")
                # Keep empty cells so each value stays under its own column;
                # only rows with no text at all are skipped.
                row_data = [cell.text.strip() for cell in cells]
                if any(row_data):
                    table_rows.append(row_data)
            if table_rows:
                table_data.append(table_rows)
        return json.dumps(table_data) if table_data else "No table data found"
    except Exception as e:
        # Deliberate best-effort: the tool reports failures as text instead of
        # raising, so the calling agent can read the error and react.
        return f"Failed to scrape with selector {selector}: {str(e)}"
# Register the tool.
# Type names follow the smolagents schema vocabulary ("string", "boolean"),
# not Python type names; optional inputs are marked nullable.
# NOTE(review): scrape_text also takes a `driver` first argument that is not
# declared in `inputs` - confirm how the caller binds the WebDriver.
# NOTE(review): smolagents documents Tool as a base class to subclass (or the
# @tool decorator); confirm that constructing it directly with keyword
# arguments registers these fields in the installed version.
tool = Tool(
    name="scrape_text",
    description="Scrapes text or table data from elements matching a CSS selector on the current page.",
    inputs={
        "selector": {
            "type": "string",
            "default": "p",
            "nullable": True,
            "description": "CSS selector to target elements",
        },
        "extract_table": {
            "type": "boolean",
            "default": False,
            "nullable": True,
            "description": "If True, extract table data as JSON",
        },
    },
    output_type="string",
    function=scrape_text,
)