Firoj112 committed on
Commit
5dee8aa
·
verified ·
1 Parent(s): ec6a434

Update tools/scrape_text.py

Browse files
Files changed (1) hide show
  1. tools/scrape_text.py +37 -54
tools/scrape_text.py CHANGED
@@ -3,58 +3,41 @@ from helium import S
3
  from selenium.webdriver.common.by import By
4
  import json
5
 
6
def scrape_text(driver, selector="p", extract_table=False):
    """
    Scrape text or table data from elements matching a CSS selector on the current page.

    Args:
        driver: Selenium WebDriver instance
        selector (str): CSS selector to target elements (default: 'p' for paragraphs)
        extract_table (bool): If True, treat every match as a table and return its
            rows as a JSON string (default: False)

    Returns:
        str: Newline-joined text of the matched elements, or, when extract_table is
            True, a JSON string shaped as tables -> rows -> cell texts. A plain
            message is returned instead when nothing is found or scraping raises.
    """
    try:
        matches = driver.find_elements(By.CSS_SELECTOR, selector)

        if not extract_table:
            # Read .text once per element; each access is a WebDriver round-trip.
            stripped = (element.text.strip() for element in matches)
            text_list = [text for text in stripped if text]
            return "\n".join(text_list) if text_list else "No text found for selector"

        if not matches:
            return "No tables found for selector"

        table_data = []
        for table in matches:
            table_rows = []
            for row in table.find_elements(By.TAG_NAME, "tr"):
                # A single query returns header and data cells in document order,
                # so rows that mix <th> and <td> keep every cell.
                cells = row.find_elements(By.CSS_SELECTOR, "th, td")
                # Empty cells are kept so columns stay aligned across rows;
                # only rows with no text at all are dropped.
                row_data = [cell.text.strip() for cell in cells]
                if any(row_data):
                    table_rows.append(row_data)
            if table_rows:
                table_data.append(table_rows)
        return json.dumps(table_data) if table_data else "No table data found"
    except Exception as e:
        # Deliberately broad: the caller receives the failure as text
        # instead of an exception.
        return f"Failed to scrape with selector {selector}: {e}"
41
 
42
# Register the tool.
# NOTE(review): assumes Tool's constructor accepts these keyword arguments
# (including `function=`); Tool's import is not visible here — confirm.
scrape_text_tool = Tool(
    name="scrape_text",
    description="Scrapes text or table data from elements matching a CSS selector on the current page.",
    inputs={
        "selector": {"type": "str", "default": "p", "description": "CSS selector to target elements"},
        "extract_table": {"type": "bool", "default": False, "description": "If True, extract table data as JSON"},
    },
    output_type="str",
    function=scrape_text,
)
 
 
 
 
 
 
 
 
 
 
 
3
  from selenium.webdriver.common.by import By
4
  import json
5
 
6
class ScrapeTextTool(Tool):
    """Tool that scrapes text or table data from the page loaded in a Selenium driver.

    NOTE(review): the schema below assumes `Tool` is the smolagents-style base
    class (JSON-schema type names, `forward` parameters matching `inputs`);
    its import is not visible here — confirm.
    """

    name = "scrape_text"
    description = "Scrapes text or table data from elements matching a CSS selector on the current page."
    inputs = {
        "selector": {
            "type": "string",
            "default": "p",
            "nullable": True,  # optional: forward() supplies a default
            "description": "CSS selector to target elements",
        },
        "extract_table": {
            "type": "boolean",
            "default": False,
            "nullable": True,  # optional: forward() supplies a default
            "description": "If True, extract table data as JSON",
        },
    }
    output_type = "string"

    def __init__(self, driver):
        """Store the Selenium WebDriver used for every scrape.

        Args:
            driver: Selenium WebDriver instance pointing at the page to scrape.
        """
        # Let the base class set up its own state before adding ours.
        super().__init__()
        self.driver = driver

    def forward(self, selector: str = "p", extract_table: bool = False) -> str:
        """Scrape the elements matching `selector` on the current page.

        Args:
            selector: CSS selector to target elements (default: 'p').
            extract_table: If True, treat every match as a table and return its
                rows as a JSON string shaped as tables -> rows -> cell texts.

        Returns:
            Newline-joined element text, a JSON string of table data, or a plain
            message when nothing is found or scraping raises.
        """
        try:
            matches = self.driver.find_elements(By.CSS_SELECTOR, selector)

            if not extract_table:
                # Read .text once per element; each access is a WebDriver round-trip.
                stripped = (element.text.strip() for element in matches)
                text_list = [text for text in stripped if text]
                return "\n".join(text_list) if text_list else "No text found for selector"

            if not matches:
                return "No tables found for selector"

            table_data = []
            for table in matches:
                table_rows = []
                for row in table.find_elements(By.TAG_NAME, "tr"):
                    # A single query returns header and data cells in document
                    # order, so rows that mix <th> and <td> keep every cell.
                    cells = row.find_elements(By.CSS_SELECTOR, "th, td")
                    # Empty cells are kept so columns stay aligned across rows;
                    # only rows with no text at all are dropped.
                    row_data = [cell.text.strip() for cell in cells]
                    if any(row_data):
                        table_rows.append(row_data)
                if table_rows:
                    table_data.append(table_rows)
            return json.dumps(table_data) if table_data else "No table data found"
        except Exception as e:
            # Deliberately broad: the caller receives the failure as text
            # instead of an exception.
            return f"Failed to scrape with selector {selector}: {e}"