Sun Tao committed on
Commit
fbacfe2
·
1 Parent(s): a093f82

Update web_toolkit.py

Browse files
Files changed (1) hide show
  1. owl/camel/toolkits/web_toolkit.py +50 -35
owl/camel/toolkits/web_toolkit.py CHANGED
@@ -5,7 +5,6 @@ from playwright._impl._errors import TimeoutError
5
  from loguru import logger
6
  from typing import Any, Dict, List, TypedDict, Union, BinaryIO
7
  from PIL import Image, ImageDraw, ImageFont
8
- from firecrawl import FirecrawlApp
9
  from html2text import html2text
10
  from retry import retry
11
  from copy import deepcopy
@@ -16,6 +15,7 @@ from camel.messages import BaseMessage
16
  from camel.agents import ChatAgent
17
  from camel.models import ModelFactory, BaseModelBackend
18
  from camel.types import ModelType, ModelPlatformType
 
19
 
20
  import io
21
  import random
@@ -49,7 +49,8 @@ AVAILABLE_ACTIONS_PROMPT = """
49
 
50
  ACTION_WITH_FEEDBACK_LIST = [
51
  'ask_question_about_video',
52
- 'download_file_id'
 
53
  ]
54
 
55
 
@@ -127,18 +128,19 @@ def _parse_json_output(text: str) -> Dict[str, Any]:
127
  fixed_text = re.sub(r'`([^`]*)`', r'"\1"', text)
128
  return json.loads(fixed_text)
129
  except json.JSONDecodeError:
 
130
  result = {}
131
  try:
132
  bool_pattern = r'"(\w+)"\s*:\s*(true|false)'
133
  for match in re.finditer(bool_pattern, text, re.IGNORECASE):
134
  key, value = match.groups()
135
  result[key] = value.lower() == "true"
136
-
137
  str_pattern = r'"(\w+)"\s*:\s*"([^"]*)"'
138
  for match in re.finditer(str_pattern, text):
139
  key, value = match.groups()
140
  result[key] = value
141
-
142
  num_pattern = r'"(\w+)"\s*:\s*(-?\d+(?:\.\d+)?)'
143
  for match in re.finditer(num_pattern, text):
144
  key, value = match.groups()
@@ -221,7 +223,6 @@ def add_set_of_mark(
221
  if isinstance(screenshot, bytes):
222
  screenshot = io.BytesIO(screenshot)
223
 
224
- # TODO: Not sure why this cast was needed, but by this point screenshot is a binary file-like object
225
  image = Image.open(cast(BinaryIO, screenshot))
226
  comp, visible_rects, rects_above, rects_below = _add_set_of_mark(image, ROIs)
227
  image.close()
@@ -300,8 +301,7 @@ def _color(identifier: int) -> Tuple[int, int, int, int]:
300
  class BaseBrowser:
301
  def __init__(self,
302
  headless=True,
303
- cache_dir: Optional[str] = None,
304
- page_script_path: Optional[str] = None):
305
  r"""Initialize the WebBrowserToolkit instance.
306
 
307
  Args:
@@ -326,25 +326,23 @@ class BaseBrowser:
326
 
327
  self.page_history = [] # stores the history of visited pages
328
 
329
-
330
  # set the cache directory
331
  self.cache_dir = "tmp/"
332
  os.makedirs(self.cache_dir, exist_ok=True)
333
  if cache_dir is not None:
334
  self.cache_dir = cache_dir
335
-
336
  # load the page script
337
- if page_script_path is None:
338
- abs_dir_path = os.path.dirname(os.path.abspath(__file__))
339
- page_script_path = os.path.join(abs_dir_path, "page_script.js")
340
 
341
  try:
342
  with open(page_script_path, "r", encoding='utf-8') as f:
343
  self.page_script = f.read()
344
  f.close()
345
  except FileNotFoundError:
346
- logger.warning(f"Page script file not found: {page_script_path}")
347
-
348
 
349
  def init(self):
350
  r"""Initialize the browser."""
@@ -367,6 +365,7 @@ class BaseBrowser:
367
  # self.page.wait_for_load_state("networkidle", timeout=timeout_ms)
368
  # self.page.wait_for_load_state("domcontentloaded", timeout=timeout_ms)
369
  time.sleep(2)
 
370
 
371
  def click_blank_area(self):
372
  r"""Click a blank area of the page to unfocus the current element."""
@@ -531,7 +530,7 @@ class BaseBrowser:
531
  comp.save(f, "PNG")
532
  f.close()
533
 
534
- return comp, file_path
535
 
536
 
537
  def scroll_up(self) -> None:
@@ -551,13 +550,11 @@ class BaseBrowser:
551
  identifier = str(identifier)
552
  target = self.page.locator(f"[__elementId='{identifier}']")
553
 
554
- # See if it exists
555
  try:
556
  target.wait_for(timeout=5000)
557
  except TimeoutError:
558
  raise ValueError("No such element.") from None
559
 
560
- # Click it
561
  target.scroll_into_view_if_needed()
562
 
563
  new_page = None
@@ -571,15 +568,15 @@ class BaseBrowser:
571
  if new_page:
572
  self.page_history.append(deepcopy(self.page.url))
573
  self.page = new_page
 
574
  except PlaywrightError:
575
  pass
576
- # new_page = self
577
  self._wait_for_load()
578
 
579
 
580
  def extract_url_content(self):
581
  r"""Extract the content of the current page."""
582
- # TODO: update it using firecrawl
583
  content = self.page.content()
584
  return content
585
 
@@ -757,7 +754,6 @@ class BaseBrowser:
757
  }
758
  """)
759
 
760
-
761
  @retry(requests.RequestException)
762
  def get_webpage_content(self) -> str:
763
  self._wait_for_load()
@@ -767,22 +763,32 @@ class BaseBrowser:
767
  return markdown_content
768
 
769
 
770
-
771
  class WebToolkit(BaseToolkit):
 
 
 
 
772
  def __init__(self,
773
- headless=True,
774
  cache_dir: Optional[str] = None,
775
- page_script_path: Optional[str] = None,
776
  history_window: int = 5,
777
  web_agent_model: Optional[BaseModelBackend] = None,
778
  planning_agent_model: Optional[BaseModelBackend] = None,
779
- output_language: str = "en"
780
  ):
 
 
 
 
 
 
 
 
 
781
 
782
  self.browser = BaseBrowser(
783
  headless=headless,
784
- cache_dir=cache_dir,
785
- page_script_path=page_script_path
786
  )
787
 
788
  self.history_window = history_window
@@ -791,7 +797,6 @@ class WebToolkit(BaseToolkit):
791
  self.output_language = output_language
792
 
793
  self.history = []
794
- # self.search_toolkit = SearchToolkit()
795
  self.web_agent, self.planning_agent = self._initialize_agent()
796
 
797
 
@@ -915,6 +920,19 @@ Here are some tips for you:
915
  observation_result: str = resp_dict.get("observation", "")
916
  reasoning_result: str = resp_dict.get("reasoning", "")
917
  action_code: str = resp_dict.get("action_code", "")
 
 
 
 
 
 
 
 
 
 
 
 
 
918
  action_code = action_code.replace("`", "").strip()
919
 
920
  return observation_result, reasoning_result, action_code
@@ -961,8 +979,6 @@ Here are some tips for you:
961
  r"""Get the final answer based on the task prompt and current browser state.
962
  It is used when the agent thinks that the task can be completed without any further action, and answer can be directly found in the current viewport.
963
  """
964
- # screenshot, _ = self.browser.get_screenshot()
965
- # img = _reload_image(screenshot)
966
 
967
  prompt = f"""
968
  We are solving a complex web task which needs multi-step browser interaction. After the multi-step observation, reasoning and acting with web browser, we think that the task is currently solved.
@@ -1078,37 +1094,36 @@ Your output should be in json format, including the following fields:
1078
  return False, replanned_schema
1079
 
1080
 
 
1081
  def browser_simulation(self,
1082
  task_prompt: str,
1083
  start_url: str,
 
1084
  ) -> str:
1085
  r"""A powerful toolkit which can simulate the browser interaction to solve the task which needs multi-step actions.
1086
 
1087
  Args:
1088
  task_prompt (str): The task prompt to solve.
1089
  start_url (str): The start URL to visit.
 
1090
 
1091
  Returns:
1092
  str: The simulation result to the task.
1093
  """
1094
-
1095
- ROUND_LIMIT = 12
1096
-
1097
  self._reset()
1098
  task_completed = False
1099
-
1100
  detailed_plan = self._task_planning(task_prompt, start_url)
1101
  logger.debug(f"Detailed plan: {detailed_plan}")
1102
 
1103
  self.browser.init()
1104
  self.browser.visit_page(start_url)
1105
 
1106
- for i in range(ROUND_LIMIT):
1107
  observation, reasoning, action_code = self._observe(task_prompt, detailed_plan)
1108
  logger.debug(f"Observation: {observation}")
1109
  logger.debug(f"Reasoning: {reasoning}")
1110
  logger.debug(f"Action code: {action_code}")
1111
- # breakpoint()
1112
 
1113
  if "stop" in action_code:
1114
  task_completed = True
 
5
  from loguru import logger
6
  from typing import Any, Dict, List, TypedDict, Union, BinaryIO
7
  from PIL import Image, ImageDraw, ImageFont
 
8
  from html2text import html2text
9
  from retry import retry
10
  from copy import deepcopy
 
15
  from camel.agents import ChatAgent
16
  from camel.models import ModelFactory, BaseModelBackend
17
  from camel.types import ModelType, ModelPlatformType
18
+ from camel.utils import dependencies_required
19
 
20
  import io
21
  import random
 
49
 
50
  ACTION_WITH_FEEDBACK_LIST = [
51
  'ask_question_about_video',
52
+ 'download_file_id',
53
+ 'find_text_on_page',
54
  ]
55
 
56
 
 
128
  fixed_text = re.sub(r'`([^`]*)`', r'"\1"', text)
129
  return json.loads(fixed_text)
130
  except json.JSONDecodeError:
131
+ # Try to extract key fields
132
  result = {}
133
  try:
134
  bool_pattern = r'"(\w+)"\s*:\s*(true|false)'
135
  for match in re.finditer(bool_pattern, text, re.IGNORECASE):
136
  key, value = match.groups()
137
  result[key] = value.lower() == "true"
138
+
139
  str_pattern = r'"(\w+)"\s*:\s*"([^"]*)"'
140
  for match in re.finditer(str_pattern, text):
141
  key, value = match.groups()
142
  result[key] = value
143
+
144
  num_pattern = r'"(\w+)"\s*:\s*(-?\d+(?:\.\d+)?)'
145
  for match in re.finditer(num_pattern, text):
146
  key, value = match.groups()
 
223
  if isinstance(screenshot, bytes):
224
  screenshot = io.BytesIO(screenshot)
225
 
 
226
  image = Image.open(cast(BinaryIO, screenshot))
227
  comp, visible_rects, rects_above, rects_below = _add_set_of_mark(image, ROIs)
228
  image.close()
 
301
  class BaseBrowser:
302
  def __init__(self,
303
  headless=True,
304
+ cache_dir: Optional[str] = None):
 
305
  r"""Initialize the WebBrowserToolkit instance.
306
 
307
  Args:
 
326
 
327
  self.page_history = [] # stores the history of visited pages
328
 
 
329
  # set the cache directory
330
  self.cache_dir = "tmp/"
331
  os.makedirs(self.cache_dir, exist_ok=True)
332
  if cache_dir is not None:
333
  self.cache_dir = cache_dir
334
+
335
  # load the page script
336
+ abs_dir_path = os.path.dirname(os.path.abspath(__file__))
337
+ page_script_path = os.path.join(abs_dir_path, "page_script.js")
 
338
 
339
  try:
340
  with open(page_script_path, "r", encoding='utf-8') as f:
341
  self.page_script = f.read()
342
  f.close()
343
  except FileNotFoundError:
344
+ raise FileNotFoundError(f"Page script file not found at path: {page_script_path}")
345
+
346
 
347
  def init(self):
348
  r"""Initialize the browser."""
 
365
  # self.page.wait_for_load_state("networkidle", timeout=timeout_ms)
366
  # self.page.wait_for_load_state("domcontentloaded", timeout=timeout_ms)
367
  time.sleep(2)
368
+
369
 
370
  def click_blank_area(self):
371
  r"""Click a blank area of the page to unfocus the current element."""
 
530
  comp.save(f, "PNG")
531
  f.close()
532
 
533
+ return comp, file_path
534
 
535
 
536
  def scroll_up(self) -> None:
 
550
  identifier = str(identifier)
551
  target = self.page.locator(f"[__elementId='{identifier}']")
552
 
 
553
  try:
554
  target.wait_for(timeout=5000)
555
  except TimeoutError:
556
  raise ValueError("No such element.") from None
557
 
 
558
  target.scroll_into_view_if_needed()
559
 
560
  new_page = None
 
568
  if new_page:
569
  self.page_history.append(deepcopy(self.page.url))
570
  self.page = new_page
571
+
572
  except PlaywrightError:
573
  pass
574
+
575
  self._wait_for_load()
576
 
577
 
578
  def extract_url_content(self):
579
  r"""Extract the content of the current page."""
 
580
  content = self.page.content()
581
  return content
582
 
 
754
  }
755
  """)
756
 
 
757
  @retry(requests.RequestException)
758
  def get_webpage_content(self) -> str:
759
  self._wait_for_load()
 
763
  return markdown_content
764
 
765
 
 
766
  class WebToolkit(BaseToolkit):
767
+ r"""A class for browsing the web and interacting with web pages.
768
+
769
+ This class provides methods for browsing the web and interacting with web pages.
770
+ """
771
  def __init__(self,
772
+ headless: bool = True,
773
  cache_dir: Optional[str] = None,
 
774
  history_window: int = 5,
775
  web_agent_model: Optional[BaseModelBackend] = None,
776
  planning_agent_model: Optional[BaseModelBackend] = None,
777
+ output_language: str = "en",
778
  ):
779
+ r"""Initialize the WebToolkit instance.
780
+
781
+ Args:
782
+ headless (bool): Whether to run the browser in headless mode.
783
+ cache_dir (Union[str, None]): The directory to store cache files.
784
+ history_window (int): The window size for storing the history of actions.
785
+ web_agent_model (Optional[BaseModelBackend]): The model backend for the web agent.
786
+ planning_agent_model (Optional[BaseModelBackend]): The model backend for the planning agent.
787
+ """
788
 
789
  self.browser = BaseBrowser(
790
  headless=headless,
791
+ cache_dir=cache_dir
 
792
  )
793
 
794
  self.history_window = history_window
 
797
  self.output_language = output_language
798
 
799
  self.history = []
 
800
  self.web_agent, self.planning_agent = self._initialize_agent()
801
 
802
 
 
920
  observation_result: str = resp_dict.get("observation", "")
921
  reasoning_result: str = resp_dict.get("reasoning", "")
922
  action_code: str = resp_dict.get("action_code", "")
923
+
924
+ if action_code and "(" in action_code and ")" not in action_code:
925
+ action_match = re.search(r'"action_code"\s*:\s*[`"]([^`"]*\([^)]*\))[`"]', resp_content)
926
+ if action_match:
927
+ action_code = action_match.group(1)
928
+ else:
929
+ logger.warning(f"Incomplete action_code detected: {action_code}")
930
+ if action_code.startswith("fill_input_id("):
931
+ parts = action_code.split(",", 1)
932
+ if len(parts) > 1:
933
+ id_part = parts[0].replace("fill_input_id(", "").strip()
934
+ action_code = f'fill_input_id({id_part}, "Please fill the text here.")'
935
+
936
  action_code = action_code.replace("`", "").strip()
937
 
938
  return observation_result, reasoning_result, action_code
 
979
  r"""Get the final answer based on the task prompt and current browser state.
980
  It is used when the agent thinks that the task can be completed without any further action, and answer can be directly found in the current viewport.
981
  """
 
 
982
 
983
  prompt = f"""
984
  We are solving a complex web task which needs multi-step browser interaction. After the multi-step observation, reasoning and acting with web browser, we think that the task is currently solved.
 
1094
  return False, replanned_schema
1095
 
1096
 
1097
+ @dependencies_required("playwright")
1098
  def browser_simulation(self,
1099
  task_prompt: str,
1100
  start_url: str,
1101
+ round_limit: int = 12
1102
  ) -> str:
1103
  r"""A powerful toolkit which can simulate the browser interaction to solve the task which needs multi-step actions.
1104
 
1105
  Args:
1106
  task_prompt (str): The task prompt to solve.
1107
  start_url (str): The start URL to visit.
1108
+ round_limit (int): The round limit to solve the task (default: 12).
1109
 
1110
  Returns:
1111
  str: The simulation result to the task.
1112
  """
1113
+
 
 
1114
  self._reset()
1115
  task_completed = False
 
1116
  detailed_plan = self._task_planning(task_prompt, start_url)
1117
  logger.debug(f"Detailed plan: {detailed_plan}")
1118
 
1119
  self.browser.init()
1120
  self.browser.visit_page(start_url)
1121
 
1122
+ for i in range(round_limit):
1123
  observation, reasoning, action_code = self._observe(task_prompt, detailed_plan)
1124
  logger.debug(f"Observation: {observation}")
1125
  logger.debug(f"Reasoning: {reasoning}")
1126
  logger.debug(f"Action code: {action_code}")
 
1127
 
1128
  if "stop" in action_code:
1129
  task_completed = True