Spaces:

rote1
/

IAGO

Sleeping

App Files Community

Sun Tao committed on Mar 9

Commit

fbacfe2

1 Parent(s): a093f82

Update web_toolkit.py

Browse files

Files changed (1) hide show

owl/camel/toolkits/web_toolkit.py +50 -35

owl/camel/toolkits/web_toolkit.py CHANGED Viewed

@@ -5,7 +5,6 @@ from playwright._impl._errors import TimeoutError
 from loguru import logger
 from typing import Any, Dict, List, TypedDict, Union, BinaryIO
 from PIL import Image, ImageDraw, ImageFont
-from firecrawl import FirecrawlApp
 from html2text import html2text
 from retry import retry
 from copy import deepcopy
@@ -16,6 +15,7 @@ from camel.messages import BaseMessage
 from camel.agents import ChatAgent
 from camel.models import ModelFactory, BaseModelBackend
 from camel.types import ModelType, ModelPlatformType
 import io
 import random
@@ -49,7 +49,8 @@ AVAILABLE_ACTIONS_PROMPT = """
 ACTION_WITH_FEEDBACK_LIST = [
     'ask_question_about_video',
-    'download_file_id'
 ]
@@ -127,18 +128,19 @@ def _parse_json_output(text: str) -> Dict[str, Any]:
             fixed_text = re.sub(r'`([^`]*)`', r'"\1"', text)
             return json.loads(fixed_text)
         except json.JSONDecodeError:
             result = {}
             try:
                 bool_pattern = r'"(\w+)"\s*:\s*(true|false)'
                 for match in re.finditer(bool_pattern, text, re.IGNORECASE):
                     key, value = match.groups()
                     result[key] = value.lower() == "true"
                 str_pattern = r'"(\w+)"\s*:\s*"([^"]*)"'
                 for match in re.finditer(str_pattern, text):
                     key, value = match.groups()
                     result[key] = value
                 num_pattern = r'"(\w+)"\s*:\s*(-?\d+(?:\.\d+)?)'
                 for match in re.finditer(num_pattern, text):
                     key, value = match.groups()
@@ -221,7 +223,6 @@ def add_set_of_mark(
     if isinstance(screenshot, bytes):
         screenshot = io.BytesIO(screenshot)
-    # TODO: Not sure why this cast was needed, but by this point screenshot is a binary file-like object
     image = Image.open(cast(BinaryIO, screenshot))
     comp, visible_rects, rects_above, rects_below = _add_set_of_mark(image, ROIs)
     image.close()
@@ -300,8 +301,7 @@ def _color(identifier: int) -> Tuple[int, int, int, int]:
 class BaseBrowser:
     def __init__(self,
                  headless=True,
-                 cache_dir: Optional[str] = None,
-                 page_script_path: Optional[str] = None):
         r"""Initialize the WebBrowserToolkit instance.
         Args:
@@ -326,25 +326,23 @@ class BaseBrowser:
         self.page_history = []           # stores the history of visited pages
         # set the cache directory
         self.cache_dir = "tmp/"
         os.makedirs(self.cache_dir, exist_ok=True)
         if cache_dir is not None:
             self.cache_dir = cache_dir
         # load the page script
-        if page_script_path is None:
-            abs_dir_path = os.path.dirname(os.path.abspath(__file__))
-            page_script_path = os.path.join(abs_dir_path, "page_script.js")
         try:
             with open(page_script_path, "r", encoding='utf-8') as f:
                 self.page_script = f.read()
             f.close()
         except FileNotFoundError:
-            logger.warning(f"Page script file not found: {page_script_path}")
     def init(self):
         r"""Initialize the browser."""
@@ -367,6 +365,7 @@ class BaseBrowser:
         # self.page.wait_for_load_state("networkidle", timeout=timeout_ms)
         # self.page.wait_for_load_state("domcontentloaded", timeout=timeout_ms)
         time.sleep(2)
     def click_blank_area(self):
         r"""Click a blank area of the page to unfocus the current element."""
@@ -531,7 +530,7 @@ class BaseBrowser:
                 comp.save(f, "PNG")
             f.close()
-        return comp, file_path
     def scroll_up(self) -> None:
@@ -551,13 +550,11 @@ class BaseBrowser:
             identifier = str(identifier)
         target = self.page.locator(f"[__elementId='{identifier}']")
-        # See if it exists
         try:
            target.wait_for(timeout=5000)
         except TimeoutError:
             raise ValueError("No such element.") from None
-        # Click it
         target.scroll_into_view_if_needed()
         new_page = None
@@ -571,15 +568,15 @@ class BaseBrowser:
                 if new_page:
                     self.page_history.append(deepcopy(self.page.url))
                     self.page = new_page
         except PlaywrightError:
             pass
-        # new_page = self
         self._wait_for_load()
     def extract_url_content(self):
         r"""Extract the content of the current page."""
-        # TODO: update it using firecrawl
         content = self.page.content()
         return content
@@ -757,7 +754,6 @@ class BaseBrowser:
             }
         """)
     @retry(requests.RequestException)
     def get_webpage_content(self) -> str:
         self._wait_for_load()
@@ -767,22 +763,32 @@ class BaseBrowser:
         return markdown_content
 class WebToolkit(BaseToolkit):
     def __init__(self,
-                 headless=True,
                  cache_dir: Optional[str] = None,
-                 page_script_path: Optional[str] = None,
                  history_window: int = 5,
                  web_agent_model: Optional[BaseModelBackend] = None,
                  planning_agent_model: Optional[BaseModelBackend] = None,
-                 output_language: str = "en"
                  ):
         self.browser = BaseBrowser(
             headless=headless,
-            cache_dir=cache_dir,
-            page_script_path=page_script_path
             )
         self.history_window = history_window
@@ -791,7 +797,6 @@ class WebToolkit(BaseToolkit):
         self.output_language = output_language
         self.history = []
-        # self.search_toolkit = SearchToolkit()
         self.web_agent, self.planning_agent = self._initialize_agent()
@@ -915,6 +920,19 @@ Here are some tips for you:
         observation_result: str = resp_dict.get("observation", "")
         reasoning_result: str = resp_dict.get("reasoning", "")
         action_code: str = resp_dict.get("action_code", "")
         action_code = action_code.replace("`", "").strip()
         return observation_result, reasoning_result, action_code
@@ -961,8 +979,6 @@ Here are some tips for you:
         r"""Get the final answer based on the task prompt and current browser state.
         It is used when the agent thinks that the task can be completed without any further action, and answer can be directly found in the current viewport.
         """
-        # screenshot, _ = self.browser.get_screenshot()
-        # img = _reload_image(screenshot)
         prompt = f"""
 We are solving a complex web task which needs multi-step browser interaction. After the multi-step observation, reasoning and acting with web browser, we think that the task is currently solved.
@@ -1078,37 +1094,36 @@ Your output should be in json format, including the following fields:
             return False, replanned_schema
     def browser_simulation(self,
                            task_prompt: str,
                            start_url: str,
                            ) -> str:
         r"""A powerful toolkit which can simulate the browser interaction to solve the task which needs multi-step actions.
         Args:
             task_prompt (str): The task prompt to solve.
             start_url (str): The start URL to visit.
         Returns:
             str: The simulation result to the task.
         """
-        ROUND_LIMIT = 12
         self._reset()
         task_completed = False
         detailed_plan = self._task_planning(task_prompt, start_url)
         logger.debug(f"Detailed plan: {detailed_plan}")
         self.browser.init()
         self.browser.visit_page(start_url)
-        for i in range(ROUND_LIMIT):
             observation, reasoning, action_code = self._observe(task_prompt, detailed_plan)
             logger.debug(f"Observation: {observation}")
             logger.debug(f"Reasoning: {reasoning}")
             logger.debug(f"Action code: {action_code}")
-            # breakpoint()
             if "stop" in action_code:
                 task_completed = True

 from loguru import logger
 from typing import Any, Dict, List, TypedDict, Union, BinaryIO
 from PIL import Image, ImageDraw, ImageFont
 from html2text import html2text
 from retry import retry
 from copy import deepcopy
 from camel.agents import ChatAgent
 from camel.models import ModelFactory, BaseModelBackend
 from camel.types import ModelType, ModelPlatformType
+from camel.utils import dependencies_required
 import io
 import random
 ACTION_WITH_FEEDBACK_LIST = [
     'ask_question_about_video',
+    'download_file_id',
+    'find_text_on_page',
 ]
             fixed_text = re.sub(r'`([^`]*)`', r'"\1"', text)
             return json.loads(fixed_text)
         except json.JSONDecodeError:
+            # Try to extract key fields
             result = {}
             try:
                 bool_pattern = r'"(\w+)"\s*:\s*(true|false)'
                 for match in re.finditer(bool_pattern, text, re.IGNORECASE):
                     key, value = match.groups()
                     result[key] = value.lower() == "true"
                 str_pattern = r'"(\w+)"\s*:\s*"([^"]*)"'
                 for match in re.finditer(str_pattern, text):
                     key, value = match.groups()
                     result[key] = value
                 num_pattern = r'"(\w+)"\s*:\s*(-?\d+(?:\.\d+)?)'
                 for match in re.finditer(num_pattern, text):
                     key, value = match.groups()
     if isinstance(screenshot, bytes):
         screenshot = io.BytesIO(screenshot)
     image = Image.open(cast(BinaryIO, screenshot))
     comp, visible_rects, rects_above, rects_below = _add_set_of_mark(image, ROIs)
     image.close()
 class BaseBrowser:
     def __init__(self,
                  headless=True,
+                 cache_dir: Optional[str] = None):
         r"""Initialize the WebBrowserToolkit instance.
         Args:
         self.page_history = []           # stores the history of visited pages
         # set the cache directory
         self.cache_dir = "tmp/"
         os.makedirs(self.cache_dir, exist_ok=True)
         if cache_dir is not None:
             self.cache_dir = cache_dir
         # load the page script
+        abs_dir_path = os.path.dirname(os.path.abspath(__file__))
+        page_script_path = os.path.join(abs_dir_path, "page_script.js")
         try:
             with open(page_script_path, "r", encoding='utf-8') as f:
                 self.page_script = f.read()
             f.close()
         except FileNotFoundError:
+            raise FileNotFoundError(f"Page script file not found at path: {page_script_path}")
     def init(self):
         r"""Initialize the browser."""
         # self.page.wait_for_load_state("networkidle", timeout=timeout_ms)
         # self.page.wait_for_load_state("domcontentloaded", timeout=timeout_ms)
         time.sleep(2)
     def click_blank_area(self):
         r"""Click a blank area of the page to unfocus the current element."""
                 comp.save(f, "PNG")
             f.close()
+        return comp, file_path
     def scroll_up(self) -> None:
             identifier = str(identifier)
         target = self.page.locator(f"[__elementId='{identifier}']")
         try:
            target.wait_for(timeout=5000)
         except TimeoutError:
             raise ValueError("No such element.") from None
         target.scroll_into_view_if_needed()
         new_page = None
                 if new_page:
                     self.page_history.append(deepcopy(self.page.url))
                     self.page = new_page
         except PlaywrightError:
             pass
         self._wait_for_load()
     def extract_url_content(self):
         r"""Extract the content of the current page."""
         content = self.page.content()
         return content
             }
         """)
     @retry(requests.RequestException)
     def get_webpage_content(self) -> str:
         self._wait_for_load()
         return markdown_content
 class WebToolkit(BaseToolkit):
+    r"""A class for browsing the web and interacting with web pages.
+    This class provides methods for browsing the web and interacting with web pages.
+    """
     def __init__(self,
+                 headless: bool = True,
                  cache_dir: Optional[str] = None,
                  history_window: int = 5,
                  web_agent_model: Optional[BaseModelBackend] = None,
                  planning_agent_model: Optional[BaseModelBackend] = None,
+                 output_language: str = "en",
                  ):
+        r"""Initialize the WebToolkit instance.
+        Args:
+            headless (bool): Whether to run the browser in headless mode.
+            cache_dir (Union[str, None]): The directory to store cache files.
+            history_window (int): The window size for storing the history of actions.
+            web_agent_model (Optional[BaseModelBackend]): The model backend for the web agent.
+            planning_agent_model (Optional[BaseModelBackend]): The model backend for the planning agent.
+        """
         self.browser = BaseBrowser(
             headless=headless,
+            cache_dir=cache_dir
             )
         self.history_window = history_window
         self.output_language = output_language
         self.history = []
         self.web_agent, self.planning_agent = self._initialize_agent()
         observation_result: str = resp_dict.get("observation", "")
         reasoning_result: str = resp_dict.get("reasoning", "")
         action_code: str = resp_dict.get("action_code", "")
+        if action_code and "(" in action_code and ")" not in action_code:
+            action_match = re.search(r'"action_code"\s*:\s*[`"]([^`"]*\([^)]*\))[`"]', resp_content)
+            if action_match:
+                action_code = action_match.group(1)
+            else:
+                logger.warning(f"Incomplete action_code detected: {action_code}")
+                if action_code.startswith("fill_input_id("):
+                    parts = action_code.split(",", 1)
+                    if len(parts) > 1:
+                        id_part = parts[0].replace("fill_input_id(", "").strip()
+                        action_code = f'fill_input_id({id_part}, "Please fill the text here.")'
         action_code = action_code.replace("`", "").strip()
         return observation_result, reasoning_result, action_code
         r"""Get the final answer based on the task prompt and current browser state.
         It is used when the agent thinks that the task can be completed without any further action, and answer can be directly found in the current viewport.
         """
         prompt = f"""
 We are solving a complex web task which needs multi-step browser interaction. After the multi-step observation, reasoning and acting with web browser, we think that the task is currently solved.
             return False, replanned_schema
+    @dependencies_required("playwright")
     def browser_simulation(self,
                            task_prompt: str,
                            start_url: str,
+                           round_limit: int = 12
                            ) -> str:
         r"""A powerful toolkit which can simulate the browser interaction to solve the task which needs multi-step actions.
         Args:
             task_prompt (str): The task prompt to solve.
             start_url (str): The start URL to visit.
+            round_limit (int): The round limit to solve the task (default: 12).
         Returns:
             str: The simulation result to the task.
         """
         self._reset()
         task_completed = False
         detailed_plan = self._task_planning(task_prompt, start_url)
         logger.debug(f"Detailed plan: {detailed_plan}")
         self.browser.init()
         self.browser.visit_page(start_url)
+        for i in range(round_limit):
             observation, reasoning, action_code = self._observe(task_prompt, detailed_plan)
             logger.debug(f"Observation: {observation}")
             logger.debug(f"Reasoning: {reasoning}")
             logger.debug(f"Action code: {action_code}")
             if "stop" in action_code:
                 task_completed = True