Spaces:

rote1
/

IAGO

Sleeping

App Files Files Community

Yuhang Zhou commited on Mar 9

Commit

b2b3b73

2 Parent(s): 9d6a9c8 fbacfe2

Update web_toolkit.py and add run_qwen_mini_zh.py (#105)

Browse files

Files changed (2) hide show

owl/camel/toolkits/web_toolkit.py +105 -48
owl/run_qwen_mini_zh.py +96 -0

owl/camel/toolkits/web_toolkit.py CHANGED Viewed

@@ -5,7 +5,6 @@ from playwright._impl._errors import TimeoutError
 from loguru import logger
 from typing import Any, Dict, List, TypedDict, Union, BinaryIO
 from PIL import Image, ImageDraw, ImageFont
-from firecrawl import FirecrawlApp
 from html2text import html2text
 from retry import retry
 from copy import deepcopy
@@ -16,6 +15,7 @@ from camel.messages import BaseMessage
 from camel.agents import ChatAgent
 from camel.models import ModelFactory, BaseModelBackend
 from camel.types import ModelType, ModelPlatformType
 import io
 import random
@@ -49,7 +49,8 @@ AVAILABLE_ACTIONS_PROMPT = """
 ACTION_WITH_FEEDBACK_LIST = [
     'ask_question_about_video',
-    'download_file_id'
 ]
@@ -106,26 +107,61 @@ def _get_bool(d: Any, k: str) -> bool:
 def _parse_json_output(text: str) -> Dict[str, Any]:
-    # judge if text is markdown format (```json ````)
-    if "```json" in text and "```" in text:
-        # Extract content between ```json and the last ```
-        start_idx = text.find("```json") + len("```json")
-        end_idx = text.rfind("```")
-        if start_idx > -1 and end_idx > start_idx:
-            text = text[start_idx:end_idx].strip()
     try:
         return json.loads(text)
     except json.JSONDecodeError:
-        logger.warning(f"Failed to parse JSON output: {text}")
-        # Try to clean the text further and attempt parsing again
         try:
-            # Remove any extra whitespace or control characters
-            cleaned_text = text.strip()
-            return json.loads(cleaned_text)
         except json.JSONDecodeError:
-            logger.error(f"Failed to parse JSON even after cleaning: {text}")
-            return {}
 def _reload_image(image: Image.Image):
@@ -187,7 +223,6 @@ def add_set_of_mark(
     if isinstance(screenshot, bytes):
         screenshot = io.BytesIO(screenshot)
-    # TODO: Not sure why this cast was needed, but by this point screenshot is a binary file-like object
     image = Image.open(cast(BinaryIO, screenshot))
     comp, visible_rects, rects_above, rects_below = _add_set_of_mark(image, ROIs)
     image.close()
@@ -266,8 +301,7 @@ def _color(identifier: int) -> Tuple[int, int, int, int]:
 class BaseBrowser:
     def __init__(self,
                  headless=True,
-                 cache_dir: Optional[str] = None,
-                 page_script_path: Optional[str] = None):
         r"""Initialize the WebBrowserToolkit instance.
         Args:
@@ -292,25 +326,23 @@ class BaseBrowser:
         self.page_history = []           # stores the history of visited pages
         # set the cache directory
         self.cache_dir = "tmp/"
         os.makedirs(self.cache_dir, exist_ok=True)
         if cache_dir is not None:
             self.cache_dir = cache_dir
         # load the page script
-        if page_script_path is None:
-            abs_dir_path = os.path.dirname(os.path.abspath(__file__))
-            page_script_path = os.path.join(abs_dir_path, "page_script.js")
         try:
             with open(page_script_path, "r", encoding='utf-8') as f:
                 self.page_script = f.read()
             f.close()
         except FileNotFoundError:
-            logger.warning(f"Page script file not found: {page_script_path}")
     def init(self):
         r"""Initialize the browser."""
@@ -333,6 +365,7 @@ class BaseBrowser:
         # self.page.wait_for_load_state("networkidle", timeout=timeout_ms)
         # self.page.wait_for_load_state("domcontentloaded", timeout=timeout_ms)
         time.sleep(2)
     def click_blank_area(self):
         r"""Click a blank area of the page to unfocus the current element."""
@@ -379,7 +412,9 @@ class BaseBrowser:
         file_path = None
         if save_image:
             # get url name to form a file name
-            url_name = self.page_url.split("/")[-1].replace(".", "_").replace(":", "_")
             # get formatted time: mmddhhmmss
             timestamp = datetime.datetime.now().strftime("%m%d%H%M%S")
@@ -486,7 +521,9 @@ class BaseBrowser:
         file_path = None
         comp, visible_rects, rects_above, rects_below = add_set_of_mark(screenshot, rects)
         if save_image:
-            url_name = self.page_url.split("/")[-1].replace(".", "_").replace(":", "_")
             timestamp = datetime.datetime.now().strftime("%m%d%H%M%S")
             file_path = os.path.join(self.cache_dir, f"{url_name}_{timestamp}.png")
             with open(file_path, "wb") as f:
@@ -513,13 +550,11 @@ class BaseBrowser:
             identifier = str(identifier)
         target = self.page.locator(f"[__elementId='{identifier}']")
-        # See if it exists
         try:
            target.wait_for(timeout=5000)
         except TimeoutError:
             raise ValueError("No such element.") from None
-        # Click it
         target.scroll_into_view_if_needed()
         new_page = None
@@ -533,15 +568,15 @@ class BaseBrowser:
                 if new_page:
                     self.page_history.append(deepcopy(self.page.url))
                     self.page = new_page
         except PlaywrightError:
             pass
-        # new_page = self
         self._wait_for_load()
     def extract_url_content(self):
         r"""Extract the content of the current page."""
-        # TODO: update it using firecrawl
         content = self.page.content()
         return content
@@ -719,7 +754,6 @@ class BaseBrowser:
             }
         """)
     @retry(requests.RequestException)
     def get_webpage_content(self) -> str:
         self._wait_for_load()
@@ -729,29 +763,40 @@ class BaseBrowser:
         return markdown_content
 class WebToolkit(BaseToolkit):
     def __init__(self,
-                 headless=True,
                  cache_dir: Optional[str] = None,
-                 page_script_path: Optional[str] = None,
                  history_window: int = 5,
                  web_agent_model: Optional[BaseModelBackend] = None,
                  planning_agent_model: Optional[BaseModelBackend] = None,
                  ):
         self.browser = BaseBrowser(
             headless=headless,
-            cache_dir=cache_dir,
-            page_script_path=page_script_path
             )
         self.history_window = history_window
         self.web_agent_model = web_agent_model
         self.planning_agent_model = planning_agent_model
         self.history = []
-        # self.search_toolkit = SearchToolkit()
         self.web_agent, self.planning_agent = self._initialize_agent()
@@ -789,6 +834,7 @@ Given a high-level task, you can leverage predefined browser tools to help users
         web_agent = ChatAgent(
             system_message=system_prompt,
             model=web_agent_model,
             )
         planning_system_prompt = """
@@ -797,7 +843,8 @@ You are a helpful planning agent that can assist users in planning complex tasks
         planning_agent = ChatAgent(
             system_message=planning_system_prompt,
-            model=planning_model
         )
         return web_agent, planning_agent
@@ -873,6 +920,19 @@ Here are some tips for you:
         observation_result: str = resp_dict.get("observation", "")
         reasoning_result: str = resp_dict.get("reasoning", "")
         action_code: str = resp_dict.get("action_code", "")
         action_code = action_code.replace("`", "").strip()
         return observation_result, reasoning_result, action_code
@@ -919,8 +979,6 @@ Here are some tips for you:
         r"""Get the final answer based on the task prompt and current browser state.
         It is used when the agent thinks that the task can be completed without any further action, and answer can be directly found in the current viewport.
         """
-        # screenshot, _ = self.browser.get_screenshot()
-        # img = _reload_image(screenshot)
         prompt = f"""
 We are solving a complex web task which needs multi-step browser interaction. After the multi-step observation, reasoning and acting with web browser, we think that the task is currently solved.
@@ -1036,37 +1094,36 @@ Your output should be in json format, including the following fields:
             return False, replanned_schema
     def browser_simulation(self,
                            task_prompt: str,
                            start_url: str,
                            ) -> str:
         r"""A powerful toolkit which can simulate the browser interaction to solve the task which needs multi-step actions.
         Args:
             task_prompt (str): The task prompt to solve.
             start_url (str): The start URL to visit.
         Returns:
             str: The simulation result to the task.
         """
-        ROUND_LIMIT = 12
         self._reset()
         task_completed = False
         detailed_plan = self._task_planning(task_prompt, start_url)
         logger.debug(f"Detailed plan: {detailed_plan}")
         self.browser.init()
         self.browser.visit_page(start_url)
-        for i in range(ROUND_LIMIT):
             observation, reasoning, action_code = self._observe(task_prompt, detailed_plan)
             logger.debug(f"Observation: {observation}")
             logger.debug(f"Reasoning: {reasoning}")
             logger.debug(f"Action code: {action_code}")
-            # breakpoint()
             if "stop" in action_code:
                 task_completed = True

 from loguru import logger
 from typing import Any, Dict, List, TypedDict, Union, BinaryIO
 from PIL import Image, ImageDraw, ImageFont
 from html2text import html2text
 from retry import retry
 from copy import deepcopy
 from camel.agents import ChatAgent
 from camel.models import ModelFactory, BaseModelBackend
 from camel.types import ModelType, ModelPlatformType
+from camel.utils import dependencies_required
 import io
 import random
 ACTION_WITH_FEEDBACK_LIST = [
     'ask_question_about_video',
+    'download_file_id',
+    'find_text_on_page',
 ]
 def _parse_json_output(text: str) -> Dict[str, Any]:
+    """Extract JSON output from a string."""
+    markdown_pattern = r'```(?:json)?\s*(.*?)\s*```'
+    markdown_match = re.search(markdown_pattern, text, re.DOTALL)
+    if markdown_match:
+        text = markdown_match.group(1).strip()
+    triple_quotes_pattern = r'"""(?:json)?\s*(.*?)\s*"""'
+    triple_quotes_match = re.search(triple_quotes_pattern, text, re.DOTALL)
+    if triple_quotes_match:
+        text = triple_quotes_match.group(1).strip()
+    text = text.replace("`", '"')
     try:
         return json.loads(text)
     except json.JSONDecodeError:
         try:
+            fixed_text = re.sub(r'`([^`]*)`', r'"\1"', text)
+            return json.loads(fixed_text)
         except json.JSONDecodeError:
+            # Try to extract key fields
+            result = {}
+            try:
+                bool_pattern = r'"(\w+)"\s*:\s*(true|false)'
+                for match in re.finditer(bool_pattern, text, re.IGNORECASE):
+                    key, value = match.groups()
+                    result[key] = value.lower() == "true"
+                str_pattern = r'"(\w+)"\s*:\s*"([^"]*)"'
+                for match in re.finditer(str_pattern, text):
+                    key, value = match.groups()
+                    result[key] = value
+                num_pattern = r'"(\w+)"\s*:\s*(-?\d+(?:\.\d+)?)'
+                for match in re.finditer(num_pattern, text):
+                    key, value = match.groups()
+                    try:
+                        result[key] = int(value)
+                    except ValueError:
+                        result[key] = float(value)
+                empty_str_pattern = r'"(\w+)"\s*:\s*""'
+                for match in re.finditer(empty_str_pattern, text):
+                    key = match.group(1)
+                    result[key] = ""
+                if result:
+                    return result
+                logger.warning(f"Failed to parse JSON output: {text}")
+                return {}
+            except Exception as e:
+                logger.warning(f"Error while extracting fields from JSON: {e}")
+                return {}
 def _reload_image(image: Image.Image):
     if isinstance(screenshot, bytes):
         screenshot = io.BytesIO(screenshot)
     image = Image.open(cast(BinaryIO, screenshot))
     comp, visible_rects, rects_above, rects_below = _add_set_of_mark(image, ROIs)
     image.close()
 class BaseBrowser:
     def __init__(self,
                  headless=True,
+                 cache_dir: Optional[str] = None):
         r"""Initialize the WebBrowserToolkit instance.
         Args:
         self.page_history = []           # stores the history of visited pages
         # set the cache directory
         self.cache_dir = "tmp/"
         os.makedirs(self.cache_dir, exist_ok=True)
         if cache_dir is not None:
             self.cache_dir = cache_dir
         # load the page script
+        abs_dir_path = os.path.dirname(os.path.abspath(__file__))
+        page_script_path = os.path.join(abs_dir_path, "page_script.js")
         try:
             with open(page_script_path, "r", encoding='utf-8') as f:
                 self.page_script = f.read()
             f.close()
         except FileNotFoundError:
+            raise FileNotFoundError(f"Page script file not found at path: {page_script_path}")
     def init(self):
         r"""Initialize the browser."""
         # self.page.wait_for_load_state("networkidle", timeout=timeout_ms)
         # self.page.wait_for_load_state("domcontentloaded", timeout=timeout_ms)
         time.sleep(2)
     def click_blank_area(self):
         r"""Click a blank area of the page to unfocus the current element."""
         file_path = None
         if save_image:
             # get url name to form a file name
+            url_name = self.page_url.split("/")[-1]
+            for char in ['\\', '/', ':', '*', '?', '"', '<', '>', '|', '.']:
+                url_name = url_name.replace(char, "_")
             # get formatted time: mmddhhmmss
             timestamp = datetime.datetime.now().strftime("%m%d%H%M%S")
         file_path = None
         comp, visible_rects, rects_above, rects_below = add_set_of_mark(screenshot, rects)
         if save_image:
+            url_name = self.page_url.split("/")[-1]
+            for char in ['\\', '/', ':', '*', '?', '"', '<', '>', '|', '.']:
+                url_name = url_name.replace(char, "_")
             timestamp = datetime.datetime.now().strftime("%m%d%H%M%S")
             file_path = os.path.join(self.cache_dir, f"{url_name}_{timestamp}.png")
             with open(file_path, "wb") as f:
             identifier = str(identifier)
         target = self.page.locator(f"[__elementId='{identifier}']")
         try:
            target.wait_for(timeout=5000)
         except TimeoutError:
             raise ValueError("No such element.") from None
         target.scroll_into_view_if_needed()
         new_page = None
                 if new_page:
                     self.page_history.append(deepcopy(self.page.url))
                     self.page = new_page
         except PlaywrightError:
             pass
         self._wait_for_load()
     def extract_url_content(self):
         r"""Extract the content of the current page."""
         content = self.page.content()
         return content
             }
         """)
     @retry(requests.RequestException)
     def get_webpage_content(self) -> str:
         self._wait_for_load()
         return markdown_content
 class WebToolkit(BaseToolkit):
+    r"""A class for browsing the web and interacting with web pages.
+    This class provides methods for browsing the web and interacting with web pages.
+    """
     def __init__(self,
+                 headless: bool = True,
                  cache_dir: Optional[str] = None,
                  history_window: int = 5,
                  web_agent_model: Optional[BaseModelBackend] = None,
                  planning_agent_model: Optional[BaseModelBackend] = None,
+                 output_language: str = "en",
                  ):
+        r"""Initialize the WebToolkit instance.
+        Args:
+            headless (bool): Whether to run the browser in headless mode.
+            cache_dir (Union[str, None]): The directory to store cache files.
+            history_window (int): The window size for storing the history of actions.
+            web_agent_model (Optional[BaseModelBackend]): The model backend for the web agent.
+            planning_agent_model (Optional[BaseModelBackend]): The model backend for the planning agent.
+        """
         self.browser = BaseBrowser(
             headless=headless,
+            cache_dir=cache_dir
             )
         self.history_window = history_window
         self.web_agent_model = web_agent_model
         self.planning_agent_model = planning_agent_model
+        self.output_language = output_language
         self.history = []
         self.web_agent, self.planning_agent = self._initialize_agent()
         web_agent = ChatAgent(
             system_message=system_prompt,
             model=web_agent_model,
+            output_language=self.output_language
             )
         planning_system_prompt = """
         planning_agent = ChatAgent(
             system_message=planning_system_prompt,
+            model=planning_model,
+            output_language=self.output_language
         )
         return web_agent, planning_agent
         observation_result: str = resp_dict.get("observation", "")
         reasoning_result: str = resp_dict.get("reasoning", "")
         action_code: str = resp_dict.get("action_code", "")
+        if action_code and "(" in action_code and ")" not in action_code:
+            action_match = re.search(r'"action_code"\s*:\s*[`"]([^`"]*\([^)]*\))[`"]', resp_content)
+            if action_match:
+                action_code = action_match.group(1)
+            else:
+                logger.warning(f"Incomplete action_code detected: {action_code}")
+                if action_code.startswith("fill_input_id("):
+                    parts = action_code.split(",", 1)
+                    if len(parts) > 1:
+                        id_part = parts[0].replace("fill_input_id(", "").strip()
+                        action_code = f'fill_input_id({id_part}, "Please fill the text here.")'
         action_code = action_code.replace("`", "").strip()
         return observation_result, reasoning_result, action_code
         r"""Get the final answer based on the task prompt and current browser state.
         It is used when the agent thinks that the task can be completed without any further action, and answer can be directly found in the current viewport.
         """
         prompt = f"""
 We are solving a complex web task which needs multi-step browser interaction. After the multi-step observation, reasoning and acting with web browser, we think that the task is currently solved.
             return False, replanned_schema
+    @dependencies_required("playwright")
     def browser_simulation(self,
                            task_prompt: str,
                            start_url: str,
+                           round_limit: int = 12
                            ) -> str:
         r"""A powerful toolkit which can simulate the browser interaction to solve the task which needs multi-step actions.
         Args:
             task_prompt (str): The task prompt to solve.
             start_url (str): The start URL to visit.
+            round_limit (int): The round limit to solve the task (default: 12).
         Returns:
             str: The simulation result to the task.
         """
         self._reset()
         task_completed = False
         detailed_plan = self._task_planning(task_prompt, start_url)
         logger.debug(f"Detailed plan: {detailed_plan}")
         self.browser.init()
         self.browser.visit_page(start_url)
+        for i in range(round_limit):
             observation, reasoning, action_code = self._observe(task_prompt, detailed_plan)
             logger.debug(f"Observation: {observation}")
             logger.debug(f"Reasoning: {reasoning}")
             logger.debug(f"Action code: {action_code}")
             if "stop" in action_code:
                 task_completed = True

owl/run_qwen_mini_zh.py ADDED Viewed

	@@ -0,0 +1,96 @@

+from dotenv import load_dotenv
+load_dotenv()
+from camel.models import ModelFactory
+from camel.toolkits import WebToolkit,SearchToolkit,FunctionTool
+from camel.types import ModelPlatformType,ModelType
+from loguru import logger
+from utils import OwlRolePlaying, run_society
+import os
+model_scope_api_key = os.getenv("MODELSCOPE_API_KEY")
+def construct_society(question: str) -> OwlRolePlaying:
+    r"""Construct the society based on the question."""
+    user_role_name = "user"
+    assistant_role_name = "assistant"
+    user_model = ModelFactory.create(
+             model_platform=ModelPlatformType.QWEN,
+             model_type="qwen-max",
+             model_config_dict={"temperature": 0},
+         )
+    assistant_model = ModelFactory.create(
+             model_platform=ModelPlatformType.QWEN,
+             model_type="qwen-max",
+             model_config_dict={"temperature": 0},
+         )
+    search_model = ModelFactory.create(
+             model_platform=ModelPlatformType.QWEN,
+             model_type="qwen-max",
+             model_config_dict={"temperature": 0},
+         )
+    planning_model = ModelFactory.create(
+             model_platform=ModelPlatformType.QWEN,
+             model_type="qwen-max",
+             model_config_dict={"temperature": 0},
+         )
+    web_model = ModelFactory.create(
+             model_platform=ModelPlatformType.QWEN,
+             model_type="qwen-vl-plus-latest",
+             model_config_dict={"temperature": 0},
+         )
+    tools_list = [
+        *WebToolkit(
+            headless=False,
+            web_agent_model=web_model,
+            planning_agent_model=planning_model,
+            output_language='中文'
+        ).get_tools(),
+        FunctionTool(SearchToolkit(model=search_model).search_duckduckgo),
+    ]
+    user_role_name = 'user'
+    user_agent_kwargs = dict(model=user_model)
+    assistant_role_name = 'assistant'
+    assistant_agent_kwargs = dict(model=assistant_model,
+    tools=tools_list)
+    task_kwargs = {
+        'task_prompt': question,
+        'with_task_specify': False,
+        'output_language': '中文',
+    }
+    society = OwlRolePlaying(
+        **task_kwargs,
+        user_role_name=user_role_name,
+        user_agent_kwargs=user_agent_kwargs,
+        assistant_role_name=assistant_role_name,
+        assistant_agent_kwargs=assistant_agent_kwargs,
+    )
+    return society
+# Example case
+question = "打开小红书上浏览推荐栏目下的前三个笔记内容，不要登陆，之后给我一个总结报告"
+society = construct_society(question)
+answer, chat_history, token_count = run_society(society)
+logger.success(f"Answer: {answer}")