Sun Tao
committed on
Commit
·
fbacfe2
1
Parent(s):
a093f82
Update web_toolkit.py
Browse files
owl/camel/toolkits/web_toolkit.py
CHANGED
@@ -5,7 +5,6 @@ from playwright._impl._errors import TimeoutError
|
|
5 |
from loguru import logger
|
6 |
from typing import Any, Dict, List, TypedDict, Union, BinaryIO
|
7 |
from PIL import Image, ImageDraw, ImageFont
|
8 |
-
from firecrawl import FirecrawlApp
|
9 |
from html2text import html2text
|
10 |
from retry import retry
|
11 |
from copy import deepcopy
|
@@ -16,6 +15,7 @@ from camel.messages import BaseMessage
|
|
16 |
from camel.agents import ChatAgent
|
17 |
from camel.models import ModelFactory, BaseModelBackend
|
18 |
from camel.types import ModelType, ModelPlatformType
|
|
|
19 |
|
20 |
import io
|
21 |
import random
|
@@ -49,7 +49,8 @@ AVAILABLE_ACTIONS_PROMPT = """
|
|
49 |
|
50 |
ACTION_WITH_FEEDBACK_LIST = [
|
51 |
'ask_question_about_video',
|
52 |
-
'download_file_id'
|
|
|
53 |
]
|
54 |
|
55 |
|
@@ -127,18 +128,19 @@ def _parse_json_output(text: str) -> Dict[str, Any]:
|
|
127 |
fixed_text = re.sub(r'`([^`]*)`', r'"\1"', text)
|
128 |
return json.loads(fixed_text)
|
129 |
except json.JSONDecodeError:
|
|
|
130 |
result = {}
|
131 |
try:
|
132 |
bool_pattern = r'"(\w+)"\s*:\s*(true|false)'
|
133 |
for match in re.finditer(bool_pattern, text, re.IGNORECASE):
|
134 |
key, value = match.groups()
|
135 |
result[key] = value.lower() == "true"
|
136 |
-
|
137 |
str_pattern = r'"(\w+)"\s*:\s*"([^"]*)"'
|
138 |
for match in re.finditer(str_pattern, text):
|
139 |
key, value = match.groups()
|
140 |
result[key] = value
|
141 |
-
|
142 |
num_pattern = r'"(\w+)"\s*:\s*(-?\d+(?:\.\d+)?)'
|
143 |
for match in re.finditer(num_pattern, text):
|
144 |
key, value = match.groups()
|
@@ -221,7 +223,6 @@ def add_set_of_mark(
|
|
221 |
if isinstance(screenshot, bytes):
|
222 |
screenshot = io.BytesIO(screenshot)
|
223 |
|
224 |
-
# TODO: Not sure why this cast was needed, but by this point screenshot is a binary file-like object
|
225 |
image = Image.open(cast(BinaryIO, screenshot))
|
226 |
comp, visible_rects, rects_above, rects_below = _add_set_of_mark(image, ROIs)
|
227 |
image.close()
|
@@ -300,8 +301,7 @@ def _color(identifier: int) -> Tuple[int, int, int, int]:
|
|
300 |
class BaseBrowser:
|
301 |
def __init__(self,
|
302 |
headless=True,
|
303 |
-
cache_dir: Optional[str] = None
|
304 |
-
page_script_path: Optional[str] = None):
|
305 |
r"""Initialize the WebBrowserToolkit instance.
|
306 |
|
307 |
Args:
|
@@ -326,25 +326,23 @@ class BaseBrowser:
|
|
326 |
|
327 |
self.page_history = [] # stores the history of visited pages
|
328 |
|
329 |
-
|
330 |
# set the cache directory
|
331 |
self.cache_dir = "tmp/"
|
332 |
os.makedirs(self.cache_dir, exist_ok=True)
|
333 |
if cache_dir is not None:
|
334 |
self.cache_dir = cache_dir
|
335 |
-
|
336 |
# load the page script
|
337 |
-
|
338 |
-
|
339 |
-
page_script_path = os.path.join(abs_dir_path, "page_script.js")
|
340 |
|
341 |
try:
|
342 |
with open(page_script_path, "r", encoding='utf-8') as f:
|
343 |
self.page_script = f.read()
|
344 |
f.close()
|
345 |
except FileNotFoundError:
|
346 |
-
|
347 |
-
|
348 |
|
349 |
def init(self):
|
350 |
r"""Initialize the browser."""
|
@@ -367,6 +365,7 @@ class BaseBrowser:
|
|
367 |
# self.page.wait_for_load_state("networkidle", timeout=timeout_ms)
|
368 |
# self.page.wait_for_load_state("domcontentloaded", timeout=timeout_ms)
|
369 |
time.sleep(2)
|
|
|
370 |
|
371 |
def click_blank_area(self):
|
372 |
r"""Click a blank area of the page to unfocus the current element."""
|
@@ -531,7 +530,7 @@ class BaseBrowser:
|
|
531 |
comp.save(f, "PNG")
|
532 |
f.close()
|
533 |
|
534 |
-
return comp, file_path
|
535 |
|
536 |
|
537 |
def scroll_up(self) -> None:
|
@@ -551,13 +550,11 @@ class BaseBrowser:
|
|
551 |
identifier = str(identifier)
|
552 |
target = self.page.locator(f"[__elementId='{identifier}']")
|
553 |
|
554 |
-
# See if it exists
|
555 |
try:
|
556 |
target.wait_for(timeout=5000)
|
557 |
except TimeoutError:
|
558 |
raise ValueError("No such element.") from None
|
559 |
|
560 |
-
# Click it
|
561 |
target.scroll_into_view_if_needed()
|
562 |
|
563 |
new_page = None
|
@@ -571,15 +568,15 @@ class BaseBrowser:
|
|
571 |
if new_page:
|
572 |
self.page_history.append(deepcopy(self.page.url))
|
573 |
self.page = new_page
|
|
|
574 |
except PlaywrightError:
|
575 |
pass
|
576 |
-
|
577 |
self._wait_for_load()
|
578 |
|
579 |
|
580 |
def extract_url_content(self):
|
581 |
r"""Extract the content of the current page."""
|
582 |
-
# TODO: update it using firecrawl
|
583 |
content = self.page.content()
|
584 |
return content
|
585 |
|
@@ -757,7 +754,6 @@ class BaseBrowser:
|
|
757 |
}
|
758 |
""")
|
759 |
|
760 |
-
|
761 |
@retry(requests.RequestException)
|
762 |
def get_webpage_content(self) -> str:
|
763 |
self._wait_for_load()
|
@@ -767,22 +763,32 @@ class BaseBrowser:
|
|
767 |
return markdown_content
|
768 |
|
769 |
|
770 |
-
|
771 |
class WebToolkit(BaseToolkit):
|
|
|
|
|
|
|
|
|
772 |
def __init__(self,
|
773 |
-
headless=True,
|
774 |
cache_dir: Optional[str] = None,
|
775 |
-
page_script_path: Optional[str] = None,
|
776 |
history_window: int = 5,
|
777 |
web_agent_model: Optional[BaseModelBackend] = None,
|
778 |
planning_agent_model: Optional[BaseModelBackend] = None,
|
779 |
-
output_language: str = "en"
|
780 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
781 |
|
782 |
self.browser = BaseBrowser(
|
783 |
headless=headless,
|
784 |
-
cache_dir=cache_dir
|
785 |
-
page_script_path=page_script_path
|
786 |
)
|
787 |
|
788 |
self.history_window = history_window
|
@@ -791,7 +797,6 @@ class WebToolkit(BaseToolkit):
|
|
791 |
self.output_language = output_language
|
792 |
|
793 |
self.history = []
|
794 |
-
# self.search_toolkit = SearchToolkit()
|
795 |
self.web_agent, self.planning_agent = self._initialize_agent()
|
796 |
|
797 |
|
@@ -915,6 +920,19 @@ Here are some tips for you:
|
|
915 |
observation_result: str = resp_dict.get("observation", "")
|
916 |
reasoning_result: str = resp_dict.get("reasoning", "")
|
917 |
action_code: str = resp_dict.get("action_code", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
918 |
action_code = action_code.replace("`", "").strip()
|
919 |
|
920 |
return observation_result, reasoning_result, action_code
|
@@ -961,8 +979,6 @@ Here are some tips for you:
|
|
961 |
r"""Get the final answer based on the task prompt and current browser state.
|
962 |
It is used when the agent thinks that the task can be completed without any further action, and answer can be directly found in the current viewport.
|
963 |
"""
|
964 |
-
# screenshot, _ = self.browser.get_screenshot()
|
965 |
-
# img = _reload_image(screenshot)
|
966 |
|
967 |
prompt = f"""
|
968 |
We are solving a complex web task which needs multi-step browser interaction. After the multi-step observation, reasoning and acting with web browser, we think that the task is currently solved.
|
@@ -1078,37 +1094,36 @@ Your output should be in json format, including the following fields:
|
|
1078 |
return False, replanned_schema
|
1079 |
|
1080 |
|
|
|
1081 |
def browser_simulation(self,
|
1082 |
task_prompt: str,
|
1083 |
start_url: str,
|
|
|
1084 |
) -> str:
|
1085 |
r"""A powerful toolkit which can simulate the browser interaction to solve the task which needs multi-step actions.
|
1086 |
|
1087 |
Args:
|
1088 |
task_prompt (str): The task prompt to solve.
|
1089 |
start_url (str): The start URL to visit.
|
|
|
1090 |
|
1091 |
Returns:
|
1092 |
str: The simulation result to the task.
|
1093 |
"""
|
1094 |
-
|
1095 |
-
ROUND_LIMIT = 12
|
1096 |
-
|
1097 |
self._reset()
|
1098 |
task_completed = False
|
1099 |
-
|
1100 |
detailed_plan = self._task_planning(task_prompt, start_url)
|
1101 |
logger.debug(f"Detailed plan: {detailed_plan}")
|
1102 |
|
1103 |
self.browser.init()
|
1104 |
self.browser.visit_page(start_url)
|
1105 |
|
1106 |
-
for i in range(
|
1107 |
observation, reasoning, action_code = self._observe(task_prompt, detailed_plan)
|
1108 |
logger.debug(f"Observation: {observation}")
|
1109 |
logger.debug(f"Reasoning: {reasoning}")
|
1110 |
logger.debug(f"Action code: {action_code}")
|
1111 |
-
# breakpoint()
|
1112 |
|
1113 |
if "stop" in action_code:
|
1114 |
task_completed = True
|
|
|
5 |
from loguru import logger
|
6 |
from typing import Any, Dict, List, TypedDict, Union, BinaryIO
|
7 |
from PIL import Image, ImageDraw, ImageFont
|
|
|
8 |
from html2text import html2text
|
9 |
from retry import retry
|
10 |
from copy import deepcopy
|
|
|
15 |
from camel.agents import ChatAgent
|
16 |
from camel.models import ModelFactory, BaseModelBackend
|
17 |
from camel.types import ModelType, ModelPlatformType
|
18 |
+
from camel.utils import dependencies_required
|
19 |
|
20 |
import io
|
21 |
import random
|
|
|
49 |
|
50 |
ACTION_WITH_FEEDBACK_LIST = [
|
51 |
'ask_question_about_video',
|
52 |
+
'download_file_id',
|
53 |
+
'find_text_on_page',
|
54 |
]
|
55 |
|
56 |
|
|
|
128 |
fixed_text = re.sub(r'`([^`]*)`', r'"\1"', text)
|
129 |
return json.loads(fixed_text)
|
130 |
except json.JSONDecodeError:
|
131 |
+
# Try to extract key fields
|
132 |
result = {}
|
133 |
try:
|
134 |
bool_pattern = r'"(\w+)"\s*:\s*(true|false)'
|
135 |
for match in re.finditer(bool_pattern, text, re.IGNORECASE):
|
136 |
key, value = match.groups()
|
137 |
result[key] = value.lower() == "true"
|
138 |
+
|
139 |
str_pattern = r'"(\w+)"\s*:\s*"([^"]*)"'
|
140 |
for match in re.finditer(str_pattern, text):
|
141 |
key, value = match.groups()
|
142 |
result[key] = value
|
143 |
+
|
144 |
num_pattern = r'"(\w+)"\s*:\s*(-?\d+(?:\.\d+)?)'
|
145 |
for match in re.finditer(num_pattern, text):
|
146 |
key, value = match.groups()
|
|
|
223 |
if isinstance(screenshot, bytes):
|
224 |
screenshot = io.BytesIO(screenshot)
|
225 |
|
|
|
226 |
image = Image.open(cast(BinaryIO, screenshot))
|
227 |
comp, visible_rects, rects_above, rects_below = _add_set_of_mark(image, ROIs)
|
228 |
image.close()
|
|
|
301 |
class BaseBrowser:
|
302 |
def __init__(self,
|
303 |
headless=True,
|
304 |
+
cache_dir: Optional[str] = None):
|
|
|
305 |
r"""Initialize the WebBrowserToolkit instance.
|
306 |
|
307 |
Args:
|
|
|
326 |
|
327 |
self.page_history = [] # stores the history of visited pages
|
328 |
|
|
|
329 |
# set the cache directory
|
330 |
self.cache_dir = "tmp/"
|
331 |
os.makedirs(self.cache_dir, exist_ok=True)
|
332 |
if cache_dir is not None:
|
333 |
self.cache_dir = cache_dir
|
334 |
+
|
335 |
# load the page script
|
336 |
+
abs_dir_path = os.path.dirname(os.path.abspath(__file__))
|
337 |
+
page_script_path = os.path.join(abs_dir_path, "page_script.js")
|
|
|
338 |
|
339 |
try:
|
340 |
with open(page_script_path, "r", encoding='utf-8') as f:
|
341 |
self.page_script = f.read()
|
342 |
f.close()
|
343 |
except FileNotFoundError:
|
344 |
+
raise FileNotFoundError(f"Page script file not found at path: {page_script_path}")
|
345 |
+
|
346 |
|
347 |
def init(self):
|
348 |
r"""Initialize the browser."""
|
|
|
365 |
# self.page.wait_for_load_state("networkidle", timeout=timeout_ms)
|
366 |
# self.page.wait_for_load_state("domcontentloaded", timeout=timeout_ms)
|
367 |
time.sleep(2)
|
368 |
+
|
369 |
|
370 |
def click_blank_area(self):
|
371 |
r"""Click a blank area of the page to unfocus the current element."""
|
|
|
530 |
comp.save(f, "PNG")
|
531 |
f.close()
|
532 |
|
533 |
+
return comp, file_path
|
534 |
|
535 |
|
536 |
def scroll_up(self) -> None:
|
|
|
550 |
identifier = str(identifier)
|
551 |
target = self.page.locator(f"[__elementId='{identifier}']")
|
552 |
|
|
|
553 |
try:
|
554 |
target.wait_for(timeout=5000)
|
555 |
except TimeoutError:
|
556 |
raise ValueError("No such element.") from None
|
557 |
|
|
|
558 |
target.scroll_into_view_if_needed()
|
559 |
|
560 |
new_page = None
|
|
|
568 |
if new_page:
|
569 |
self.page_history.append(deepcopy(self.page.url))
|
570 |
self.page = new_page
|
571 |
+
|
572 |
except PlaywrightError:
|
573 |
pass
|
574 |
+
|
575 |
self._wait_for_load()
|
576 |
|
577 |
|
578 |
def extract_url_content(self):
|
579 |
r"""Extract the content of the current page."""
|
|
|
580 |
content = self.page.content()
|
581 |
return content
|
582 |
|
|
|
754 |
}
|
755 |
""")
|
756 |
|
|
|
757 |
@retry(requests.RequestException)
|
758 |
def get_webpage_content(self) -> str:
|
759 |
self._wait_for_load()
|
|
|
763 |
return markdown_content
|
764 |
|
765 |
|
|
|
766 |
class WebToolkit(BaseToolkit):
|
767 |
+
r"""A class for browsing the web and interacting with web pages.
|
768 |
+
|
769 |
+
This class provides methods for browsing the web and interacting with web pages.
|
770 |
+
"""
|
771 |
def __init__(self,
|
772 |
+
headless: bool = True,
|
773 |
cache_dir: Optional[str] = None,
|
|
|
774 |
history_window: int = 5,
|
775 |
web_agent_model: Optional[BaseModelBackend] = None,
|
776 |
planning_agent_model: Optional[BaseModelBackend] = None,
|
777 |
+
output_language: str = "en",
|
778 |
):
|
779 |
+
r"""Initialize the WebToolkit instance.
|
780 |
+
|
781 |
+
Args:
|
782 |
+
headless (bool): Whether to run the browser in headless mode.
|
783 |
+
cache_dir (Union[str, None]): The directory to store cache files.
|
784 |
+
history_window (int): The window size for storing the history of actions.
|
785 |
+
web_agent_model (Optional[BaseModelBackend]): The model backend for the web agent.
|
786 |
+
planning_agent_model (Optional[BaseModelBackend]): The model backend for the planning agent.
|
787 |
+
"""
|
788 |
|
789 |
self.browser = BaseBrowser(
|
790 |
headless=headless,
|
791 |
+
cache_dir=cache_dir
|
|
|
792 |
)
|
793 |
|
794 |
self.history_window = history_window
|
|
|
797 |
self.output_language = output_language
|
798 |
|
799 |
self.history = []
|
|
|
800 |
self.web_agent, self.planning_agent = self._initialize_agent()
|
801 |
|
802 |
|
|
|
920 |
observation_result: str = resp_dict.get("observation", "")
|
921 |
reasoning_result: str = resp_dict.get("reasoning", "")
|
922 |
action_code: str = resp_dict.get("action_code", "")
|
923 |
+
|
924 |
+
if action_code and "(" in action_code and ")" not in action_code:
|
925 |
+
action_match = re.search(r'"action_code"\s*:\s*[`"]([^`"]*\([^)]*\))[`"]', resp_content)
|
926 |
+
if action_match:
|
927 |
+
action_code = action_match.group(1)
|
928 |
+
else:
|
929 |
+
logger.warning(f"Incomplete action_code detected: {action_code}")
|
930 |
+
if action_code.startswith("fill_input_id("):
|
931 |
+
parts = action_code.split(",", 1)
|
932 |
+
if len(parts) > 1:
|
933 |
+
id_part = parts[0].replace("fill_input_id(", "").strip()
|
934 |
+
action_code = f'fill_input_id({id_part}, "Please fill the text here.")'
|
935 |
+
|
936 |
action_code = action_code.replace("`", "").strip()
|
937 |
|
938 |
return observation_result, reasoning_result, action_code
|
|
|
979 |
r"""Get the final answer based on the task prompt and current browser state.
|
980 |
It is used when the agent thinks that the task can be completed without any further action, and answer can be directly found in the current viewport.
|
981 |
"""
|
|
|
|
|
982 |
|
983 |
prompt = f"""
|
984 |
We are solving a complex web task which needs multi-step browser interaction. After the multi-step observation, reasoning and acting with web browser, we think that the task is currently solved.
|
|
|
1094 |
return False, replanned_schema
|
1095 |
|
1096 |
|
1097 |
+
@dependencies_required("playwright")
|
1098 |
def browser_simulation(self,
|
1099 |
task_prompt: str,
|
1100 |
start_url: str,
|
1101 |
+
round_limit: int = 12
|
1102 |
) -> str:
|
1103 |
r"""A powerful toolkit which can simulate the browser interaction to solve the task which needs multi-step actions.
|
1104 |
|
1105 |
Args:
|
1106 |
task_prompt (str): The task prompt to solve.
|
1107 |
start_url (str): The start URL to visit.
|
1108 |
+
round_limit (int): The round limit to solve the task (default: 12).
|
1109 |
|
1110 |
Returns:
|
1111 |
str: The simulation result to the task.
|
1112 |
"""
|
1113 |
+
|
|
|
|
|
1114 |
self._reset()
|
1115 |
task_completed = False
|
|
|
1116 |
detailed_plan = self._task_planning(task_prompt, start_url)
|
1117 |
logger.debug(f"Detailed plan: {detailed_plan}")
|
1118 |
|
1119 |
self.browser.init()
|
1120 |
self.browser.visit_page(start_url)
|
1121 |
|
1122 |
+
for i in range(round_limit):
|
1123 |
observation, reasoning, action_code = self._observe(task_prompt, detailed_plan)
|
1124 |
logger.debug(f"Observation: {observation}")
|
1125 |
logger.debug(f"Reasoning: {reasoning}")
|
1126 |
logger.debug(f"Action code: {action_code}")
|
|
|
1127 |
|
1128 |
if "stop" in action_code:
|
1129 |
task_completed = True
|