katiue committed on
Commit
c13f7a1
·
verified ·
1 Parent(s): fc29f87

Upload folder using huggingface_hub

Browse files
.env.example ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ OPENAI_ENDPOINT=https://api.openai.com/v1
2
+ OPENAI_API_KEY=
3
+
4
+ ANTHROPIC_API_KEY=
5
+
6
+ GOOGLE_API_KEY=
7
+
8
+ AZURE_OPENAI_ENDPOINT=
9
+ AZURE_OPENAI_API_KEY=
10
+
11
+ DEEPSEEK_ENDPOINT=https://api.deepseek.com
12
+ DEEPSEEK_API_KEY=
13
+
14
+ # Set to false to disable anonymized telemetry
15
+ ANONYMIZED_TELEMETRY=true
16
+
17
+ # LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info
18
+ BROWSER_USE_LOGGING_LEVEL=info
19
+
20
+ CHROME_PATH=
21
+ CHROME_USER_DATA=
.gitignore ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+ test_env/
133
+
134
+
135
+ # Spyder project settings
136
+ .spyderproject
137
+ .spyproject
138
+
139
+ # Rope project settings
140
+ .ropeproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # pytype static type analyzer
154
+ .pytype/
155
+
156
+ # Cython debug symbols
157
+ cython_debug/
158
+
159
+ # PyCharm
160
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
163
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164
+ .idea/
165
+ temp
166
+ tmp
167
+
168
+
169
+ .DS_Store
170
+
171
+ private_example.py
172
+ private_example
173
+
174
+ browser_cookies.json
175
+ cookies.json
176
+ AgentHistory.json
177
+ cv_04_24.pdf
178
+ AgentHistoryList.json
179
+ *.gif
180
+ .vercel
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md CHANGED
Binary files a/README.md and b/README.md differ
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ browser-use
2
+ langchain-google-genai
3
+ pyperclip
4
+ gradio
5
+ python-dotenv
6
+ argparse
src/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/1
3
+ # @Author : wenshao
4
+ # @Email : [email protected]
5
+ # @Project : browser-use-webui
6
+ # @FileName: __init__.py
src/agent/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/1
3
+ # @Author : wenshao
4
+ # @Email : [email protected]
5
+ # @Project : browser-use-webui
6
+ # @FileName: __init__.py
src/agent/custom_agent.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/2
3
+ # @Author : wenshao
4
+ # @ProjectName: browser-use-webui
5
+ # @FileName: custom_agent.py
6
+
7
+ import asyncio
8
+ import base64
9
+ import io
10
+ import json
11
+ import logging
12
+ import os
13
+ import pdb
14
+ import textwrap
15
+ import time
16
+ import uuid
17
+ from io import BytesIO
18
+ from pathlib import Path
19
+ from typing import Any, Optional, Type, TypeVar
20
+
21
+ from dotenv import load_dotenv
22
+ from langchain_core.language_models.chat_models import BaseChatModel
23
+ from langchain_core.messages import (
24
+ BaseMessage,
25
+ SystemMessage,
26
+ )
27
+ from openai import RateLimitError
28
+ from PIL import Image, ImageDraw, ImageFont
29
+ from pydantic import BaseModel, ValidationError
30
+
31
+ from browser_use.agent.message_manager.service import MessageManager
32
+ from browser_use.agent.prompts import AgentMessagePrompt, SystemPrompt
33
+ from browser_use.agent.service import Agent
34
+ from browser_use.agent.views import (
35
+ ActionResult,
36
+ AgentError,
37
+ AgentHistory,
38
+ AgentHistoryList,
39
+ AgentOutput,
40
+ AgentStepInfo,
41
+ )
42
+ from browser_use.browser.browser import Browser
43
+ from browser_use.browser.context import BrowserContext
44
+ from browser_use.browser.views import BrowserState, BrowserStateHistory
45
+ from browser_use.controller.registry.views import ActionModel
46
+ from browser_use.controller.service import Controller
47
+ from browser_use.dom.history_tree_processor.service import (
48
+ DOMHistoryElement,
49
+ HistoryTreeProcessor,
50
+ )
51
+ from browser_use.telemetry.service import ProductTelemetry
52
+ from browser_use.telemetry.views import (
53
+ AgentEndTelemetryEvent,
54
+ AgentRunTelemetryEvent,
55
+ AgentStepErrorTelemetryEvent,
56
+ )
57
+ from browser_use.utils import time_execution_async
58
+
59
+ from .custom_views import CustomAgentOutput, CustomAgentStepInfo
60
+ from .custom_massage_manager import CustomMassageManager
61
+
62
+ logger = logging.getLogger(__name__)
63
+
64
+
65
class CustomAgent(Agent):
    """``Agent`` subclass that carries hints, memory and task progress across steps.

    Differences from the base class that are visible in this file:

    * the message manager is replaced by ``CustomMassageManager``;
    * the LLM output is validated against ``CustomAgentOutput``;
    * a ``CustomAgentStepInfo`` object is threaded through every step and
      updated from the model's answer (memory / task progress).
    """

    # NOTE(review): ``controller=Controller()`` is evaluated once at definition
    # time, so agents built without an explicit controller share one instance.
    # The default is kept as-is to preserve the public signature.
    def __init__(
            self,
            task: str,
            llm: BaseChatModel,
            add_infos: str = '',
            browser: Browser | None = None,
            browser_context: BrowserContext | None = None,
            controller: Controller = Controller(),
            use_vision: bool = True,
            save_conversation_path: Optional[str] = None,
            max_failures: int = 5,
            retry_delay: int = 10,
            system_prompt_class: Type[SystemPrompt] = SystemPrompt,
            max_input_tokens: int = 128000,
            validate_output: bool = False,
            include_attributes: list[str] = [
                'title',
                'type',
                'name',
                'role',
                'tabindex',
                'aria-label',
                'placeholder',
                'value',
                'alt',
                'aria-expanded',
            ],
            max_error_length: int = 400,
            max_actions_per_step: int = 10,
    ):
        """Create the agent.

        Args:
            task: The user's instruction.
            llm: Chat model used to choose the next actions.
            add_infos: Optional hints forwarded to the model with every state message.
            browser: Forwarded unchanged to ``Agent.__init__``.
            browser_context: Forwarded unchanged to ``Agent.__init__``.
            controller: Forwarded unchanged to ``Agent.__init__``.
            use_vision: Forwarded unchanged to ``Agent.__init__``.
            save_conversation_path: Forwarded unchanged to ``Agent.__init__``.
            max_failures: Forwarded unchanged to ``Agent.__init__``.
            retry_delay: Forwarded unchanged to ``Agent.__init__``.
            system_prompt_class: Forwarded unchanged to ``Agent.__init__``.
            max_input_tokens: Forwarded unchanged to ``Agent.__init__``.
            validate_output: Forwarded unchanged to ``Agent.__init__``.
            include_attributes: Element attribute names forwarded to the base
                class; a copy is passed on (see the comment below).
            max_error_length: Forwarded unchanged to ``Agent.__init__``.
            max_actions_per_step: Forwarded unchanged to ``Agent.__init__``.
        """
        # Pass a copy of ``include_attributes``: the default list object is
        # created once and would otherwise be shared by every agent instance.
        super().__init__(task, llm, browser, browser_context, controller, use_vision, save_conversation_path,
                         max_failures, retry_delay, system_prompt_class, max_input_tokens, validate_output,
                         list(include_attributes), max_error_length, max_actions_per_step)
        self.add_infos = add_infos
        # Replace the message manager set up by the base class with the custom one.
        self.message_manager = CustomMassageManager(
            llm=self.llm,
            task=self.task,
            action_descriptions=self.controller.registry.get_prompt_description(),
            system_prompt_class=self.system_prompt_class,
            max_input_tokens=self.max_input_tokens,
            include_attributes=self.include_attributes,
            max_error_length=self.max_error_length,
            max_actions_per_step=self.max_actions_per_step,
        )

    def _setup_action_models(self) -> None:
        """Setup dynamic action models from controller's registry"""
        # Get the dynamic action model from controller's registry
        self.ActionModel = self.controller.registry.create_action_model()
        # Create output model with the dynamic actions
        self.AgentOutput = CustomAgentOutput.type_with_custom_actions(self.ActionModel)

    def _log_response(self, response: CustomAgentOutput) -> None:
        """Log the model's evaluation, memory, progress, thought, summary and actions."""
        brain = response.current_state
        if 'Success' in brain.prev_action_evaluation:
            emoji = '✅'
        elif 'Failed' in brain.prev_action_evaluation:
            emoji = '❌'
        else:
            emoji = '🤷'

        logger.info(f'{emoji} Eval: {brain.prev_action_evaluation}')
        logger.info(f'🧠 New Memory: {brain.important_contents}')
        logger.info(f'⏳ Task Progress: {brain.completed_contents}')
        logger.info(f'🤔 Thought: {brain.thought}')
        logger.info(f'🎯 Summary: {brain.summary}')
        for i, action in enumerate(response.action):
            logger.info(
                f'🛠️ Action {i + 1}/{len(response.action)}: {action.model_dump_json(exclude_unset=True)}'
            )

    def update_step_info(self, model_output: CustomAgentOutput, step_info: Optional[CustomAgentStepInfo] = None):
        """Fold the model's answer into ``step_info`` (no-op when it is ``None``).

        Increments the step counter, appends new ``important_contents`` to the
        memory (skipping text that contains 'None' or is already remembered),
        and replaces the task progress with ``completed_contents`` unless that
        text is empty or contains 'None'.
        """
        if step_info is None:
            return

        step_info.step_number += 1
        important_contents = model_output.current_state.important_contents
        if important_contents and 'None' not in important_contents and important_contents not in step_info.memory:
            step_info.memory += important_contents + '\n'

        completed_contents = model_output.current_state.completed_contents
        if completed_contents and 'None' not in completed_contents:
            step_info.task_progress = completed_contents

    @time_execution_async('--get_next_action')
    async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
        """Ask the LLM for the next actions and parse its JSON answer.

        Raises:
            json.JSONDecodeError: If the answer is not valid JSON after the
                Markdown code fences have been stripped.
        """
        # Use the async entry point so the event loop is not blocked while the
        # model is answering (this method is itself a coroutine).
        ret = await self.llm.ainvoke(input_messages)
        # Strip Markdown code fences before parsing the answer as JSON.
        parsed_json = json.loads(ret.content.replace('```json', '').replace("```", ""))
        parsed: AgentOutput = self.AgentOutput(**parsed_json)
        # cut the number of actions to max_actions_per_step
        parsed.action = parsed.action[: self.max_actions_per_step]
        self._log_response(parsed)
        self.n_steps += 1

        return parsed

    @time_execution_async('--step')
    async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
        """Execute one step of the task.

        Reads the browser state, asks the LLM for actions, executes them and
        records telemetry/history. Any ``Exception`` is routed through
        ``_handle_step_error``; other in-flight exceptions (e.g. cancellation)
        propagate to the caller.
        """
        logger.info(f'\n📍 Step {self.n_steps}')
        state = None
        model_output = None
        result: list[ActionResult] = []

        try:
            state = await self.browser_context.get_state(use_vision=self.use_vision)
            self.message_manager.add_state_message(state, self._last_result, step_info)
            input_messages = self.message_manager.get_messages()
            model_output = await self.get_next_action(input_messages)
            self.update_step_info(model_output, step_info)
            # step_info is optional; only log the memory when there is one.
            if step_info is not None:
                logger.info(f'🧠 All Memory: {step_info.memory}')
            self._save_conversation(input_messages, model_output)
            self.message_manager._remove_last_state_message()  # we don't want the whole state in the chat history
            self.message_manager.add_model_output(model_output)

            result = await self.controller.multi_act(
                model_output.action, self.browser_context
            )
            self._last_result = result

            if len(result) > 0 and result[-1].is_done:
                logger.info(f'📄 Result: {result[-1].extracted_content}')

            self.consecutive_failures = 0

        except Exception as e:
            result = self._handle_step_error(e)
            self._last_result = result

        finally:
            # Deliberately no ``return`` here: returning from ``finally`` would
            # discard an exception that is still propagating.
            if result:
                for r in result:
                    if r.error:
                        self.telemetry.capture(
                            AgentStepErrorTelemetryEvent(
                                agent_id=self.agent_id,
                                error=r.error,
                            )
                        )
                if state:
                    self._make_history_item(model_output, state, result)

    async def run(self, max_steps: int = 100) -> AgentHistoryList:
        """Execute the task with maximum number of steps.

        Returns the agent history. Telemetry is always emitted at the end, and
        the browser context / browser are closed unless they were injected.
        """
        try:
            logger.info(f'🚀 Starting task: {self.task}')

            self.telemetry.capture(
                AgentRunTelemetryEvent(
                    agent_id=self.agent_id,
                    task=self.task,
                )
            )

            step_info = CustomAgentStepInfo(task=self.task,
                                            add_infos=self.add_infos,
                                            step_number=1,
                                            max_steps=max_steps,
                                            memory='',
                                            task_progress=''
                                            )

            for step in range(max_steps):
                if self._too_many_failures():
                    break

                await self.step(step_info)

                if self.history.is_done():
                    if (
                            self.validate_output and step < max_steps - 1
                    ):  # if last step, we don't need to validate
                        if not await self._validate_output():
                            continue

                    logger.info('✅ Task completed successfully')
                    break
            else:
                # Loop ran out of steps without hitting ``break``.
                logger.info('❌ Failed to complete task in maximum steps')

            return self.history

        finally:
            self.telemetry.capture(
                AgentEndTelemetryEvent(
                    agent_id=self.agent_id,
                    task=self.task,
                    success=self.history.is_done(),
                    steps=len(self.history.history),
                )
            )
            if not self.injected_browser_context:
                await self.browser_context.close()

            if not self.injected_browser and self.browser:
                await self.browser.close()
src/agent/custom_massage_manager.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/2
3
+ # @Author : wenshao
4
+ # @ProjectName: browser-use-webui
5
+ # @FileName: custom_massage_manager.py
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from datetime import datetime
11
+ from typing import List, Optional, Type
12
+
13
+ from langchain_anthropic import ChatAnthropic
14
+ from langchain_core.language_models import BaseChatModel
15
+ from langchain_core.messages import (
16
+ AIMessage,
17
+ BaseMessage,
18
+ HumanMessage,
19
+ )
20
+ from langchain_openai import ChatOpenAI
21
+
22
+ from browser_use.agent.message_manager.views import MessageHistory, MessageMetadata
23
+ from browser_use.agent.prompts import AgentMessagePrompt, SystemPrompt
24
+ from browser_use.agent.views import ActionResult, AgentOutput, AgentStepInfo
25
+ from browser_use.browser.views import BrowserState
26
+ from browser_use.agent.message_manager.service import MessageManager
27
+
28
+ from .custom_prompts import CustomAgentMessagePrompt
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
class CustomMassageManager(MessageManager):
    """Message manager whose state messages are built by ``CustomAgentMessagePrompt``."""

    def __init__(
            self,
            llm: BaseChatModel,
            task: str,
            action_descriptions: str,
            system_prompt_class: Type[SystemPrompt],
            max_input_tokens: int = 128000,
            estimated_tokens_per_character: int = 3,
            image_tokens: int = 800,
            include_attributes: list[str] = [],
            max_error_length: int = 400,
            max_actions_per_step: int = 10,
    ):
        """Initialise the base manager, then restart the history with only the system prompt."""
        super().__init__(
            llm,
            task,
            action_descriptions,
            system_prompt_class,
            max_input_tokens,
            estimated_tokens_per_character,
            image_tokens,
            include_attributes,
            max_error_length,
            max_actions_per_step,
        )

        # Move Task info to state_message: start from a fresh history that
        # holds nothing but the system prompt.
        self.history = MessageHistory()
        self._add_message_with_tokens(self.system_prompt)

    def add_state_message(
            self,
            state: BrowserState,
            result: Optional[List[ActionResult]] = None,
            step_info: Optional[AgentStepInfo] = None,
    ) -> None:
        """Add browser state as human message"""

        # Results flagged ``include_in_memory`` go straight into the history;
        # once that happens the state message is built without any result.
        for outcome in result or ():
            if not outcome.include_in_memory:
                continue
            if outcome.extracted_content:
                self._add_message_with_tokens(
                    HumanMessage(content=str(outcome.extracted_content))
                )
            if outcome.error:
                error_tail = str(outcome.error)[-self.max_error_length:]
                self._add_message_with_tokens(HumanMessage(content=error_tail))
            result = None  # already in the history, do not repeat it below

        # Otherwise the result travels with the state message (which will not stay in memory).
        prompt = CustomAgentMessagePrompt(
            state,
            result,
            include_attributes=self.include_attributes,
            max_error_length=self.max_error_length,
            step_info=step_info,
        )
        self._add_message_with_tokens(prompt.get_user_message())
src/agent/custom_prompts.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/2
3
+ # @Author : wenshao
4
+ # @ProjectName: browser-use-webui
5
+ # @FileName: custom_prompts.py
6
+
7
+ from datetime import datetime
8
+ from typing import List, Optional
9
+
10
+ from langchain_core.messages import HumanMessage, SystemMessage
11
+
12
+ from browser_use.agent.views import ActionResult, AgentStepInfo
13
+ from browser_use.browser.views import BrowserState
14
+ from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
15
+
16
+ from .custom_views import CustomAgentStepInfo
17
+
18
+
19
class CustomSystemPrompt(SystemPrompt):
    # System prompt variant whose response format asks the model for the fields
    # of the custom agent state (prev_action_evaluation, important_contents,
    # completed_contents, thought, summary) plus a list of actions.

    def important_rules(self) -> str:
        """
        Return the rules section of the system prompt.

        The text is a fixed block followed by one line stating the maximum
        number of actions per sequence (``self.max_actions_per_step``).
        """
        text = """
1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
{
"current_state": {
"prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened like new suggestions in an input field. Shortly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed,' you should reflect on this during your thought.",
"important_contents": "Output important contents closely related to user\'s instruction or task on the current page. If there is, please output the contents. If not, please output \"None\".",
"completed_contents": "Update the input Task Progress. Completed contents is a general summary of the current contents that have been completed. Just summarize the contents that have been actually completed based on the current page and the history operations. Please list each completed item individually, such as: 1. Input username. 2. Input Password. 3. Click confirm button",
"thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If the output of prev_action_evaluation is 'Failed', please reflect and output your reflection here. If you think you have entered the wrong page, consider to go back to the previous page in next action.",
"summary": "Please generate a brief natural language description for the operation in next actions based on your Thought."
},
"action": [
{
"action_name": {
// action-specific parameters
}
},
// ... more actions in sequence
]
}

2. ACTIONS: You can specify multiple actions to be executed in sequence.

Common action sequences:
- Form filling: [
{"input_text": {"index": 1, "text": "username"}},
{"input_text": {"index": 2, "text": "password"}},
{"click_element": {"index": 3}}
]
- Navigation and extraction: [
{"open_new_tab": {}},
{"go_to_url": {"url": "https://example.com"}},
{"extract_page_content": {}}
]


3. ELEMENT INTERACTION:
- Only use indexes that exist in the provided element list
- Each element has a unique index number (e.g., "33[:]<button>")
- Elements marked with "_[:]" are non-interactive (for context only)

4. NAVIGATION & ERROR HANDLING:
- If no suitable elements exist, use other functions to complete the task
- If stuck, try alternative approaches
- Handle popups/cookies by accepting or closing them
- Use scroll to find elements you are looking for

5. TASK COMPLETION:
- If you think all the requirements of user\'s instruction have been completed and no further operation is required, output the done action to terminate the operation process.
- Don't hallucinate actions.
- If the task requires specific information - make sure to include everything in the done function. This is what the user will see.
- If you are running out of steps (current step), think about speeding it up, and ALWAYS use the done action as the last action.

6. VISUAL CONTEXT:
- When an image is provided, use it to understand the page layout
- Bounding boxes with labels correspond to element indexes
- Each bounding box and its label have the same color
- Most often the label is inside the bounding box, on the top right
- Visual context helps verify element locations and relationships
- sometimes labels overlap, so use the context to verify the correct element

7. Form filling:
- If you fill a input field and your action sequence is interrupted, most often a list with suggestions poped up under the field and you need to first select the right element from the suggestion list.

8. ACTION SEQUENCING:
- Actions are executed in the order they appear in the list
- Each action should logically follow from the previous one
- If the page changes after an action, the sequence is interrupted and you get the new state.
- If content only disappears the sequence continues.
- Only provide the action sequence until you think the page will change.
- Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page like saving, extracting, checkboxes...
- only use multiple actions if it makes sense.
"""
        # Append the per-step action cap as the last rule of section 8.
        text += f' - use maximum {self.max_actions_per_step} actions per sequence'
        return text

    def input_format(self) -> str:
        """Return the section of the system prompt that describes the input structure."""
        return """
INPUT STRUCTURE:
1. Task: The user\'s instructions you need to complete.
2. Hints(Optional): Some hints to help you complete the user\'s instructions.
3. Memory: Important contents are recorded during historical operations for use in subsequent operations.
4. Task Progress: Up to the current page, the content you have completed can be understood as the progress of the task.
5. Current URL: The webpage you're currently on
6. Available Tabs: List of open browser tabs
7. Interactive Elements: List in the format:
index[:]<element_type>element_text</element_type>
- index: Numeric identifier for interaction
- element_type: HTML element type (button, input, etc.)
- element_text: Visible text or element description

Example:
33[:]<button>Submit Form</button>
_[:] Non-interactive text


Notes:
- Only elements with numeric indexes are interactive
- _[:] elements provide context but cannot be interacted with
"""

    def get_system_message(self) -> SystemMessage:
        """
        Get the system prompt for the agent.

        Returns:
            SystemMessage: Message whose content combines the current date and
            time, the input format, the important rules and the action
            descriptions (``self.default_action_description``).
        """
        time_str = self.current_date.strftime('%Y-%m-%d %H:%M')

        AGENT_PROMPT = f"""You are a precise browser automation agent that interacts with websites through structured commands. Your role is to:
1. Analyze the provided webpage elements and structure
2. Plan a sequence of actions to accomplish the given task
3. Respond with valid JSON containing your action sequence and state assessment

Current date and time: {time_str}

{self.input_format()}

{self.important_rules()}

Functions:
{self.default_action_description}

Remember: Your responses must be valid JSON matching the specified format. Each action in the sequence must be valid."""
        return SystemMessage(content=AGENT_PROMPT)
149
+
150
+
151
class CustomAgentMessagePrompt:
    """Builds the per-step user message: task context, browser state and action results."""

    def __init__(
            self,
            state: BrowserState,
            result: Optional[List[ActionResult]] = None,
            include_attributes: list[str] = [],
            max_error_length: int = 400,
            step_info: Optional[CustomAgentStepInfo] = None,
    ):
        """Store the inputs used by ``get_user_message``.

        Args:
            state: Current browser state (url, tabs, element tree, screenshot).
            result: Results of the previous actions, if any.
            include_attributes: Attribute names passed to
                ``clickable_elements_to_string``; stored as given, never mutated here.
            max_error_length: Number of trailing characters kept from an error text.
            step_info: Task, hints, memory and progress for this step; may be ``None``.
        """
        self.state = state
        self.result = result
        self.max_error_length = max_error_length
        self.include_attributes = include_attributes
        self.step_info = step_info

    def get_user_message(self) -> HumanMessage:
        """Return the state description as a ``HumanMessage``.

        When the state carries a screenshot the message content is a list with
        a text part and a base64 PNG ``image_url`` part; otherwise it is the
        plain text description.
        """
        # ``step_info`` is optional: fall back to empty fields instead of
        # failing with AttributeError when it was not supplied.
        info = self.step_info
        task = info.task if info is not None else ''
        add_infos = info.add_infos if info is not None else ''
        memory = info.memory if info is not None else ''
        task_progress = info.task_progress if info is not None else ''

        state_description = f"""
1. Task: {task}
2. Hints(Optional):
{add_infos}
3. Memory:
{memory}
4. Task Progress:
{task_progress}
5. Current url: {self.state.url}
6. Available tabs:
{self.state.tabs}
7. Interactive elements:
{self.state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes)}
"""

        if self.result:
            for i, result in enumerate(self.result):
                if result.extracted_content:
                    state_description += (
                        f'\nResult of action {i + 1}/{len(self.result)}: {result.extracted_content}'
                    )
                if result.error:
                    # only keep the last ``max_error_length`` characters of the error
                    error = result.error[-self.max_error_length:]
                    state_description += f'\nError of action {i + 1}/{len(self.result)}: ...{error}'

        if self.state.screenshot:
            # Format message for vision model
            return HumanMessage(
                content=[
                    {'type': 'text', 'text': state_description},
                    {
                        'type': 'image_url',
                        'image_url': {'url': f'data:image/png;base64,{self.state.screenshot}'},
                    },
                ]
            )

        return HumanMessage(content=state_description)
src/agent/custom_views.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/2
3
+ # @Author : wenshao
4
+ # @ProjectName: browser-use-webui
5
+ # @FileName: custom_views.py
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Type
9
+ from pydantic import BaseModel, ConfigDict, Field, ValidationError, create_model
10
+ from browser_use.controller.registry.views import ActionModel
11
+ from browser_use.agent.views import AgentOutput
12
+
13
+
14
@dataclass
class CustomAgentStepInfo:
    """Mutable per-run bookkeeping that is passed to every agent step."""

    step_number: int  # incremented by CustomAgent.update_step_info after each model answer
    max_steps: int  # step budget of the run (set from ``run(max_steps=...)``)
    task: str  # the user's instruction, rendered as "Task" in the state message
    add_infos: str  # optional hints, rendered as "Hints(Optional)"
    memory: str  # accumulated important contents, one entry per line
    task_progress: str  # latest summary of completed work reported by the model
22
+
23
+
24
class CustomAgentBrain(BaseModel):
    """Current state of the agent"""

    # Fields mirror the "current_state" object requested in the system prompt.
    prev_action_evaluation: str  # evaluation text; checked for 'Success' / 'Failed' when logging
    important_contents: str  # new content to remember; appended to the step memory
    completed_contents: str  # summary of completed work; becomes the task progress
    thought: str  # the model's reasoning about what to do next
    summary: str  # short description of the upcoming actions
32
+
33
+
34
class CustomAgentOutput(AgentOutput):
    """Output model for agent

    @dev note: this model is extended with custom actions in AgentService. You can also use some fields that are not in this model as provided by the linter, as long as they are registered in the DynamicActions model.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Agent state reported by the model, followed by the actions to execute.
    current_state: CustomAgentBrain
    action: list[ActionModel]

    @staticmethod
    def type_with_custom_actions(custom_actions: Type[ActionModel]) -> Type['CustomAgentOutput']:
        """Extend actions with custom actions"""
        # Required ``action`` field (no default), re-typed to the dynamic action model.
        action_field = (list[custom_actions], Field(...))
        extended_model = create_model(
            'AgentOutput',
            __base__=CustomAgentOutput,
            __module__=CustomAgentOutput.__module__,
            action=action_field,
        )
        return extended_model
src/browser/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/1
3
+ # @Author : wenshao
4
+ # @Email : [email protected]
5
+ # @Project : browser-use-webui
6
+ # @FileName: __init__.py
src/browser/custom_browser.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/2
3
+ # @Author : wenshao
4
+ # @ProjectName: browser-use-webui
5
+ # @FileName: custom_browser.py
6
+
7
+ import logging
8
+ from browser_use.browser.browser import Browser, BrowserConfig
9
+ from browser_use.browser.context import BrowserContextConfig, BrowserContext
10
+ from .custom_context import CustomBrowserContext
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
class CustomBrowser(Browser):
    """Browser whose contexts are created as CustomBrowserContext objects."""

    async def new_context(
        self,
        config: BrowserContextConfig = BrowserContextConfig(),
        context=None
    ) -> BrowserContext:
        """Return a CustomBrowserContext bound to this browser.

        :param config: context configuration forwarded to the new context.
        :param context: optional existing Playwright context to wrap.
        """
        # Make sure the underlying Playwright browser exists first; the
        # returned handle itself is not needed here.
        await self.get_playwright_browser()

        # The wrapper receives this object, not the raw Playwright browser.
        custom_context = CustomBrowserContext(
            browser=self,
            config=config,
            context=context
        )
        return custom_context

    async def get_playwright_browser(self):
        """Return the Playwright browser, initialising it on first use."""
        if self.playwright_browser:
            return self.playwright_browser
        await self._init()
        return self.playwright_browser
src/browser/custom_context.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/1
3
+ # @Author : wenshao
4
+ # @Email : [email protected]
5
+ # @Project : browser-use-webui
6
+ # @FileName: custom_context.py
7
+
8
+ import asyncio
9
+ import base64
10
+ import json
11
+ import logging
12
+ import os
13
+ from typing import TYPE_CHECKING
14
+
15
+ from playwright.async_api import Browser as PlaywrightBrowser, Page, BrowserContext as PlaywrightContext
16
+ from browser_use.browser.context import BrowserContext, BrowserContextConfig
17
+
18
+ if TYPE_CHECKING:
19
+ from .custom_browser import CustomBrowser
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
class CustomBrowserContext(BrowserContext):
    """BrowserContext that can wrap an externally supplied Playwright context.

    When no Playwright context is passed in, one is created lazily from the
    owning browser on first use.
    """

    def __init__(
            self,
            browser: 'CustomBrowser',  # forward reference, imported only for type checking
            config: BrowserContextConfig = BrowserContextConfig(),
            context: PlaywrightContext = None
    ):
        super().__init__(browser=browser, config=config)
        # Underlying Playwright context (may be None until created).
        self._impl_context = context
        # Initial page opened by _create_context().
        self._page = None
        self.session = None

    @property
    def impl_context(self) -> PlaywrightContext:
        """The wrapped Playwright context (None if not created yet)."""
        return self._impl_context

    async def _create_context(self, config: BrowserContextConfig = None):
        """Create the Playwright context if needed and return it.

        The ``config`` parameter is accepted but not read; settings come
        from ``self.config``.
        """
        if self._impl_context:
            return self._impl_context

        # Ask our custom browser for the real Playwright browser.
        pw_browser = await self.browser.get_playwright_browser()

        settings = self.config
        options = {
            'viewport': settings.browser_window_size,
            'no_viewport': False,
            'bypass_csp': settings.disable_security,
            'ignore_https_errors': settings.disable_security,
        }

        # Video recording is only enabled when a target directory is set.
        if settings.save_recording_path:
            options['record_video_dir'] = settings.save_recording_path
            options['record_video_size'] = settings.browser_window_size

        self._impl_context = await pw_browser.new_context(**options)

        # Open a first page and navigate it to a blank document.
        self._page = await self._impl_context.new_page()
        await self._page.goto('about:blank')

        return self._impl_context

    async def new_page(self) -> Page:
        """Open a new page, creating the context first when necessary."""
        if not self._impl_context:
            await self._create_context()
        page = await self._impl_context.new_page()
        return page

    async def __aenter__(self):
        if not self._impl_context:
            await self._create_context()
        return self

    async def __aexit__(self, *args):
        if not self._impl_context:
            return
        await self._impl_context.close()
        self._impl_context = None

    @property
    def pages(self):
        """Pages of the wrapped context, or an empty list without a context."""
        if self._impl_context:
            return self._impl_context.pages
        return []

    async def get_state(self, **kwargs):
        """Delegate to the base class, but only once a page exists.

        Returns None when there is no context or the context has no pages.
        """
        if not self._impl_context:
            return None
        # ``pages`` is a plain property on the Playwright context.
        if not self._impl_context.pages:
            return None
        return await super().get_state(**kwargs)

    async def get_pages(self):
        """Async accessor for the context's pages (empty list if no context)."""
        if self._impl_context:
            return self._impl_context.pages
        return []
src/controller/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/2
3
+ # @Author : wenshao
4
+ # @ProjectName: browser-use-webui
5
+ # @FileName: __init__.py
src/controller/custom_controller.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/2
3
+ # @Author : wenshao
4
+ # @ProjectName: browser-use-webui
5
+ # @FileName: custom_controller.py
6
+
7
+ import pyperclip
8
+
9
+ from browser_use.controller.service import Controller
10
+ from browser_use.agent.views import ActionResult
11
+ from browser_use.browser.context import BrowserContext
12
+
13
+
14
class CustomController(Controller):
    """Controller extended with clipboard copy/paste actions."""

    def __init__(self):
        super().__init__()
        self._register_custom_actions()

    def _register_custom_actions(self):
        """Register the clipboard actions on this controller's registry."""

        @self.registry.action('Copy text to clipboard')
        def copy_to_clipboard(text: str):
            # Put the text on the system clipboard and echo it back.
            pyperclip.copy(text)
            result = ActionResult(extracted_content=text)
            return result

        @self.registry.action('Paste text from clipboard', requires_browser=True)
        async def paste_from_clipboard(browser: BrowserContext):
            clipboard_text = pyperclip.paste()
            # Type the clipboard contents into the currently active page.
            current_page = await browser.get_current_page()
            await current_page.keyboard.type(clipboard_text)

            return ActionResult(extracted_content=clipboard_text)
src/utils/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/1
3
+ # @Author : wenshao
4
+ # @Email : [email protected]
5
+ # @Project : browser-use-webui
6
+ # @FileName: __init__.py
src/utils/file_utils.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ from pathlib import Path
4
+ from typing import Dict, Optional
5
+
6
def get_latest_files(directory: str, file_types: Optional[list] = None) -> Dict[str, Optional[str]]:
    """Find the most recently modified file of each type under ``directory``.

    :param directory: root folder that is searched recursively. An empty or
        ``None`` value means "no recording directory" and yields no files.
    :param file_types: file suffixes to look for; defaults to
        ``['.webm', '.zip']`` (recordings and traces).
    :return: mapping of each suffix to the path of its newest file, or
        ``None`` when nothing suitable was found.
    """
    # Avoid a shared mutable default argument.
    if file_types is None:
        file_types = ['.webm', '.zip']

    latest_files: Dict[str, Optional[str]] = {ext: None for ext in file_types}

    # Nothing configured: do not try to create or scan a path like "".
    if not directory:
        return latest_files

    if not os.path.exists(directory):
        os.makedirs(directory, exist_ok=True)
        return latest_files

    root = Path(directory)
    for file_type in file_types:
        try:
            latest = max(
                root.rglob(f"*{file_type}"),
                key=lambda p: p.stat().st_mtime,
                default=None,
            )
            # Only return files that are complete (not modified within the
            # last second, i.e. presumably no longer being written).
            if latest is not None and time.time() - latest.stat().st_mtime > 1.0:
                latest_files[file_type] = str(latest)
        except Exception as e:
            # Best effort: a failing lookup for one type must not break the rest.
            print(f"Error getting latest {file_type} file: {e}")

    return latest_files
src/utils/stream_utils.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import asyncio
3
+ from typing import AsyncGenerator
4
+ from playwright.async_api import BrowserContext, Error as PlaywrightError
5
+
6
async def capture_screenshot(browser_context: BrowserContext) -> str:
    """Capture a page screenshot and return it as an HTML snippet.

    :param browser_context: object exposing an ``impl_context`` attribute
        whose ``pages`` list holds the open pages.
    :return: an ``<img>`` tag with the JPEG embedded as a base64 data URL,
        or a ``<div>`` describing why no screenshot is available. Errors are
        reported in the returned HTML instead of being raised.
    """
    try:
        # Get the implementation context
        context = getattr(browser_context, 'impl_context', None)
        if not context:
            return "<div>No browser context implementation available</div>"

        # Get all pages
        all_pages = context.pages
        if not all_pages:
            return "<div>Waiting for page to be available...</div>"

        # Prefer the second page when there is one (presumably index 0 is the
        # blank bootstrap page -- TODO confirm); otherwise fall back to the
        # only page instead of failing with an IndexError.
        page = all_pages[1] if len(all_pages) > 1 else all_pages[0]
        try:
            screenshot = await page.screenshot(
                type='jpeg',
                quality=75,
                scale="css"
            )
            encoded = base64.b64encode(screenshot).decode('utf-8')
            return f'<img src="data:image/jpeg;base64,{encoded}" style="width:100%; max-width:1200px; border:1px solid #ccc;">'
        except Exception as e:
            return f"<div class='error'>Screenshot failed: {str(e)}</div>"
    except Exception as e:
        return f"<div class='error'>Screenshot error: {str(e)}</div>"
32
+
33
async def stream_browser_view(browser_context: BrowserContext) -> AsyncGenerator[str, None]:
    """Endlessly yield HTML snapshots of the browser for the UI.

    Each iteration yields the result of capture_screenshot() and then waits
    0.2 s. If producing a frame raises, an error ``<div>`` is yielded and
    the loop pauses for one second before trying again.
    """
    try:
        while True:
            try:
                yield await capture_screenshot(browser_context)
                await asyncio.sleep(0.2)  # 5 FPS
            except Exception as frame_error:
                yield f"<div class='error'>Screenshot error: {str(frame_error)}</div>"
                await asyncio.sleep(1)  # Wait before retrying
    except Exception as stream_error:
        yield f"<div class='error'>Stream error: {str(stream_error)}</div>"
src/utils/utils.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/1
3
+ # @Author : wenshao
4
+ # @Email : [email protected]
5
+ # @Project : browser-use-webui
6
+ # @FileName: utils.py
7
+
8
+ import base64
9
+ import os
10
+
11
+ from langchain_openai import ChatOpenAI, AzureChatOpenAI
12
+ from langchain_anthropic import ChatAnthropic
13
+ from langchain_google_genai import ChatGoogleGenerativeAI
14
+
15
+
16
def _first_nonempty(value, fallback):
    """Return ``value`` unless it is missing/empty, otherwise ``fallback``."""
    return value if value else fallback


def get_llm_model(provider: str, **kwargs):
    """Build a LangChain chat model for the given provider.

    :param provider: one of ``'anthropic'``, ``'openai'``, ``'deepseek'``,
        ``'gemini'`` or ``'azure_openai'``.
    :param kwargs: optional ``model_name``, ``temperature`` (default 0.0),
        ``base_url`` and ``api_key``. Empty ``base_url`` / ``api_key`` values
        fall back to provider defaults or environment variables.
    :return: the configured chat model instance.
    :raises ValueError: if ``provider`` is not supported.
    """
    temperature = kwargs.get("temperature", 0.0)
    base_url_arg = kwargs.get("base_url", "")
    api_key_arg = kwargs.get("api_key", "")

    if provider == 'anthropic':
        return ChatAnthropic(
            model_name=kwargs.get("model_name", 'claude-3-5-sonnet-20240620'),
            temperature=temperature,
            base_url=_first_nonempty(base_url_arg, "https://api.anthropic.com"),
            api_key=_first_nonempty(api_key_arg, os.getenv("ANTHROPIC_API_KEY", ""))
        )
    elif provider == 'openai':
        return ChatOpenAI(
            model=kwargs.get("model_name", 'gpt-4o'),
            temperature=temperature,
            base_url=_first_nonempty(base_url_arg, "https://api.openai.com/v1"),
            api_key=_first_nonempty(api_key_arg, os.getenv("OPENAI_API_KEY", ""))
        )
    elif provider == 'deepseek':
        # DeepSeek is reached through the OpenAI-compatible client. Use a
        # DeepSeek model name and endpoint by default: 'gpt-4o' is not a
        # DeepSeek model and an empty base URL does not point at DeepSeek.
        deepseek_endpoint = _first_nonempty(
            os.getenv("DEEPSEEK_ENDPOINT", ""), "https://api.deepseek.com"
        )
        return ChatOpenAI(
            model=kwargs.get("model_name", 'deepseek-chat'),
            temperature=temperature,
            base_url=_first_nonempty(base_url_arg, deepseek_endpoint),
            api_key=_first_nonempty(api_key_arg, os.getenv("DEEPSEEK_API_KEY", ""))
        )
    elif provider == 'gemini':
        return ChatGoogleGenerativeAI(
            model=kwargs.get("model_name", 'gemini-2.0-flash-exp'),
            temperature=temperature,
            google_api_key=_first_nonempty(api_key_arg, os.getenv("GOOGLE_API_KEY", "")),
        )
    elif provider == "azure_openai":
        return AzureChatOpenAI(
            model=kwargs.get("model_name", 'gpt-4o'),
            temperature=temperature,
            api_version="2024-05-01-preview",
            azure_endpoint=_first_nonempty(base_url_arg, os.getenv("AZURE_OPENAI_ENDPOINT", "")),
            api_key=_first_nonempty(api_key_arg, os.getenv("AZURE_OPENAI_API_KEY", ""))
        )
    else:
        raise ValueError(f'Unsupported provider: {provider}')
102
+
103
+
104
def encode_image(img_path):
    """Return the file at ``img_path`` as base64 text.

    :param img_path: path of the image file; a falsy value (``None``, ``""``)
        means "no image".
    :return: base64-encoded file contents as a ``str``, or ``None`` when no
        path was given.
    """
    if img_path:
        with open(img_path, "rb") as image_file:
            raw_bytes = image_file.read()
        return base64.b64encode(raw_bytes).decode("utf-8")
    return None
tests/test_browser_use.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/2
3
+ # @Author : wenshao
4
+ # @ProjectName: browser-use-webui
5
+ # @FileName: test_browser_use.py
6
+ import pdb
7
+
8
+ from dotenv import load_dotenv
9
+
10
+ load_dotenv()
11
+ import sys
12
+
13
+ sys.path.append(".")
14
+ import os
15
+ import sys
16
+ from pprint import pprint
17
+
18
+ import asyncio
19
+ from browser_use import Agent
20
+ from browser_use.agent.views import AgentHistoryList
21
+
22
+ from src.utils import utils
23
+
24
+
25
async def test_browser_use_org():
    """Run the stock browser-use Agent on a sample search task and print the results.

    Uses an Azure OpenAI model configured from environment variables and a
    visible 1920x1080 browser window.
    """
    from browser_use.browser.browser import Browser, BrowserConfig
    from browser_use.browser.context import (
        BrowserContext,
        BrowserContextConfig,
        BrowserContextWindowSize,
    )
    llm = utils.get_llm_model(
        provider="azure_openai",
        model_name="gpt-4o",
        temperature=0.8,
        base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
        api_key=os.getenv("AZURE_OPENAI_API_KEY", "")
    )

    window_w, window_h = 1920, 1080

    browser_config = BrowserConfig(
        headless=False,
        disable_security=True,
        extra_chromium_args=[f'--window-size={window_w},{window_h}'],
    )
    browser = Browser(config=browser_config)

    context_config = BrowserContextConfig(
        trace_path='./tmp/traces',
        save_recording_path="./tmp/record_videos",
        no_viewport=False,
        browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h),
    )
    async with await browser.new_context(config=context_config) as browser_context:
        agent = Agent(
            task="go to google.com and type 'OpenAI' click search and give me the first url",
            llm=llm,
            browser_context=browser_context,
        )
        history: AgentHistoryList = await agent.run(max_steps=10)

        # Dump each part of the run history under its own heading.
        print('Final Result:')
        pprint(history.final_result(), indent=4)

        print('\nErrors:')
        pprint(history.errors(), indent=4)

        # e.g. xPaths the model clicked on
        print('\nModel Outputs:')
        pprint(history.model_actions(), indent=4)

        print('\nThoughts:')
        pprint(history.model_thoughts(), indent=4)
    # Shut the browser down once the context has been released.
    await browser.close()
78
+
79
+
80
async def test_browser_use_custom():
    """Run the CustomAgent on a sample search task and print the results.

    Optionally (``use_own_browser``) drives a locally installed Chrome via a
    persistent Playwright context configured through ``CHROME_PATH`` and
    ``CHROME_USER_DATA``. Any exception is printed rather than raised, and
    all browser resources that were created are released afterwards.
    """
    from playwright.async_api import async_playwright
    from browser_use.browser.context import BrowserContextWindowSize

    from src.browser.custom_browser import CustomBrowser, BrowserConfig
    from src.browser.custom_context import BrowserContext, BrowserContextConfig
    from src.controller.custom_controller import CustomController
    from src.agent.custom_agent import CustomAgent
    from src.agent.custom_prompts import CustomSystemPrompt
    from src.browser.custom_context import CustomBrowserContext

    window_w, window_h = 1920, 1080

    # Alternative providers, kept for manual switching:
    # llm = utils.get_llm_model(
    #     provider="azure_openai",
    #     model_name="gpt-4o",
    #     temperature=0.8,
    #     base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
    #     api_key=os.getenv("AZURE_OPENAI_API_KEY", "")
    # )

    # llm = utils.get_llm_model(
    #     provider="gemini",
    #     model_name="gemini-2.0-flash-exp",
    #     temperature=1.0,
    #     api_key=os.getenv("GOOGLE_API_KEY", "")
    # )

    llm = utils.get_llm_model(
        provider="deepseek",
        model_name="deepseek-chat",
        temperature=0.8
    )

    controller = CustomController()
    use_own_browser = False
    disable_security = True
    use_vision = False
    playwright = None
    browser_context_ = None
    # Pre-bind so the cleanup in ``finally`` never hits an unbound name when
    # set-up fails before the browser is created.
    browser = None
    try:
        if use_own_browser:
            playwright = await async_playwright().start()
            chrome_exe = os.getenv("CHROME_PATH", "")
            chrome_use_data = os.getenv("CHROME_USER_DATA", "")
            browser_context_ = await playwright.chromium.launch_persistent_context(
                user_data_dir=chrome_use_data,
                executable_path=chrome_exe,
                no_viewport=False,
                headless=False,  # keep the browser window visible
                user_agent=(
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                    '(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
                ),
                java_script_enabled=True,
                bypass_csp=disable_security,
                ignore_https_errors=disable_security,
                record_video_dir="./tmp/record_videos",
                record_video_size={'width': window_w, 'height': window_h}
            )

        browser = CustomBrowser(
            config=BrowserConfig(
                headless=False,
                disable_security=True,
                extra_chromium_args=[f'--window-size={window_w},{window_h}'],
            )
        )

        async with await browser.new_context(
                config=BrowserContextConfig(
                    trace_path='./tmp/result_processing',
                    save_recording_path="./tmp/record_videos",
                    no_viewport=False,
                    browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h),
                ),
                context=browser_context_
        ) as browser_context:
            agent = CustomAgent(
                task="go to google.com and type 'OpenAI' click search and give me the first url",
                add_infos="",  # some hints for llm to complete the task
                llm=llm,
                browser_context=browser_context,
                controller=controller,
                system_prompt_class=CustomSystemPrompt,
                use_vision=use_vision
            )
            history: AgentHistoryList = await agent.run(max_steps=10)

            print('Final Result:')
            pprint(history.final_result(), indent=4)

            print('\nErrors:')
            pprint(history.errors(), indent=4)

            # e.g. xPaths the model clicked on
            print('\nModel Outputs:')
            pprint(history.model_actions(), indent=4)

            print('\nThoughts:')
            pprint(history.model_thoughts(), indent=4)
    except Exception:
        # Manual test script: report the failure but still run the cleanup.
        import traceback
        traceback.print_exc()
    finally:
        # Explicitly close the persistent context
        if browser_context_:
            await browser_context_.close()

        # Stop the Playwright driver
        if playwright:
            await playwright.stop()

        if browser:
            await browser.close()
197
+
198
+
199
if __name__ == '__main__':
    # Script entry point: only the custom-agent scenario is run; uncomment
    # the line below to run the stock-agent scenario instead.
    # asyncio.run(test_browser_use_org())
    asyncio.run(test_browser_use_custom())
tests/test_llm_api.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/1
3
+ # @Author : wenshao
4
+ # @Email : [email protected]
5
+ # @Project : browser-use-webui
6
+ # @FileName: test_llm_api.py
7
+ import os
8
+ import pdb
9
+
10
+ from dotenv import load_dotenv
11
+
12
+ load_dotenv()
13
+
14
+ import sys
15
+
16
+ sys.path.append(".")
17
+
18
+
19
def test_openai_model():
    """Send a text + image prompt to an OpenAI chat model and print the reply.

    Endpoint and key are read from ``OPENAI_ENDPOINT`` / ``OPENAI_API_KEY``.
    """
    from langchain_core.messages import HumanMessage
    from src.utils import utils

    llm = utils.get_llm_model(
        provider="openai",
        model_name="gpt-4o",
        temperature=0.8,
        base_url=os.getenv("OPENAI_ENDPOINT", ""),
        api_key=os.getenv("OPENAI_API_KEY", "")
    )
    encoded_image = utils.encode_image("assets/examples/test.png")

    # Multimodal content: an instruction followed by the inlined image.
    text_part = {"type": "text", "text": "describe this image"}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
    }
    reply = llm.invoke([HumanMessage(content=[text_part, image_part])])
    print(reply.content)
43
+
44
+
45
def test_gemini_model():
    """Send a text + image prompt to a Gemini chat model and print the reply.

    The key is read from ``GOOGLE_API_KEY``.
    """
    # you need to enable your api key first: https://ai.google.dev/palm_docs/oauth_quickstart
    from langchain_core.messages import HumanMessage
    from src.utils import utils

    llm = utils.get_llm_model(
        provider="gemini",
        model_name="gemini-2.0-flash-exp",
        temperature=0.8,
        api_key=os.getenv("GOOGLE_API_KEY", "")
    )

    encoded_image = utils.encode_image("assets/examples/test.png")

    # Multimodal content: an instruction followed by the inlined image.
    text_part = {"type": "text", "text": "describe this image"}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
    }
    reply = llm.invoke([HumanMessage(content=[text_part, image_part])])
    print(reply.content)
70
+
71
+
72
def test_azure_openai_model():
    """Send a text + image prompt to an Azure OpenAI chat model and print the reply.

    Endpoint and key are read from ``AZURE_OPENAI_ENDPOINT`` /
    ``AZURE_OPENAI_API_KEY``.
    """
    from langchain_core.messages import HumanMessage
    from src.utils import utils

    llm = utils.get_llm_model(
        provider="azure_openai",
        model_name="gpt-4o",
        temperature=0.8,
        base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
        api_key=os.getenv("AZURE_OPENAI_API_KEY", "")
    )
    encoded_image = utils.encode_image("assets/examples/test.png")

    # Multimodal content: an instruction followed by the inlined image.
    text_part = {"type": "text", "text": "describe this image"}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
    }
    reply = llm.invoke([HumanMessage(content=[text_part, image_part])])
    print(reply.content)
96
+
97
+
98
def test_deepseek_model():
    """Send a short text prompt to the DeepSeek chat model and print the reply.

    Endpoint and key are read from ``DEEPSEEK_ENDPOINT`` /
    ``DEEPSEEK_API_KEY``.
    """
    from langchain_core.messages import HumanMessage
    from src.utils import utils

    llm = utils.get_llm_model(
        provider="deepseek",
        model_name="deepseek-chat",
        temperature=0.8,
        base_url=os.getenv("DEEPSEEK_ENDPOINT", ""),
        api_key=os.getenv("DEEPSEEK_API_KEY", "")
    )
    # The leftover debugger breakpoint was removed here: it halted every
    # run and hangs non-interactive executions.
    message = HumanMessage(
        content=[
            {"type": "text", "text": "who are you?"}
        ]
    )
    ai_msg = llm.invoke([message])
    print(ai_msg.content)
117
+
118
+
119
if __name__ == '__main__':
    # Script entry point: run one provider check at a time; uncomment the
    # provider you want to exercise.
    # test_openai_model()
    # test_gemini_model()
    # test_azure_openai_model()
    test_deepseek_model()
tests/test_playwright.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/2
3
+ # @Author : wenshao
4
+ # @Email : [email protected]
5
+ # @Project : browser-use-webui
6
+ # @FileName: test_playwright.py
7
+ import pdb
8
+ from dotenv import load_dotenv
9
+
10
+ load_dotenv()
11
+
12
+
13
def test_connect_browser():
    """Open the local Chrome profile with Playwright and load the Gmail inbox.

    The Chrome executable and user-data directory come from ``CHROME_PATH``
    and ``CHROME_USER_DATA``. The browser stays open until Enter is pressed.
    """
    import os
    from playwright.sync_api import sync_playwright

    chrome_exe = os.getenv("CHROME_PATH", "")
    chrome_use_data = os.getenv("CHROME_USER_DATA", "")

    with sync_playwright() as p:
        launch_options = dict(
            user_data_dir=chrome_use_data,
            executable_path=chrome_exe,
            headless=False,  # keep the browser window visible
        )
        browser = p.chromium.launch_persistent_context(**launch_options)

        page = browser.new_page()
        page.goto("https://mail.google.com/mail/u/0/#inbox")
        page.wait_for_load_state()

        # Block until the user confirms, then shut the browser down.
        input("按下回车键以关闭浏览器...")

        browser.close()
34
+
35
+
36
if __name__ == '__main__':
    # Script entry point: run the manual browser-connection check.
    test_connect_browser()
webui.py ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2025/1/1
3
+ # @Author : wenshao
4
+ # @Email : [email protected]
5
+ # @Project : browser-use-webui
6
+ # @FileName: webui.py
7
+ from dotenv import load_dotenv
8
+ load_dotenv()
9
+ import argparse
10
+ import gradio as gr
11
+ import os
12
+ import asyncio
13
+ from playwright.async_api import async_playwright
14
+ from browser_use.browser.browser import Browser, BrowserConfig
15
+ from browser_use.browser.context import (
16
+ BrowserContextConfig,
17
+ BrowserContextWindowSize,
18
+ )
19
+ from browser_use.agent.service import Agent
20
+ from src.browser.custom_browser import CustomBrowser
21
+ from src.controller.custom_controller import CustomController
22
+ from src.agent.custom_agent import CustomAgent
23
+ from src.agent.custom_prompts import CustomSystemPrompt
24
+
25
+ from src.utils import utils
26
+ from src.utils.file_utils import get_latest_files
27
+ from src.utils.stream_utils import stream_browser_view, capture_screenshot
28
+
29
+
30
async def run_browser_agent(
        agent_type,
        llm_provider,
        llm_model_name,
        llm_temperature,
        llm_base_url,
        llm_api_key,
        use_own_browser,
        headless,
        disable_security,
        window_w,
        window_h,
        save_recording_path,
        task,
        add_infos,
        max_steps,
        use_vision,
        browser_context=None  # Added optional argument
):
    """
    Build the LLM from the user's settings and dispatch to the chosen agent.

    ``agent_type`` selects ``run_org_agent`` ("org") or ``run_custom_agent``
    ("custom"); any other value raises ``ValueError``. The selected runner's
    result is returned unchanged.
    """

    llm = utils.get_llm_model(
        provider=llm_provider,
        model_name=llm_model_name,
        temperature=llm_temperature,
        base_url=llm_base_url,
        api_key=llm_api_key
    )

    # Arguments that both runners accept.
    shared_args = dict(
        llm=llm,
        headless=headless,
        disable_security=disable_security,
        window_w=window_w,
        window_h=window_h,
        save_recording_path=save_recording_path,
        task=task,
        max_steps=max_steps,
        use_vision=use_vision,
        browser_context=browser_context,  # pass context
    )

    if agent_type == "org":
        return await run_org_agent(**shared_args)
    if agent_type == "custom":
        return await run_custom_agent(
            use_own_browser=use_own_browser,
            add_infos=add_infos,
            **shared_args
        )
    raise ValueError(f"Invalid agent type: {agent_type}")
90
+
91
+
92
async def run_org_agent(
        llm,
        headless,
        disable_security,
        window_w,
        window_h,
        save_recording_path,
        task,
        max_steps,
        use_vision,
        browser_context=None  # receive context
):
    """Run the stock browser-use Agent and collect its results.

    When ``browser_context`` is given it is reused as-is; otherwise a new
    browser and context are created for this run and the browser is always
    closed again, even if the agent raises.

    :return: tuple ``(final_result, errors, model_actions, model_thoughts,
        latest_webm_path, latest_zip_path)``.
    """

    async def _run(context):
        # Run the agent on ``context`` and gather everything the UI shows.
        agent = Agent(
            task=task,
            llm=llm,
            use_vision=use_vision,
            browser_context=context,
        )
        history = await agent.run(max_steps=max_steps)

        final_result = history.final_result()
        errors = history.errors()
        model_actions = history.model_actions()
        model_thoughts = history.model_thoughts()

        recorded_files = get_latest_files(save_recording_path)
        return final_result, errors, model_actions, model_thoughts, recorded_files.get('.webm'), recorded_files.get('.zip')

    if browser_context is not None:
        # Reuse existing context
        return await _run(browser_context)

    browser = Browser(
        config=BrowserConfig(
            headless=False,  # Force non-headless for streaming
            disable_security=disable_security,
            extra_chromium_args=[f'--window-size={window_w},{window_h}'],
        )
    )
    try:
        async with await browser.new_context(
                config=BrowserContextConfig(
                    trace_path='./tmp/traces',
                    save_recording_path=save_recording_path if save_recording_path else None,
                    no_viewport=False,
                    browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h),
                )
        ) as browser_context_in:
            return await _run(browser_context_in)
    finally:
        # Release the browser on success and on failure alike.
        await browser.close()
153
+
154
+
155
async def run_custom_agent(
        llm,
        use_own_browser,
        headless,
        disable_security,
        window_w,
        window_h,
        save_recording_path,
        task,
        add_infos,
        max_steps,
        use_vision,
        browser_context=None  # receive context
):
    """Run the CustomAgent and collect its results.

    With ``use_own_browser`` a persistent Playwright context is launched for
    the Chrome installation named by ``CHROME_PATH`` / ``CHROME_USER_DATA``.
    When ``browser_context`` is given the agent runs in it; otherwise a
    CustomBrowser and a fresh context are created. Exceptions are not
    raised: they are printed and returned in the ``errors`` slot.

    :return: tuple ``(final_result, errors, model_actions, model_thoughts,
        latest_webm_path, latest_zip_path)``.
    """
    controller = CustomController()
    playwright = None
    browser = None
    # Pre-bind so the cleanup in ``finally`` never hits an unbound name when
    # Playwright start-up itself fails.
    browser_context_ = None

    async def _run(context):
        # Run the agent on ``context`` and gather everything the UI shows.
        agent = CustomAgent(
            task=task,
            add_infos=add_infos,
            use_vision=use_vision,
            llm=llm,
            browser_context=context,
            controller=controller,
            system_prompt_class=CustomSystemPrompt
        )
        history = await agent.run(max_steps=max_steps)
        return (
            history.final_result(),
            history.errors(),
            history.model_actions(),
            history.model_thoughts(),
            get_latest_files(save_recording_path),
        )

    try:
        if use_own_browser:
            playwright = await async_playwright().start()
            chrome_exe = os.getenv("CHROME_PATH", "")
            chrome_use_data = os.getenv("CHROME_USER_DATA", "")
            browser_context_ = await playwright.chromium.launch_persistent_context(
                user_data_dir=chrome_use_data,
                executable_path=chrome_exe,
                no_viewport=False,
                headless=headless,  # controls whether the browser window is visible
                user_agent=(
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                    '(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
                ),
                java_script_enabled=True,
                bypass_csp=disable_security,
                ignore_https_errors=disable_security,
                record_video_dir=save_recording_path if save_recording_path else None,
                record_video_size={'width': window_w, 'height': window_h}
            )

        if browser_context is not None:
            # Reuse context
            final_result, errors, model_actions, model_thoughts, recorded_files = await _run(browser_context)
        else:
            browser = CustomBrowser(
                config=BrowserConfig(
                    headless=headless,
                    disable_security=disable_security,
                    extra_chromium_args=[f'--window-size={window_w},{window_h}'],
                )
            )
            async with await browser.new_context(
                    config=BrowserContextConfig(
                        trace_path='./tmp/result_processing',
                        save_recording_path=save_recording_path if save_recording_path else None,
                        no_viewport=False,
                        browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h),
                    ),
                    context=browser_context_
            ) as browser_context_in:
                final_result, errors, model_actions, model_thoughts, recorded_files = await _run(browser_context_in)

    except Exception as e:
        # Surface the failure to the UI instead of propagating it.
        import traceback
        traceback.print_exc()
        final_result = ""
        errors = str(e) + "\n" + traceback.format_exc()
        model_actions = ""
        model_thoughts = ""
        recorded_files = {}
    finally:
        # Explicitly close the persistent context
        if browser_context_:
            await browser_context_.close()

        # Stop the Playwright driver
        if playwright:
            await playwright.stop()
        if browser:
            await browser.close()
    return final_result, errors, model_actions, model_thoughts, recorded_files.get('.webm'), recorded_files.get('.zip')
267
+
268
+
269
async def run_with_stream(*args):
    """Run the browser agent while streaming live browser views to the UI.

    ``args`` carries the values of the ``run_button.click`` inputs, in order:
    agent_type, llm_provider, llm_model_name, llm_temperature, llm_base_url,
    llm_api_key, use_own_browser, headless, disable_security, window_w,
    window_h, save_recording_path, task, add_infos, max_steps, use_vision.
    All of them are forwarded unchanged to ``run_browser_agent``.

    Yields:
        A 7-item list matching the click handler's outputs: browser view
        HTML, final result, errors, model actions, model thoughts, recording
        file and trace file.
    """
    browser = None
    try:
        # Name the positional inputs this wrapper needs for its own browser.
        headless = args[7]
        disable_security = args[8]
        window_w, window_h = args[9], args[10]
        # An empty textbox value means "no recording path".
        save_recording_path = args[11] or None

        browser = CustomBrowser(config=BrowserConfig(
            headless=headless,
            disable_security=disable_security,
            extra_chromium_args=[f'--window-size={window_w},{window_h}'],
        ))

        async with await browser.new_context(
            config=BrowserContextConfig(
                trace_path='./tmp/traces',
                save_recording_path=save_recording_path,
                no_viewport=False,
                browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h),
            )
        ) as browser_context:
            # No need to explicitly create page - context creation handles it

            # Run agent in background
            agent_task = asyncio.create_task(run_browser_agent(*args, browser_context=browser_context))
            try:
                # Initialize values
                html_content = "<div>Starting browser...</div>"
                final_result = errors = model_actions = model_thoughts = ""
                recording = trace = None

                # Poll the browser view until the agent finishes.
                while not agent_task.done():
                    try:
                        html_content = await capture_screenshot(browser_context)
                    except Exception as e:
                        html_content = f"<div class='error'>Screenshot error: {str(e)}</div>"

                    yield [html_content, final_result, errors, model_actions, model_thoughts, recording, trace]
                    await asyncio.sleep(0.2)

                # Get agent results when done
                try:
                    result = await agent_task
                    if isinstance(result, tuple) and len(result) == 6:
                        final_result, errors, model_actions, model_thoughts, recording, trace = result
                    else:
                        errors = "Unexpected result format from agent"
                except Exception as e:
                    errors = f"Agent error: {str(e)}"

                yield [
                    html_content,
                    final_result,
                    errors,
                    model_actions,
                    model_thoughts,
                    recording,
                    trace
                ]
            finally:
                # If the stream ends early (consumer went away or an error
                # escaped the loop), stop the agent before the context and
                # browser it is using are torn down.
                if not agent_task.done():
                    agent_task.cancel()
                    try:
                        await agent_task
                    except (asyncio.CancelledError, Exception):
                        # Best-effort teardown: the task's outcome is no
                        # longer of interest once the stream is closing.
                        pass

    except Exception as e:
        import traceback
        yield [
            f"<div class='error'>Browser error: {str(e)}</div>",
            "",
            f"Error: {str(e)}\n{traceback.format_exc()}",
            "",
            "",
            None,
            None
        ]
    finally:
        if browser:
            await browser.close()
340
+
341
+
342
def main(ip="127.0.0.1", port=7788):
    """Build the Gradio UI for the browser agent and launch the server.

    Args:
        ip: Address passed to ``demo.launch`` as ``server_name``.
        port: Port passed to ``demo.launch`` as ``server_port``.

    The defaults equal the command-line defaults, so ``main()`` also works
    when this module is imported and no arguments were parsed.
    """
    # Gradio UI setup
    with gr.Blocks(title="Browser Use WebUI", theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")])) as demo:
        gr.Markdown("<center><h1>Browser Use WebUI</h1></center>")
        with gr.Row():
            agent_type = gr.Radio(["org", "custom"], label="Agent Type", value="custom")
            max_steps = gr.Number(label="max run steps", value=100)
            use_vision = gr.Checkbox(label="use vision", value=True)
        with gr.Row():
            llm_provider = gr.Dropdown(
                ["anthropic", "openai", "gemini", "azure_openai", "deepseek"], label="LLM Provider", value="gemini"
            )
            llm_model_name = gr.Textbox(label="LLM Model Name", value="gemini-2.0-flash-exp")
            llm_temperature = gr.Number(label="LLM Temperature", value=1.0)
        with gr.Row():
            llm_base_url = gr.Textbox(label="LLM Base URL")
            llm_api_key = gr.Textbox(label="LLM API Key", type="password")

        with gr.Accordion("Browser Settings", open=False):
            use_own_browser = gr.Checkbox(label="Use Own Browser", value=False)
            headless = gr.Checkbox(label="Headless", value=False)
            disable_security = gr.Checkbox(label="Disable Security", value=True)
            with gr.Row():
                window_w = gr.Number(label="Window Width", value=1920)
                window_h = gr.Number(label="Window Height", value=1080)
            save_recording_path = gr.Textbox(label="Save Recording Path", placeholder="e.g. ./tmp/record_videos",
                                             value="./tmp/record_videos")
        with gr.Accordion("Task Settings", open=True):
            task = gr.Textbox(label="Task", lines=10,
                              value="go to google.com and type 'OpenAI' click search and give me the first url")
            add_infos = gr.Textbox(label="Additional Infos(Optional): Hints to help LLM complete Task", lines=5)

        run_button = gr.Button("Run Agent", variant="primary")
        with gr.Column():
            # Add live stream viewer before other components
            browser_view = gr.HTML(
                label="Live Browser View",
                value="<div style='width:100%; height:600px; border:1px solid #ccc; display:flex; align-items:center; justify-content:center;'><p>Waiting for browser session...</p></div>"
            )
            final_result_output = gr.Textbox(label="Final Result", lines=5)
            errors_output = gr.Textbox(label="Errors", lines=5)
            model_actions_output = gr.Textbox(label="Model Actions", lines=5)
            model_thoughts_output = gr.Textbox(label="Model Thoughts", lines=5)
            with gr.Row():
                recording_file = gr.Video(label="Recording File")  # Changed from gr.File to gr.Video
                trace_file = gr.File(label="Trace File (ZIP)")

            # Add a refresh button
            refresh_button = gr.Button("Refresh Files")

        def refresh_files(recording_dir):
            """Return the latest recording and trace file for the outputs."""
            # Look in the directory configured in the UI; an empty textbox
            # falls back to the default recording directory.
            recorded_files = get_latest_files(recording_dir or "./tmp/record_videos")
            return (
                recorded_files.get('.webm') or None,
                recorded_files.get('.zip') or None
            )

        refresh_button.click(
            fn=refresh_files,
            inputs=[save_recording_path],
            outputs=[recording_file, trace_file]
        )

        # The order of ``inputs`` defines the positional ``*args`` received
        # by run_with_stream; keep both in sync.
        run_button.click(
            fn=run_with_stream,
            inputs=[
                agent_type,
                llm_provider,
                llm_model_name,
                llm_temperature,
                llm_base_url,
                llm_api_key,
                use_own_browser,
                headless,
                disable_security,
                window_w,
                window_h,
                save_recording_path,
                task,
                add_infos,
                max_steps,
                use_vision
            ],
            outputs=[  # Change from dict to list
                browser_view,
                final_result_output,
                errors_output,
                model_actions_output,
                model_thoughts_output,
                recording_file,
                trace_file
            ],
            queue=True
        )

    demo.launch(server_name=ip, server_port=port, share=True)


if __name__ == "__main__":
    # For local development
    import argparse

    parser = argparse.ArgumentParser(description="Gradio UI for Browser Agent")
    parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to")
    parser.add_argument("--port", type=int, default=7788, help="Port to listen on")
    args = parser.parse_args()
    main(ip=args.ip, port=args.port)
else:
    # For Vercel deployment
    # No command-line arguments are parsed on import, so main() runs with
    # its default address and port.
    main()