Sanjeev23oct commited on
Commit
f1d5e1c
·
verified ·
1 Parent(s): bf33afc

Upload folder using huggingface_hub

Browse files
.dockerignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ data
2
+ tmp
.env.example ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ OPENAI_ENDPOINT=https://api.openai.com/v1
2
+ OPENAI_API_KEY=
3
+
4
+ ANTHROPIC_API_KEY=
5
+ ANTHROPIC_ENDPOINT=https://api.anthropic.com
6
+
7
+ GOOGLE_API_KEY=
8
+
9
+ AZURE_OPENAI_ENDPOINT=
10
+ AZURE_OPENAI_API_KEY=
11
+ AZURE_OPENAI_API_VERSION=2025-01-01-preview
12
+
13
+ DEEPSEEK_ENDPOINT=https://api.deepseek.com
14
+ DEEPSEEK_API_KEY=
15
+
16
+ MISTRAL_API_KEY=
17
+ MISTRAL_ENDPOINT=https://api.mistral.ai/v1
18
+
19
+ OLLAMA_ENDPOINT=http://localhost:11434
20
+
21
+ ALIBABA_ENDPOINT=https://dashscope.aliyuncs.com/compatible-mode/v1
22
+ ALIBABA_API_KEY=
23
+
24
+ MOONSHOT_ENDPOINT=https://api.moonshot.cn/v1
25
+ MOONSHOT_API_KEY=
26
+
27
+ UNBOUND_ENDPOINT=https://api.getunbound.ai
28
+ UNBOUND_API_KEY=
29
+
30
+ SiliconFLOW_ENDPOINT=https://api.siliconflow.cn/v1/
31
+ SiliconFLOW_API_KEY=
32
+
33
+ # Set to false to disable anonymized telemetry
34
+ ANONYMIZED_TELEMETRY=false
35
+
36
+ # LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info
37
+ BROWSER_USE_LOGGING_LEVEL=info
38
+
39
+ # Chrome settings
40
+ CHROME_PATH=
41
+ CHROME_USER_DATA=
42
+ CHROME_DEBUGGING_PORT=9222
43
+ CHROME_DEBUGGING_HOST=localhost
44
+ # Set to true to keep browser open between AI tasks
45
+ CHROME_PERSISTENT_SESSION=false
46
+ CHROME_CDP=
47
+ # Display settings
48
+ # Format: WIDTHxHEIGHTxDEPTH
49
+ RESOLUTION=1920x1080x24
50
+ # Width in pixels
51
+ RESOLUTION_WIDTH=1920
52
+ # Height in pixels
53
+ RESOLUTION_HEIGHT=1080
54
+
55
+ # VNC settings
56
+ VNC_PASSWORD=youvncpassword
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/examples/test.png filter=lfs diff=lfs merge=lfs -text
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Run Python script
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ build:
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - name: Checkout
14
+ uses: actions/checkout@v2
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v2
18
+ with:
19
+ python-version: '3.9'
20
+
21
+ - name: Install Gradio
22
+ run: python -m pip install gradio
23
+
24
+ - name: Log in to Hugging Face
25
+ run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
26
+
27
+ - name: Deploy to Spaces
28
+ run: gradio deploy
.gitignore ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+ test_env/
133
+ myenv
134
+
135
+
136
+ # Spyder project settings
137
+ .spyderproject
138
+ .spyproject
139
+
140
+ # Rope project settings
141
+ .ropeproject
142
+
143
+ # mkdocs documentation
144
+ /site
145
+
146
+ # mypy
147
+ .mypy_cache/
148
+ .dmypy.json
149
+ dmypy.json
150
+
151
+ # Pyre type checker
152
+ .pyre/
153
+
154
+ # pytype static type analyzer
155
+ .pytype/
156
+
157
+ # Cython debug symbols
158
+ cython_debug/
159
+
160
+ # PyCharm
161
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
162
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
163
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
164
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
165
+ .idea/
166
+ temp
167
+ tmp
168
+
169
+
170
+ .DS_Store
171
+
172
+ private_example.py
173
+ private_example
174
+
175
+ browser_cookies.json
176
+ cookies.json
177
+ AgentHistory.json
178
+ cv_04_24.pdf
179
+ AgentHistoryList.json
180
+ *.gif
181
+
182
+ # For Sharing (.pem files)
183
+ .gradio/
184
+
185
+ # For Docker
186
+ data/
187
+
188
+ # For Config Files (Current Settings)
189
+ .config.pkl
.vscode/settings.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "python.analysis.typeCheckingMode": "basic",
3
+ "[python]": {
4
+ "editor.defaultFormatter": "charliermarsh.ruff",
5
+ "editor.formatOnSave": true,
6
+ "editor.codeActionsOnSave": {
7
+ "source.fixAll.ruff": "explicit",
8
+ "source.organizeImports.ruff": "explicit"
9
+ }
10
+ }
11
+ }
Dockerfile ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Build stage
2
+ FROM python:3.11-slim as builder
3
+
4
+ WORKDIR /app
5
+ COPY requirements.txt .
6
+
7
+ # Install dependencies in a virtual environment
8
+ RUN python -m venv /opt/venv
9
+ ENV PATH="/opt/venv/bin:$PATH"
10
+ RUN pip install --no-cache-dir -r requirements.txt
11
+
12
+ # Runtime stage
13
+ FROM python:3.11-slim
14
+
15
+ # Install minimal runtime dependencies
16
+ RUN apt-get update && apt-get install -y --no-install-recommends \
17
+ libgomp1 \
18
+ && rm -rf /var/lib/apt/lists/*
19
+
20
+ WORKDIR /app
21
+
22
+ # Copy virtual environment from builder
23
+ COPY --from=builder /opt/venv /opt/venv
24
+ ENV PATH="/opt/venv/bin:$PATH"
25
+
26
+ # Copy application code
27
+ COPY . .
28
+
29
+ # Set environment variables
30
+ ENV PYTHONUNBUFFERED=1
31
+ ENV GRADIO_SERVER_NAME=0.0.0.0
32
+ ENV GRADIO_SERVER_PORT=80
33
+
34
+ # Health check
35
+ HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
36
+ CMD curl -f http://localhost:80 || exit 1
37
+
38
+ # Expose port
39
+ EXPOSE 80
40
+
41
+ # Run the application
42
+ CMD ["python", "webui.py"]
Dockerfile.arm64 ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ # Install system dependencies
4
+ RUN apt-get update && apt-get install -y \
5
+ wget \
6
+ gnupg \
7
+ curl \
8
+ unzip \
9
+ xvfb \
10
+ libgconf-2-4 \
11
+ libxss1 \
12
+ libnss3 \
13
+ libnspr4 \
14
+ libasound2 \
15
+ libatk1.0-0 \
16
+ libatk-bridge2.0-0 \
17
+ libcups2 \
18
+ libdbus-1-3 \
19
+ libdrm2 \
20
+ libgbm1 \
21
+ libgtk-3-0 \
22
+ libxcomposite1 \
23
+ libxdamage1 \
24
+ libxfixes3 \
25
+ libxrandr2 \
26
+ xdg-utils \
27
+ fonts-liberation \
28
+ dbus \
29
+ xauth \
30
+ xvfb \
31
+ x11vnc \
32
+ tigervnc-tools \
33
+ supervisor \
34
+ net-tools \
35
+ procps \
36
+ git \
37
+ python3-numpy \
38
+ fontconfig \
39
+ fonts-dejavu \
40
+ fonts-dejavu-core \
41
+ fonts-dejavu-extra \
42
+ && rm -rf /var/lib/apt/lists/*
43
+
44
+ # Install noVNC
45
+ RUN git clone https://github.com/novnc/noVNC.git /opt/novnc \
46
+ && git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify \
47
+ && ln -s /opt/novnc/vnc.html /opt/novnc/index.html
48
+
49
+ # Set platform explicitly for ARM64
50
+ ARG TARGETPLATFORM=linux/arm64
51
+
52
+ # Set up working directory
53
+ WORKDIR /app
54
+
55
+ # Copy requirements and install Python dependencies
56
+ COPY requirements.txt .
57
+ RUN pip install --no-cache-dir -r requirements.txt
58
+
59
+ # Install Playwright and browsers with system dependencies optimized for ARM64
60
+ ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
61
+ RUN PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 pip install playwright && \
62
+ playwright install --with-deps chromium
63
+
64
+ # Copy the application code
65
+ COPY . .
66
+
67
+ # Set environment variables
68
+ ENV PYTHONUNBUFFERED=1
69
+ ENV BROWSER_USE_LOGGING_LEVEL=info
70
+ ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome
71
+ ENV ANONYMIZED_TELEMETRY=false
72
+ ENV DISPLAY=:99
73
+ ENV RESOLUTION=1920x1080x24
74
+ ENV VNC_PASSWORD=vncpassword
75
+ ENV CHROME_PERSISTENT_SESSION=true
76
+ ENV RESOLUTION_WIDTH=1920
77
+ ENV RESOLUTION_HEIGHT=1080
78
+
79
+ # Set up supervisor configuration
80
+ RUN mkdir -p /var/log/supervisor
81
+ COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
82
+
83
+ EXPOSE 7788 6080 5901
84
+
85
+ CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
Dockerfile.railway ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Build stage
2
+ FROM python:3.11-slim as builder
3
+
4
+ WORKDIR /app
5
+ COPY requirements.txt .
6
+
7
+ # Install dependencies in a virtual environment
8
+ RUN python -m venv /opt/venv
9
+ ENV PATH="/opt/venv/bin:$PATH"
10
+ RUN pip install --no-cache-dir -r requirements.txt
11
+
12
+ # Runtime stage
13
+ FROM python:3.11-slim
14
+
15
+ # Install minimal runtime dependencies
16
+ RUN apt-get update && apt-get install -y --no-install-recommends \
17
+ libgomp1 \
18
+ && rm -rf /var/lib/apt/lists/*
19
+
20
+ WORKDIR /app
21
+
22
+ # Copy virtual environment from builder
23
+ COPY --from=builder /opt/venv /opt/venv
24
+ ENV PATH="/opt/venv/bin:$PATH"
25
+
26
+ # Copy application code
27
+ COPY . .
28
+
29
+ # Set environment variables
30
+ ENV PYTHONUNBUFFERED=1
31
+ ENV GRADIO_SERVER_NAME=0.0.0.0
32
+ ENV GRADIO_SERVER_PORT=80
33
+
34
+ # Health check
35
+ HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
36
+ CMD curl -f http://localhost:80 || exit 1
37
+
38
+ # Expose port
39
+ EXPOSE 80
40
+
41
+ # Run the application
42
+ CMD ["python", "webui.py"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Browser Use Inc.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,239 @@
1
- ---
2
- title: Browser Use Sg
3
- emoji: 🚀
4
- colorFrom: blue
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 5.25.2
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: browser-use-sg
3
+ app_file: webui.py
4
+ sdk: gradio
5
+ sdk_version: 5.23.1
6
+ ---
7
+ <img src="./assets/web-ui.png" alt="Browser Use Web UI" width="full"/>
8
+
9
+ <br/>
10
+
11
+ [![GitHub stars](https://img.shields.io/github/stars/browser-use/web-ui?style=social)](https://github.com/browser-use/web-ui/stargazers)
12
+ [![Discord](https://img.shields.io/discord/1303749220842340412?color=7289DA&label=Discord&logo=discord&logoColor=white)](https://link.browser-use.com/discord)
13
+ [![Documentation](https://img.shields.io/badge/Documentation-📕-blue)](https://docs.browser-use.com)
14
+ [![WarmShao](https://img.shields.io/twitter/follow/warmshao?style=social)](https://x.com/warmshao)
15
+
16
+ This project builds upon the foundation of the [browser-use](https://github.com/browser-use/browser-use), which is designed to make websites accessible for AI agents.
17
+
18
+ We would like to officially thank [WarmShao](https://github.com/warmshao) for his contribution to this project.
19
+
20
+ **WebUI:** is built on Gradio and supports most of `browser-use` functionalities. This UI is designed to be user-friendly and enables easy interaction with the browser agent.
21
+
22
+ **Expanded LLM Support:** We've integrated support for various Large Language Models (LLMs), including: Google, OpenAI, Azure OpenAI, Anthropic, DeepSeek, Ollama etc. And we plan to add support for even more models in the future.
23
+
24
+ **Custom Browser Support:** You can use your own browser with our tool, eliminating the need to re-login to sites or deal with other authentication challenges. This feature also supports high-definition screen recording.
25
+
26
+ **Persistent Browser Sessions:** You can choose to keep the browser window open between AI tasks, allowing you to see the complete history and state of AI interactions.
27
+
28
+ <video src="https://github.com/user-attachments/assets/56bc7080-f2e3-4367-af22-6bf2245ff6cb" controls="controls">Your browser does not support playing this video!</video>
29
+
30
+ ## Installation Guide
31
+
32
+ ### Prerequisites
33
+ - Python 3.11 or higher
34
+ - Git (for cloning the repository)
35
+
36
+ ### Option 1: Local Installation
37
+
38
+ Read the [quickstart guide](https://docs.browser-use.com/quickstart#prepare-the-environment) or follow the steps below to get started.
39
+
40
+ #### Step 1: Clone the Repository
41
+ ```bash
42
+ git clone https://github.com/browser-use/web-ui.git
43
+ cd web-ui
44
+ ```
45
+
46
+ #### Step 2: Set Up Python Environment
47
+ We recommend using [uv](https://docs.astral.sh/uv/) for managing the Python environment.
48
+
49
+ Using uv (recommended):
50
+ ```bash
51
+ uv venv --python 3.11
52
+ ```
53
+
54
+ Activate the virtual environment:
55
+ - Windows (Command Prompt):
56
+ ```cmd
57
+ .venv\Scripts\activate
58
+ ```
59
+ - Windows (PowerShell):
60
+ ```powershell
61
+ .\.venv\Scripts\Activate.ps1
62
+ ```
63
+ - macOS/Linux:
64
+ ```bash
65
+ source .venv/bin/activate
66
+ ```
67
+
68
+ #### Step 3: Install Dependencies
69
+ Install Python packages:
70
+ ```bash
71
+ uv pip install -r requirements.txt
72
+ ```
73
+
74
+ Install Browsers in Playwright:
75
+ You can install specific browsers by running:
76
+ ```bash
77
+ playwright install --with-deps chromium
78
+ ```
79
+
80
+ To install all browsers:
81
+ ```bash
82
+ playwright install
83
+ ```
84
+
85
+ #### Step 4: Configure Environment
86
+ 1. Create a copy of the example environment file:
87
+ - Windows (Command Prompt):
88
+ ```bash
89
+ copy .env.example .env
90
+ ```
91
+ - macOS/Linux/Windows (PowerShell):
92
+ ```bash
93
+ cp .env.example .env
94
+ ```
95
+ 2. Open `.env` in your preferred text editor and add your API keys and other settings
96
+
97
+ ### Option 2: Docker Installation
98
+
99
+ #### Prerequisites
100
+ - Docker and Docker Compose installed
101
+ - [Docker Desktop](https://www.docker.com/products/docker-desktop/) (For Windows/macOS)
102
+ - [Docker Engine](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/) (For Linux)
103
+
104
+ #### Installation Steps
105
+ 1. Clone the repository:
106
+ ```bash
107
+ git clone https://github.com/browser-use/web-ui.git
108
+ cd web-ui
109
+ ```
110
+
111
+ 2. Create and configure environment file:
112
+ - Windows (Command Prompt):
113
+ ```bash
114
+ copy .env.example .env
115
+ ```
116
+ - macOS/Linux/Windows (PowerShell):
117
+ ```bash
118
+ cp .env.example .env
119
+ ```
120
+ Edit `.env` with your preferred text editor and add your API keys
121
+
122
+ 3. Run with Docker:
123
+ ```bash
124
+ # Build and start the container with default settings (browser closes after AI tasks)
125
+ docker compose up --build
126
+ ```
127
+ ```bash
128
+ # Or run with persistent browser (browser stays open between AI tasks)
129
+ CHROME_PERSISTENT_SESSION=true docker compose up --build
130
+ ```
131
+
132
+
133
+ 4. Access the Application:
134
+ - Web Interface: Open `http://localhost:7788` in your browser
135
+ - VNC Viewer (for watching browser interactions): Open `http://localhost:6080/vnc.html`
136
+ - Default VNC password: "youvncpassword"
137
+ - Can be changed by setting `VNC_PASSWORD` in your `.env` file
138
+
139
+ ## Usage
140
+
141
+ ### Local Setup
142
+ 1. **Run the WebUI:**
143
+ After completing the installation steps above, start the application:
144
+ ```bash
145
+ python webui.py --ip 127.0.0.1 --port 7788
146
+ ```
147
+ 2. WebUI options:
148
+ - `--ip`: The IP address to bind the WebUI to. Default is `127.0.0.1`.
149
+ - `--port`: The port to bind the WebUI to. Default is `7788`.
150
+ - `--theme`: The theme for the user interface. Default is `Ocean`.
151
+ - **Default**: The standard theme with a balanced design.
152
+ - **Soft**: A gentle, muted color scheme for a relaxed viewing experience.
153
+ - **Monochrome**: A grayscale theme with minimal color for simplicity and focus.
154
+ - **Glass**: A sleek, semi-transparent design for a modern appearance.
155
+ - **Origin**: A classic, retro-inspired theme for a nostalgic feel.
156
+ - **Citrus**: A vibrant, citrus-inspired palette with bright and fresh colors.
157
+ - **Ocean** (default): A blue, ocean-inspired theme providing a calming effect.
158
+ - `--dark-mode`: Enables dark mode for the user interface.
159
+ 3. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
160
+ 4. **Using Your Own Browser(Optional):**
161
+ - Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. Leave `CHROME_USER_DATA` empty if you want to use local user data.
162
+ - Windows
163
+ ```env
164
+ CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe"
165
+ CHROME_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data"
166
+ ```
167
+ > Note: Replace `YourUsername` with your actual Windows username for Windows systems.
168
+ - Mac
169
+ ```env
170
+ CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
171
+ CHROME_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome"
172
+ ```
173
+ - Close all Chrome windows
174
+ - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
175
+ - Check the "Use Own Browser" option within the Browser Settings.
176
+ 5. **Keep Browser Open(Optional):**
177
+ - Set `CHROME_PERSISTENT_SESSION=true` in the `.env` file.
178
+
179
+ ### Docker Setup
180
+ 1. **Environment Variables:**
181
+ - All configuration is done through the `.env` file
182
+ - Available environment variables:
183
+ ```
184
+ # LLM API Keys
185
+ OPENAI_API_KEY=your_key_here
186
+ ANTHROPIC_API_KEY=your_key_here
187
+ GOOGLE_API_KEY=your_key_here
188
+
189
+ # Browser Settings
190
+ CHROME_PERSISTENT_SESSION=true # Set to true to keep browser open between AI tasks
191
+ RESOLUTION=1920x1080x24 # Custom resolution format: WIDTHxHEIGHTxDEPTH
192
+ RESOLUTION_WIDTH=1920 # Custom width in pixels
193
+ RESOLUTION_HEIGHT=1080 # Custom height in pixels
194
+
195
+ # VNC Settings
196
+ VNC_PASSWORD=your_vnc_password # Optional, defaults to "vncpassword"
197
+ ```
198
+
199
+ 2. **Platform Support:**
200
+ - Supports both AMD64 and ARM64 architectures
201
+ - For ARM64 systems (e.g., Apple Silicon Macs), the container will automatically use the appropriate image
202
+
203
+ 3. **Browser Persistence Modes:**
204
+ - **Default Mode (CHROME_PERSISTENT_SESSION=false):**
205
+ - Browser opens and closes with each AI task
206
+ - Clean state for each interaction
207
+ - Lower resource usage
208
+
209
+ - **Persistent Mode (CHROME_PERSISTENT_SESSION=true):**
210
+ - Browser stays open between AI tasks
211
+ - Maintains history and state
212
+ - Allows viewing previous AI interactions
213
+ - Set in `.env` file or via environment variable when starting container
214
+
215
+ 4. **Viewing Browser Interactions:**
216
+ - Access the noVNC viewer at `http://localhost:6080/vnc.html`
217
+ - Enter the VNC password (default: "vncpassword" or what you set in VNC_PASSWORD)
218
+ - Direct VNC access available on port 5900 (mapped to container port 5901)
219
+ - You can now see all browser interactions in real-time
220
+
221
+ 5. **Container Management:**
222
+ ```bash
223
+ # Start with persistent browser
224
+ CHROME_PERSISTENT_SESSION=true docker compose up -d
225
+
226
+ # Start with default mode (browser closes after tasks)
227
+ docker compose up -d
228
+
229
+ # View logs
230
+ docker compose logs -f
231
+
232
+ # Stop the container
233
+ docker compose down
234
+ ```
235
+
236
+ ## Changelog
237
+ - [x] **2025/01/26:** Thanks to @vvincent1234. Now browser-use-webui can combine with DeepSeek-r1 to engage in deep thinking!
238
+ - [x] **2025/01/10:** Thanks to @casistack. Now we have Docker Setup option and also Support keep browser open between tasks.[Video tutorial demo](https://github.com/browser-use/web-ui/issues/1#issuecomment-2582511750).
239
+ - [x] **2025/01/06:** Thanks to @richard-devbot. A New and Well-Designed WebUI is released. [Video tutorial demo](https://github.com/warmshao/browser-use-webui/issues/1#issuecomment-2573393113).
SECURITY.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Reporting Security Issues
2
+
3
+ If you believe you have found a security vulnerability in browser-use, please report it through coordinated disclosure.
4
+
5
+ **Please do not report security vulnerabilities through the repository issues, discussions, or pull requests.**
6
+
7
+ Instead, please open a new [Github security advisory](https://github.com/browser-use/web-ui/security/advisories/new).
8
+
9
+ Please include as much of the information listed below as you can to help me better understand and resolve the issue:
10
+
11
+ * The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting)
12
+ * Full paths of source file(s) related to the manifestation of the issue
13
+ * The location of the affected source code (tag/branch/commit or direct URL)
14
+ * Any special configuration required to reproduce the issue
15
+ * Step-by-step instructions to reproduce the issue
16
+ * Proof-of-concept or exploit code (if possible)
17
+ * Impact of the issue, including how an attacker might exploit the issue
18
+
19
+ This information will help me triage your report more quickly.
assets/examples/test.png ADDED

Git LFS Details

  • SHA256: 23e4fe8c9836cd35393315a3cca074dbd55a8645289ea337e3300269dda06900
  • Pointer size: 131 Bytes
  • Size of remote file: 423 kB
assets/web-ui.png ADDED
docker-compose.yml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ browser-use-webui:
3
+ platform: linux/amd64
4
+ build:
5
+ context: .
6
+ dockerfile: ${DOCKERFILE:-Dockerfile}
7
+ args:
8
+ TARGETPLATFORM: ${TARGETPLATFORM:-linux/amd64}
9
+ ports:
10
+ - "7788:7788" # Gradio default port
11
+ - "6080:6080" # noVNC web interface
12
+ - "5901:5901" # VNC port
13
+ - "9222:9222" # Chrome remote debugging port
14
+ environment:
15
+ - OPENAI_ENDPOINT=${OPENAI_ENDPOINT:-https://api.openai.com/v1}
16
+ - OPENAI_API_KEY=${OPENAI_API_KEY:-}
17
+ - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
18
+ - ANTHROPIC_ENDPOINT=${ANTHROPIC_ENDPOINT:-https://api.anthropic.com}
19
+ - GOOGLE_API_KEY=${GOOGLE_API_KEY:-}
20
+ - AZURE_OPENAI_ENDPOINT=${AZURE_OPENAI_ENDPOINT:-}
21
+ - AZURE_OPENAI_API_KEY=${AZURE_OPENAI_API_KEY:-}
22
+ - DEEPSEEK_ENDPOINT=${DEEPSEEK_ENDPOINT:-https://api.deepseek.com}
23
+ - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
24
+ - OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-http://localhost:11434}
25
+ - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
26
+ - MISTRAL_ENDPOINT=${MISTRAL_ENDPOINT:-https://api.mistral.ai/v1}
27
+ - ALIBABA_ENDPOINT=${ALIBABA_ENDPOINT:-https://dashscope.aliyuncs.com/compatible-mode/v1}
28
+ - ALIBABA_API_KEY=${ALIBABA_API_KEY:-}
29
+ - MOONSHOT_ENDPOINT=${MOONSHOT_ENDPOINT:-https://api.moonshot.cn/v1}
30
+ - MOONSHOT_API_KEY=${MOONSHOT_API_KEY:-}
31
+ - BROWSER_USE_LOGGING_LEVEL=${BROWSER_USE_LOGGING_LEVEL:-info}
32
+ - ANONYMIZED_TELEMETRY=${ANONYMIZED_TELEMETRY:-false}
33
+ - CHROME_PATH=/usr/bin/google-chrome
34
+ - CHROME_USER_DATA=/app/data/chrome_data
35
+ - CHROME_PERSISTENT_SESSION=${CHROME_PERSISTENT_SESSION:-false}
36
+ - CHROME_CDP=${CHROME_CDP:-http://localhost:9222}
37
+ - DISPLAY=:99
38
+ - PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
39
+ - RESOLUTION=${RESOLUTION:-1920x1080x24}
40
+ - RESOLUTION_WIDTH=${RESOLUTION_WIDTH:-1920}
41
+ - RESOLUTION_HEIGHT=${RESOLUTION_HEIGHT:-1080}
42
+ - VNC_PASSWORD=${VNC_PASSWORD:-vncpassword}
43
+ - CHROME_DEBUGGING_PORT=9222
44
+ - CHROME_DEBUGGING_HOST=localhost
45
+ volumes:
46
+ - /tmp/.X11-unix:/tmp/.X11-unix
47
+ restart: unless-stopped
48
+ shm_size: '2gb'
49
+ cap_add:
50
+ - SYS_ADMIN
51
+ security_opt:
52
+ - seccomp=unconfined
53
+ tmpfs:
54
+ - /tmp
55
+ healthcheck:
56
+ test: ["CMD", "nc", "-z", "localhost", "5901"]
57
+ interval: 10s
58
+ timeout: 5s
59
+ retries: 3
entrypoint.sh ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Start supervisord in the foreground to properly manage child processes
4
+ exec /usr/bin/supervisord -n -c /etc/supervisor/conf.d/supervisord.conf
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ browser-use==0.1.40
2
+ pyperclip==1.9.0
3
+ gradio==5.23.1
4
+ json-repair
5
+ langchain-mistralai==0.2.4
6
+ langchain-google-genai==2.0.8
7
+ MainContentExtractor==0.0.4
src/__init__.py ADDED
File without changes
src/agent/__init__.py ADDED
File without changes
src/agent/custom_agent.py ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import pdb
4
+ import traceback
5
+ from typing import Any, Awaitable, Callable, Dict, Generic, List, Optional, Type, TypeVar
6
+ from PIL import Image, ImageDraw, ImageFont
7
+ import os
8
+ import base64
9
+ import io
10
+ import asyncio
11
+ import time
12
+ import platform
13
+ from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
14
+ from browser_use.agent.service import Agent
15
+ from browser_use.agent.message_manager.utils import convert_input_messages, extract_json_from_model_output, \
16
+ save_conversation
17
+ from browser_use.agent.views import (
18
+ ActionResult,
19
+ AgentError,
20
+ AgentHistory,
21
+ AgentHistoryList,
22
+ AgentOutput,
23
+ AgentSettings,
24
+ AgentState,
25
+ AgentStepInfo,
26
+ StepMetadata,
27
+ ToolCallingMethod,
28
+ )
29
+ from browser_use.agent.gif import create_history_gif
30
+ from browser_use.browser.browser import Browser
31
+ from browser_use.browser.context import BrowserContext
32
+ from browser_use.browser.views import BrowserStateHistory
33
+ from browser_use.controller.service import Controller
34
+ from browser_use.telemetry.views import (
35
+ AgentEndTelemetryEvent,
36
+ AgentRunTelemetryEvent,
37
+ AgentStepTelemetryEvent,
38
+ )
39
+ from browser_use.utils import time_execution_async
40
+ from langchain_core.language_models.chat_models import BaseChatModel
41
+ from langchain_core.messages import (
42
+ BaseMessage,
43
+ HumanMessage,
44
+ AIMessage
45
+ )
46
+ from browser_use.browser.views import BrowserState, BrowserStateHistory
47
+ from browser_use.agent.prompts import PlannerPrompt
48
+
49
+ from json_repair import repair_json
50
+ from src.utils.agent_state import AgentState
51
+
52
+ from .custom_message_manager import CustomMessageManager, CustomMessageManagerSettings
53
+ from .custom_views import CustomAgentOutput, CustomAgentStepInfo, CustomAgentState
54
+
55
+ logger = logging.getLogger(__name__)
56
+
57
+ Context = TypeVar('Context')
58
+
59
+
60
class CustomAgent(Agent):
    """Browser-automation agent extending browser_use's ``Agent``.

    Adds user-supplied hints (``add_infos``), a custom message manager
    (``CustomMessageManager``) and a custom structured output schema
    (``CustomAgentOutput``).
    """

    def __init__(
        self,
        task: str,
        llm: BaseChatModel,
        add_infos: str = "",
        # Optional parameters
        browser: Browser | None = None,
        browser_context: BrowserContext | None = None,
        # BUGFIX: default is None instead of Controller() created in the
        # signature -- default values are evaluated once at definition time,
        # so the original shared a single Controller instance across every
        # CustomAgent ever constructed.
        controller: Optional[Controller[Context]] = None,
        # Initial agent run parameters
        sensitive_data: Optional[Dict[str, str]] = None,
        initial_actions: Optional[List[Dict[str, Dict[str, Any]]]] = None,
        # Cloud Callbacks
        register_new_step_callback: Callable[['BrowserState', 'AgentOutput', int], Awaitable[None]] | None = None,
        register_done_callback: Callable[['AgentHistoryList'], Awaitable[None]] | None = None,
        register_external_agent_status_raise_error_callback: Callable[[], Awaitable[bool]] | None = None,
        # Agent settings
        use_vision: bool = True,
        use_vision_for_planner: bool = False,
        save_conversation_path: Optional[str] = None,
        save_conversation_path_encoding: Optional[str] = 'utf-8',
        max_failures: int = 3,
        retry_delay: int = 10,
        system_prompt_class: Type[SystemPrompt] = SystemPrompt,
        agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt,
        max_input_tokens: int = 128000,
        validate_output: bool = False,
        message_context: Optional[str] = None,
        generate_gif: bool | str = False,
        available_file_paths: Optional[list[str]] = None,
        # BUGFIX: None sentinel instead of a shared mutable list default.
        include_attributes: Optional[list[str]] = None,
        max_actions_per_step: int = 10,
        tool_calling_method: Optional[ToolCallingMethod] = 'auto',
        page_extraction_llm: Optional[BaseChatModel] = None,
        planner_llm: Optional[BaseChatModel] = None,
        planner_interval: int = 1,  # Run planner every N steps
        # Inject state
        injected_agent_state: Optional[AgentState] = None,
        context: Context | None = None,
    ):
        # Create per-instance defaults (see BUGFIX notes in the signature).
        if controller is None:
            controller = Controller()
        if include_attributes is None:
            include_attributes = [
                'title',
                'type',
                'name',
                'role',
                'aria-label',
                'placeholder',
                'value',
                'alt',
                'aria-expanded',
                'data-date-format',
            ]
        super().__init__(
            task=task,
            llm=llm,
            browser=browser,
            browser_context=browser_context,
            controller=controller,
            sensitive_data=sensitive_data,
            initial_actions=initial_actions,
            register_new_step_callback=register_new_step_callback,
            register_done_callback=register_done_callback,
            register_external_agent_status_raise_error_callback=register_external_agent_status_raise_error_callback,
            use_vision=use_vision,
            use_vision_for_planner=use_vision_for_planner,
            save_conversation_path=save_conversation_path,
            save_conversation_path_encoding=save_conversation_path_encoding,
            max_failures=max_failures,
            retry_delay=retry_delay,
            system_prompt_class=system_prompt_class,
            max_input_tokens=max_input_tokens,
            validate_output=validate_output,
            message_context=message_context,
            generate_gif=generate_gif,
            available_file_paths=available_file_paths,
            include_attributes=include_attributes,
            max_actions_per_step=max_actions_per_step,
            tool_calling_method=tool_calling_method,
            page_extraction_llm=page_extraction_llm,
            planner_llm=planner_llm,
            planner_interval=planner_interval,
            injected_agent_state=injected_agent_state,
            context=context,
        )
        # NOTE(review): `injected_agent_state` is annotated with
        # src.utils.agent_state.AgentState (which shadows the browser_use
        # AgentState import), while the fallback is CustomAgentState -- the
        # two are assumed compatible here; confirm against callers.
        self.state = injected_agent_state or CustomAgentState()
        self.add_infos = add_infos
        # Replace the base message manager so state messages are rendered by
        # the configurable `agent_prompt_class`.
        self._message_manager = CustomMessageManager(
            task=task,
            system_message=self.settings.system_prompt_class(
                self.available_actions,
                max_actions_per_step=self.settings.max_actions_per_step,
            ).get_system_message(),
            settings=CustomMessageManagerSettings(
                max_input_tokens=self.settings.max_input_tokens,
                include_attributes=self.settings.include_attributes,
                message_context=self.settings.message_context,
                sensitive_data=sensitive_data,
                available_file_paths=self.settings.available_file_paths,
                agent_prompt_class=agent_prompt_class
            ),
            state=self.state.message_manager_state,
        )
162
+
163
+ def _log_response(self, response: CustomAgentOutput) -> None:
164
+ """Log the model's response"""
165
+ if "Success" in response.current_state.evaluation_previous_goal:
166
+ emoji = "✅"
167
+ elif "Failed" in response.current_state.evaluation_previous_goal:
168
+ emoji = "❌"
169
+ else:
170
+ emoji = "🤷"
171
+
172
+ logger.info(f"{emoji} Eval: {response.current_state.evaluation_previous_goal}")
173
+ logger.info(f"🧠 New Memory: {response.current_state.important_contents}")
174
+ logger.info(f"🤔 Thought: {response.current_state.thought}")
175
+ logger.info(f"🎯 Next Goal: {response.current_state.next_goal}")
176
+ for i, action in enumerate(response.action):
177
+ logger.info(
178
+ f"🛠️ Action {i + 1}/{len(response.action)}: {action.model_dump_json(exclude_unset=True)}"
179
+ )
180
+
181
+ def _setup_action_models(self) -> None:
182
+ """Setup dynamic action models from controller's registry"""
183
+ # Get the dynamic action model from controller's registry
184
+ self.ActionModel = self.controller.registry.create_action_model()
185
+ # Create output model with the dynamic actions
186
+ self.AgentOutput = CustomAgentOutput.type_with_custom_actions(self.ActionModel)
187
+
188
+ def update_step_info(
189
+ self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None
190
+ ):
191
+ """
192
+ update step info
193
+ """
194
+ if step_info is None:
195
+ return
196
+
197
+ step_info.step_number += 1
198
+ important_contents = model_output.current_state.important_contents
199
+ if (
200
+ important_contents
201
+ and "None" not in important_contents
202
+ and important_contents not in step_info.memory
203
+ ):
204
+ step_info.memory += important_contents + "\n"
205
+
206
+ logger.info(f"🧠 All Memory: \n{step_info.memory}")
207
+
208
+ @time_execution_async("--get_next_action")
209
+ async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
210
+ """Get next action from LLM based on current state"""
211
+ fixed_input_messages = self._convert_input_messages(input_messages)
212
+ ai_message = self.llm.invoke(fixed_input_messages)
213
+ self.message_manager._add_message_with_tokens(ai_message)
214
+
215
+ if hasattr(ai_message, "reasoning_content"):
216
+ logger.info("🤯 Start Deep Thinking: ")
217
+ logger.info(ai_message.reasoning_content)
218
+ logger.info("🤯 End Deep Thinking")
219
+
220
+ if isinstance(ai_message.content, list):
221
+ ai_content = ai_message.content[0]
222
+ else:
223
+ ai_content = ai_message.content
224
+
225
+ try:
226
+ ai_content = ai_content.replace("```json", "").replace("```", "")
227
+ ai_content = repair_json(ai_content)
228
+ parsed_json = json.loads(ai_content)
229
+ parsed: AgentOutput = self.AgentOutput(**parsed_json)
230
+ except Exception as e:
231
+ import traceback
232
+ traceback.print_exc()
233
+ logger.debug(ai_message.content)
234
+ raise ValueError('Could not parse response.')
235
+
236
+ if parsed is None:
237
+ logger.debug(ai_message.content)
238
+ raise ValueError('Could not parse response.')
239
+
240
+ # cut the number of actions to max_actions_per_step if needed
241
+ if len(parsed.action) > self.settings.max_actions_per_step:
242
+ parsed.action = parsed.action[: self.settings.max_actions_per_step]
243
+ self._log_response(parsed)
244
+ return parsed
245
+
246
    async def _run_planner(self) -> Optional[str]:
        """Run the planner LLM to analyze state and suggest next steps.

        Returns the raw plan string (also appended in place to the last
        human/state message in the message history), or None when no
        planner LLM is configured.
        """
        # Skip planning if no planner_llm is set
        if not self.settings.planner_llm:
            return None

        # Create planner message history using full message history
        planner_messages = [
            PlannerPrompt(self.controller.registry.get_prompt_description()).get_system_message(),
            *self.message_manager.get_messages()[1:],  # Use full message history except the first
        ]

        # When the main agent uses vision but the planner should not, strip
        # the screenshot from the last state message before sending it.
        if not self.settings.use_vision_for_planner and self.settings.use_vision:
            last_state_message: HumanMessage = planner_messages[-1]
            # remove image from last state message
            new_msg = ''
            if isinstance(last_state_message.content, list):
                for msg in last_state_message.content:
                    if msg['type'] == 'text':
                        new_msg += msg['text']
                    elif msg['type'] == 'image_url':
                        continue
            else:
                new_msg = last_state_message.content

            planner_messages[-1] = HumanMessage(content=new_msg)

        # Get planner output
        response = await self.settings.planner_llm.ainvoke(planner_messages)
        plan = str(response.content)
        # Append the plan in place to the last state message in the REAL
        # message history (not the planner copy), so the main LLM sees it.
        last_state_message = self.message_manager.get_messages()[-1]
        if isinstance(last_state_message, HumanMessage):
            # remove image from last state message
            if isinstance(last_state_message.content, list):
                for msg in last_state_message.content:
                    if msg['type'] == 'text':
                        msg['text'] += f"\nPlanning Agent outputs plans:\n {plan}\n"
            else:
                last_state_message.content += f"\nPlanning Agent outputs plans:\n {plan}\n "

        # Best-effort pretty-printing of the plan; fall back to the raw
        # string if it is not valid JSON.
        try:
            plan_json = json.loads(plan.replace("```json", "").replace("```", ""))
            logger.info(f'📋 Plans:\n{json.dumps(plan_json, indent=4)}')

            if hasattr(response, "reasoning_content"):
                logger.info("🤯 Start Planning Deep Thinking: ")
                logger.info(response.reasoning_content)
                logger.info("🤯 End Planning Deep Thinking")

        except json.JSONDecodeError:
            logger.info(f'📋 Plans:\n{plan}')
        except Exception as e:
            logger.debug(f'Error parsing planning analysis: {e}')
            logger.info(f'📋 Plans: {plan}')
        return plan
301
+
302
    @time_execution_async("--step")
    async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
        """Execute one step of the task.

        Fetches browser state, asks the LLM for the next actions, executes
        them, and records results, telemetry and a history item. Errors are
        routed through ``_handle_step_error``; an InterruptedError (pause)
        records a marker result and returns early.
        """
        logger.info(f"\n📍 Step {self.state.n_steps}")
        state = None
        model_output = None
        result: list[ActionResult] = []
        step_start_time = time.time()
        tokens = 0

        try:
            state = await self.browser_context.get_state()
            await self._raise_if_stopped_or_paused()

            # Render current browser state (plus last actions/results) into
            # the message history for the LLM.
            self.message_manager.add_state_message(state, self.state.last_action, self.state.last_result, step_info,
                                                   self.settings.use_vision)

            # Run planner at specified intervals if planner is configured
            if self.settings.planner_llm and self.state.n_steps % self.settings.planner_interval == 0:
                await self._run_planner()
            input_messages = self.message_manager.get_messages()
            tokens = self._message_manager.state.history.current_tokens

            try:
                model_output = await self.get_next_action(input_messages)
                self.update_step_info(model_output, step_info)
                self.state.n_steps += 1

                if self.register_new_step_callback:
                    await self.register_new_step_callback(state, model_output, self.state.n_steps)

                if self.settings.save_conversation_path:
                    target = self.settings.save_conversation_path + f'_{self.state.n_steps}.txt'
                    save_conversation(input_messages, model_output, target,
                                      self.settings.save_conversation_path_encoding)

                # NOTE(review): deepseek-reasoner apparently needs the state
                # message kept in history; all other models have it removed
                # after each step to save tokens -- confirm this assumption.
                if self.model_name != "deepseek-reasoner":
                    # remove prev message
                    self.message_manager._remove_state_message_by_index(-1)
                await self._raise_if_stopped_or_paused()
            except Exception as e:
                # model call failed, remove last state message from history
                self.message_manager._remove_state_message_by_index(-1)
                raise e

            result: list[ActionResult] = await self.multi_act(model_output.action)
            for ret_ in result:
                if ret_.extracted_content and "Extracted page" in ret_.extracted_content:
                    # record every extracted page; the first 100 chars are
                    # used as a cheap duplicate check
                    if ret_.extracted_content[:100] not in self.state.extracted_content:
                        self.state.extracted_content += ret_.extracted_content
            self.state.last_result = result
            self.state.last_action = model_output.action
            if len(result) > 0 and result[-1].is_done:
                # On completion, fall back to accumulated step memory when no
                # content was extracted, and surface it on the final result.
                if not self.state.extracted_content:
                    self.state.extracted_content = step_info.memory
                result[-1].extracted_content = self.state.extracted_content
                logger.info(f"📄 Result: {result[-1].extracted_content}")

            self.state.consecutive_failures = 0

        except InterruptedError:
            logger.debug('Agent paused')
            self.state.last_result = [
                ActionResult(
                    error='The agent was paused - now continuing actions might need to be repeated',
                    include_in_memory=True
                )
            ]
            return

        except Exception as e:
            result = await self._handle_step_error(e)
            self.state.last_result = result

        finally:
            # Telemetry is emitted whether the step succeeded or failed.
            step_end_time = time.time()
            actions = [a.model_dump(exclude_unset=True) for a in model_output.action] if model_output else []
            self.telemetry.capture(
                AgentStepTelemetryEvent(
                    agent_id=self.state.agent_id,
                    step=self.state.n_steps,
                    actions=actions,
                    consecutive_failures=self.state.consecutive_failures,
                    step_error=[r.error for r in result if r.error] if result else ['No result'],
                )
            )
            if not result:
                return

            # Only record a history item when we actually captured state.
            if state:
                metadata = StepMetadata(
                    step_number=self.state.n_steps,
                    step_start_time=step_start_time,
                    step_end_time=step_end_time,
                    input_tokens=tokens,
                )
                self._make_history_item(model_output, state, result, metadata)
400
+
401
+ async def run(self, max_steps: int = 100) -> AgentHistoryList:
402
+ """Execute the task with maximum number of steps"""
403
+ try:
404
+ self._log_agent_run()
405
+
406
+ # Execute initial actions if provided
407
+ if self.initial_actions:
408
+ result = await self.multi_act(self.initial_actions, check_for_new_elements=False)
409
+ self.state.last_result = result
410
+
411
+ step_info = CustomAgentStepInfo(
412
+ task=self.task,
413
+ add_infos=self.add_infos,
414
+ step_number=1,
415
+ max_steps=max_steps,
416
+ memory="",
417
+ )
418
+
419
+ for step in range(max_steps):
420
+ # Check if we should stop due to too many failures
421
+ if self.state.consecutive_failures >= self.settings.max_failures:
422
+ logger.error(f'❌ Stopping due to {self.settings.max_failures} consecutive failures')
423
+ break
424
+
425
+ # Check control flags before each step
426
+ if self.state.stopped:
427
+ logger.info('Agent stopped')
428
+ break
429
+
430
+ while self.state.paused:
431
+ await asyncio.sleep(0.2) # Small delay to prevent CPU spinning
432
+ if self.state.stopped: # Allow stopping while paused
433
+ break
434
+
435
+ await self.step(step_info)
436
+
437
+ if self.state.history.is_done():
438
+ if self.settings.validate_output and step < max_steps - 1:
439
+ if not await self._validate_output():
440
+ continue
441
+
442
+ await self.log_completion()
443
+ break
444
+ else:
445
+ logger.info("❌ Failed to complete task in maximum steps")
446
+ if not self.state.extracted_content:
447
+ self.state.history.history[-1].result[-1].extracted_content = step_info.memory
448
+ else:
449
+ self.state.history.history[-1].result[-1].extracted_content = self.state.extracted_content
450
+
451
+ return self.state.history
452
+
453
+ finally:
454
+ self.telemetry.capture(
455
+ AgentEndTelemetryEvent(
456
+ agent_id=self.state.agent_id,
457
+ is_done=self.state.history.is_done(),
458
+ success=self.state.history.is_successful(),
459
+ steps=self.state.n_steps,
460
+ max_steps_reached=self.state.n_steps >= max_steps,
461
+ errors=self.state.history.errors(),
462
+ total_input_tokens=self.state.history.total_input_tokens(),
463
+ total_duration_seconds=self.state.history.total_duration_seconds(),
464
+ )
465
+ )
466
+
467
+ if not self.injected_browser_context:
468
+ await self.browser_context.close()
469
+
470
+ if not self.injected_browser and self.browser:
471
+ await self.browser.close()
472
+
473
+ if self.settings.generate_gif:
474
+ output_path: str = 'agent_history.gif'
475
+ if isinstance(self.settings.generate_gif, str):
476
+ output_path = self.settings.generate_gif
477
+
478
+ create_history_gif(task=self.task, history=self.state.history, output_path=output_path)
src/agent/custom_message_manager.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import pdb
5
+ from typing import List, Optional, Type, Dict
6
+
7
+ from browser_use.agent.message_manager.service import MessageManager
8
+ from browser_use.agent.message_manager.views import MessageHistory
9
+ from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
10
+ from browser_use.agent.views import ActionResult, AgentStepInfo, ActionModel
11
+ from browser_use.browser.views import BrowserState
12
+ from browser_use.agent.message_manager.service import MessageManagerSettings
13
+ from browser_use.agent.views import ActionResult, AgentOutput, AgentStepInfo, MessageManagerState
14
+ from langchain_core.language_models import BaseChatModel
15
+ from langchain_anthropic import ChatAnthropic
16
+ from langchain_core.language_models import BaseChatModel
17
+ from langchain_core.messages import (
18
+ AIMessage,
19
+ BaseMessage,
20
+ HumanMessage,
21
+ ToolMessage,
22
+ SystemMessage
23
+ )
24
+ from langchain_openai import ChatOpenAI
25
+ from ..utils.llm import DeepSeekR1ChatOpenAI
26
+ from .custom_prompts import CustomAgentMessagePrompt
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
class CustomMessageManagerSettings(MessageManagerSettings):
    # Prompt class used to render browser state into the per-step human
    # message; override to customize how state/results are presented.
    agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt
33
+
34
+
35
class CustomMessageManager(MessageManager):
    """Message manager that renders browser state with a configurable prompt
    class and supports token-budget trimming and targeted removal of state
    messages.
    """

    def __init__(
        self,
        task: str,
        system_message: SystemMessage,
        settings: Optional[MessageManagerSettings] = None,
        state: Optional[MessageManagerState] = None,
    ):
        # BUGFIX: defaults are created per instance here instead of in the
        # signature -- default argument values are evaluated once at function
        # definition time, so `state=MessageManagerState()` would silently
        # share one history object across every manager instance.
        super().__init__(
            task=task,
            system_message=system_message,
            settings=settings if settings is not None else MessageManagerSettings(),
            state=state if state is not None else MessageManagerState(),
        )

    def _init_messages(self) -> None:
        """Initialize the message history with system message, context, task, and other initial messages"""
        self._add_message_with_tokens(self.system_prompt)
        # Aggregated context (task context, sensitive-data placeholders,
        # available file paths); stays "" when none of these apply.
        self.context_content = ""

        if self.settings.message_context:
            self.context_content += 'Context for the task' + self.settings.message_context

        if self.settings.sensitive_data:
            info = f'Here are placeholders for sensitive data: {list(self.settings.sensitive_data.keys())}'
            info += 'To use them, write <secret>the placeholder name</secret>'
            self.context_content += info

        if self.settings.available_file_paths:
            filepaths_msg = f'Here are file paths you can use: {self.settings.available_file_paths}'
            self.context_content += filepaths_msg

        if self.context_content:
            context_message = HumanMessage(content=self.context_content)
            self._add_message_with_tokens(context_message)

    def cut_messages(self):
        """Trim the oldest non-protected messages until the history fits the
        configured token budget."""
        diff = self.state.history.current_tokens - self.settings.max_input_tokens
        # Protect the system prompt (index 0) and, when one was added, the
        # context message (index 1). BUGFIX: the original tested
        # `is not None`, but context_content is always a string ("" when no
        # context message exists), so the first state message was wrongly
        # protected; test truthiness instead.
        min_message_len = 2 if self.context_content else 1

        while diff > 0 and len(self.state.history.messages) > min_message_len:
            msg = self.state.history.messages.pop(min_message_len)
            self.state.history.current_tokens -= msg.metadata.tokens
            diff = self.state.history.current_tokens - self.settings.max_input_tokens

    def add_state_message(
        self,
        state: BrowserState,
        actions: Optional[List[ActionModel]] = None,
        result: Optional[List[ActionResult]] = None,
        step_info: Optional[AgentStepInfo] = None,
        use_vision=True,
    ) -> None:
        """Add browser state as human message"""
        # Render the state (and optionally previous actions/results) via the
        # configured prompt class, then append it to the history.
        state_message = self.settings.agent_prompt_class(
            state,
            actions,
            result,
            include_attributes=self.settings.include_attributes,
            step_info=step_info,
        ).get_user_message(use_vision)
        self._add_message_with_tokens(state_message)

    def _remove_state_message_by_index(self, remove_ind=-1) -> None:
        """Remove the Nth-from-last HumanMessage (state message) from history.

        ``remove_ind=-1`` removes the most recent human message, ``-2`` the
        one before it, etc. No-op if fewer human messages exist.
        """
        i = len(self.state.history.messages) - 1
        remove_cnt = 0
        while i >= 0:
            if isinstance(self.state.history.messages[i].message, HumanMessage):
                remove_cnt += 1
            if remove_cnt == abs(remove_ind):
                msg = self.state.history.messages.pop(i)
                self.state.history.current_tokens -= msg.metadata.tokens
                break
            i -= 1
src/agent/custom_prompts.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdb
2
+ from typing import List, Optional
3
+
4
+ from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
5
+ from browser_use.agent.views import ActionResult, ActionModel
6
+ from browser_use.browser.views import BrowserState
7
+ from langchain_core.messages import HumanMessage, SystemMessage
8
+ from datetime import datetime
9
+ import importlib
10
+
11
+ from .custom_views import CustomAgentStepInfo
12
+
13
+
14
class CustomSystemPrompt(SystemPrompt):
    def _load_prompt_template(self) -> None:
        """Load the prompt template from the markdown file.

        Raises:
            RuntimeError: if the template cannot be located or read.
        """
        try:
            # BUGFIX: import the submodule explicitly -- the file-level
            # `import importlib` does not guarantee that the
            # `importlib.resources` attribute exists.
            from importlib import resources

            # This works both in development and when installed as a package
            with resources.files('src.agent').joinpath('custom_system_prompt.md').open('r', encoding='utf-8') as f:
                self.prompt_template = f.read()
        except Exception as e:
            raise RuntimeError(f'Failed to load system prompt template: {e}') from e

    def get_system_message(self) -> SystemMessage:
        """
        Get the system prompt for the agent.

        Returns:
            SystemMessage: Formatted system prompt
        """
        # The template carries {max_actions} and {available_actions}
        # placeholders (all literal braces are doubled in the markdown).
        prompt = self.prompt_template.format(max_actions=self.max_actions_per_step,
                                             available_actions=self.default_action_description)

        return SystemMessage(content=prompt)
35
+
36
+
37
class CustomAgentMessagePrompt(AgentMessagePrompt):
    # Builds the per-step human message: current page state, optional
    # screenshot, and a recap of the previous step's actions and results.

    def __init__(
            self,
            state: BrowserState,
            actions: Optional[List[ActionModel]] = None,
            result: Optional[List[ActionResult]] = None,
            include_attributes: list[str] = [],
            step_info: Optional[CustomAgentStepInfo] = None,
    ):
        super(CustomAgentMessagePrompt, self).__init__(state=state,
                                                       result=result,
                                                       include_attributes=include_attributes,
                                                       step_info=step_info
                                                       )
        # Actions executed in the previous step; paired index-wise with
        # `result` when rendering the "Previous Actions" section.
        self.actions = actions

    def get_user_message(self, use_vision: bool = True) -> HumanMessage:
        """Render the browser state (and, with vision enabled, the page
        screenshot) as a HumanMessage for the LLM.

        NOTE(review): the template below reads self.step_info.task (etc.)
        unconditionally, so despite the Optional annotation a None step_info
        would raise AttributeError -- confirm callers always pass one.
        """
        if self.step_info:
            step_info_description = f'Current step: {self.step_info.step_number}/{self.step_info.max_steps}\n'
        else:
            step_info_description = ''

        time_str = datetime.now().strftime("%Y-%m-%d %H:%M")
        step_info_description += f"Current date and time: {time_str}"

        elements_text = self.state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes)

        # pixels_above/below may be None; treat missing as 0.
        has_content_above = (self.state.pixels_above or 0) > 0
        has_content_below = (self.state.pixels_below or 0) > 0

        # Wrap the element list with scroll markers so the model knows there
        # is content outside the current viewport.
        if elements_text != '':
            if has_content_above:
                elements_text = (
                    f'... {self.state.pixels_above} pixels above - scroll or extract content to see more ...\n{elements_text}'
                )
            else:
                elements_text = f'[Start of page]\n{elements_text}'
            if has_content_below:
                elements_text = (
                    f'{elements_text}\n... {self.state.pixels_below} pixels below - scroll or extract content to see more ...'
                )
            else:
                elements_text = f'{elements_text}\n[End of page]'
        else:
            elements_text = 'empty page'

        state_description = f"""
{step_info_description}
1. Task: {self.step_info.task}.
2. Hints(Optional):
{self.step_info.add_infos}
3. Memory:
{self.step_info.memory}
4. Current url: {self.state.url}
5. Available tabs:
{self.state.tabs}
6. Interactive elements:
{elements_text}
        """

        if self.actions and self.result:
            state_description += "\n **Previous Actions** \n"
            state_description += f'Previous step: {self.step_info.step_number - 1}/{self.step_info.max_steps} \n'
            for i, result in enumerate(self.result):
                action = self.actions[i]
                state_description += f"Previous action {i + 1}/{len(self.result)}: {action.model_dump_json(exclude_unset=True)}\n"
                if result.error:
                    # keep only the final line of the error text (despite the
                    # upstream comment about "last 300 characters")
                    error = result.error.split('\n')[-1]
                    state_description += (
                        f"Error of previous action {i + 1}/{len(self.result)}: ...{error}\n"
                    )
                if result.include_in_memory:
                    if result.extracted_content:
                        state_description += f"Result of previous action {i + 1}/{len(self.result)}: {result.extracted_content}\n"

        if self.state.screenshot and use_vision == True:
            # Format message for vision model
            return HumanMessage(
                content=[
                    {'type': 'text', 'text': state_description},
                    {
                        'type': 'image_url',
                        'image_url': {'url': f'data:image/png;base64,{self.state.screenshot}'},
                    },
                ]
            )

        return HumanMessage(content=state_description)
src/agent/custom_system_prompt.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are an AI agent designed to automate browser tasks. Your goal is to accomplish the ultimate task following the rules.
2
+
3
+ # Input Format
4
+ Task
5
+ Previous steps
6
+ Current URL
7
+ Open Tabs
8
+ Interactive Elements
9
+ [index]<type>text</type>
10
+ - index: Numeric identifier for interaction
11
+ - type: HTML element type (button, input, etc.)
12
+ - text: Element description
13
+ Example:
14
+ [33]<button>Submit Form</button>
15
+
16
+ - Only elements with numeric indexes in [] are interactive
17
+ - elements without [] provide only context
18
+
19
+ # Response Rules
20
+ 1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
21
+ {{
22
+ "current_state": {{
23
+ "evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Mention if something unexpected happened. Shortly state why/why not.",
24
+ "important_contents": "Output important contents closely related to user's instruction on the current page. If there is, please output the contents. If not, please output empty string ''.",
25
+ "thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If your output of evaluation_previous_goal is 'Failed', please reflect and output your reflection here.",
26
+ "next_goal": "Please generate a brief natural language description for the goal of your next actions based on your thought."
27
+ }},
28
+ "action": [
29
+ {{"one_action_name": {{// action-specific parameter}}}}, // ... more actions in sequence
30
+ ]
31
+ }}
32
+
33
+ 2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {max_actions} actions per sequence.
34
+ Common action sequences:
35
+ - Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}]
36
+ - Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}]
37
+ - Actions are executed in the given order
38
+ - If the page changes after an action, the sequence is interrupted and you get the new state.
39
+ - Only provide the action sequence until an action which changes the page state significantly.
40
+ - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page
41
+ - only use multiple actions if it makes sense.
42
+ - Only choose from the available actions listed below.
43
+
44
+ 3. ELEMENT INTERACTION:
45
+ - Only use indexes of the interactive elements
46
+ - Elements marked with "[]Non-interactive text" are non-interactive
47
+
48
+ 4. NAVIGATION & ERROR HANDLING:
49
+ - If no suitable elements exist, use other functions to complete the task
50
+ - If stuck, try alternative approaches - like going back to a previous page, new search, new tab etc.
51
+ - Handle popups/cookies by accepting or closing them
52
+ - Use scroll to find elements you are looking for
53
+ - If you want to research something, open a new tab instead of using the current tab
54
+ - If captcha pops up, try to solve it - else try a different approach
55
+ - If the page is not fully loaded, use wait action
56
+
57
+ 5. TASK COMPLETION:
58
+ - Use the done action as the last action as soon as the ultimate task is complete
59
+ - Don't use "done" before you are done with everything the user asked you, except when you reach the last step of max_steps.
60
+ - If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completely finished set success to true. If not everything the user asked for is completed set success in done to false!
61
+ - If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step.
62
+ - Don't hallucinate actions
63
+ - Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task.
64
+
65
+ 6. VISUAL CONTEXT:
66
+ - When an image is provided, use it to understand the page layout
67
+ - Bounding boxes with labels on their top right corner correspond to element indexes
68
+
69
+ 7. Form filling:
70
+ - If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
71
+
72
+ 8. Long tasks:
73
+ - Keep track of the status and subresults in the memory.
74
+
75
+ 9. Extraction:
76
+ - If your task is to find information - call extract_content on the specific pages to get and store the information.
77
+ Your responses must be always JSON with the specified format.
78
+
79
+ Available Actions:
80
+ {available_actions}
src/agent/custom_views.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Any, Dict, List, Literal, Optional, Type
3
+ import uuid
4
+
5
+ from browser_use.agent.views import AgentOutput, AgentState, ActionResult, AgentHistoryList, MessageManagerState
6
+ from browser_use.controller.registry.views import ActionModel
7
+ from pydantic import BaseModel, ConfigDict, Field, create_model
8
+
9
+
10
@dataclass
class CustomAgentStepInfo:
    """Per-step progress record threaded through an agent run.

    NOTE(review): field meanings are inferred from names — confirm against
    CustomAgent, which constructs and consumes this.
    """

    step_number: int  # presumably the 1-based index of the current step
    max_steps: int  # upper bound on steps for the run
    task: str  # the user's task description
    add_infos: str  # additional caller-supplied hints/instructions
    memory: str  # running notes accumulated across steps
17
+
18
+
19
class CustomAgentBrain(BaseModel):
    """Current state of the agent.

    Parsed from the LLM's structured output as the `current_state` field of
    CustomAgentOutput. NOTE(review): per-field semantics below are inferred
    from the field names — verify against the prompt that elicits them.
    """

    evaluation_previous_goal: str  # self-assessment of the previous goal's outcome
    important_contents: str  # page contents deemed worth remembering
    thought: str  # free-form reasoning for this step
    next_goal: str  # the goal to pursue next
26
+
27
+
28
class CustomAgentOutput(AgentOutput):
    """Output model for agent

    @dev note: this model is extended with custom actions in AgentService. You can also use some fields that are not in this model as provided by the linter, as long as they are registered in the DynamicActions model.
    """

    # Overrides AgentOutput's state with the CustomAgentBrain defined in this module.
    current_state: CustomAgentBrain

    @staticmethod
    def type_with_custom_actions(
        custom_actions: Type[ActionModel],
    ) -> Type["CustomAgentOutput"]:
        """Extend actions with custom actions.

        Dynamically derives a new pydantic model from CustomAgentOutput whose
        `action` field is typed as a list of the runtime-registered action
        model, so the LLM's structured output validates against the actions
        actually available in this run.
        """
        model_ = create_model(
            "CustomAgentOutput",
            __base__=CustomAgentOutput,
            action=(
                list[custom_actions],
                # min_items=1: an output with no actions is not useful.
                Field(..., description='List of actions to execute', json_schema_extra={'min_items': 1}),
            ),  # Properly annotated field with no default
            # Keep the original module so the generated class pickles/reprs sanely.
            __module__=CustomAgentOutput.__module__,
        )
        model_.__doc__ = 'AgentOutput model with custom actions'
        return model_
52
+
53
+
54
class CustomAgentState(BaseModel):
    """Mutable run state for the custom agent.

    NOTE(review): field semantics are inferred from names/types — confirm
    against browser_use's AgentState, which this mirrors/extends.
    """

    agent_id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # unique id per run
    n_steps: int = 1  # step counter, starts at 1
    consecutive_failures: int = 0  # presumably compared against a max-failures threshold
    last_result: Optional[List['ActionResult']] = None  # results from the previous action batch
    history: AgentHistoryList = Field(default_factory=lambda: AgentHistoryList(history=[]))  # full step history
    last_plan: Optional[str] = None  # most recent planner output, if any
    paused: bool = False  # cooperative pause flag
    stopped: bool = False  # cooperative stop flag

    # Nested state for the message manager (conversation bookkeeping).
    message_manager_state: MessageManagerState = Field(default_factory=MessageManagerState)

    last_action: Optional[List['ActionModel']] = None  # actions issued in the previous step
    extracted_content: str = ''  # accumulated extracted page content
src/browser/__init__.py ADDED
File without changes
src/browser/custom_browser.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import pdb
3
+
4
+ from playwright.async_api import Browser as PlaywrightBrowser
5
+ from playwright.async_api import (
6
+ BrowserContext as PlaywrightBrowserContext,
7
+ )
8
+ from playwright.async_api import (
9
+ Playwright,
10
+ async_playwright,
11
+ )
12
+ from browser_use.browser.browser import Browser
13
+ from browser_use.browser.context import BrowserContext, BrowserContextConfig
14
+ from playwright.async_api import BrowserContext as PlaywrightBrowserContext
15
+ import logging
16
+
17
+ from .custom_context import CustomBrowserContext
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
class CustomBrowser(Browser):
    """Browser subclass whose contexts are CustomBrowserContext instances."""

    async def new_context(
        self,
        config: BrowserContextConfig | None = None
    ) -> CustomBrowserContext:
        """Create a new CustomBrowserContext bound to this browser.

        Args:
            config: Context configuration. When omitted, a fresh default
                ``BrowserContextConfig`` is created per call.

        Returns:
            A CustomBrowserContext wrapping this browser.
        """
        # Fix: the original used `config: BrowserContextConfig = BrowserContextConfig()`,
        # a mutable default evaluated once at definition time and therefore
        # shared (including any later mutation) across every call.
        if config is None:
            config = BrowserContextConfig()
        return CustomBrowserContext(config=config, browser=self)
src/browser/custom_context.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+
5
+ from browser_use.browser.browser import Browser
6
+ from browser_use.browser.context import BrowserContext, BrowserContextConfig
7
+ from playwright.async_api import Browser as PlaywrightBrowser
8
+ from playwright.async_api import BrowserContext as PlaywrightBrowserContext
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class CustomBrowserContext(BrowserContext):
    """BrowserContext subclass — the project's extension point for context behaviour."""

    def __init__(
        self,
        browser: "Browser",
        config: BrowserContextConfig | None = None
    ):
        """Initialize the context.

        Args:
            browser: Owning Browser instance.
            config: Context configuration; a fresh default
                ``BrowserContextConfig`` is built per call when omitted.
        """
        # Fix: avoid the shared mutable default instance the original created
        # at definition time; build a fresh config for each context instead.
        if config is None:
            config = BrowserContextConfig()
        super().__init__(browser=browser, config=config)
src/controller/__init__.py ADDED
File without changes
src/controller/custom_controller.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdb
2
+
3
+ import pyperclip
4
+ from typing import Optional, Type
5
+ from pydantic import BaseModel
6
+ from browser_use.agent.views import ActionResult
7
+ from browser_use.browser.context import BrowserContext
8
+ from browser_use.controller.service import Controller, DoneAction
9
+ from main_content_extractor import MainContentExtractor
10
+ from browser_use.controller.views import (
11
+ ClickElementAction,
12
+ DoneAction,
13
+ ExtractPageContentAction,
14
+ GoToUrlAction,
15
+ InputTextAction,
16
+ OpenTabAction,
17
+ ScrollAction,
18
+ SearchGoogleAction,
19
+ SendKeysAction,
20
+ SwitchTabAction,
21
+ )
22
+ import logging
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
class CustomController(Controller):
    """Controller extended with clipboard copy/paste actions."""

    def __init__(self, exclude_actions: Optional[list[str]] = None,
                 output_model: Optional[Type[BaseModel]] = None
                 ):
        """Create the controller and register the custom actions.

        Args:
            exclude_actions: Action names to omit from the registry.
                (Fix: the original used a mutable default ``[]`` shared
                across all instances; use a None sentinel instead.)
            output_model: Optional structured-output model forwarded to
                the base Controller.
        """
        super().__init__(exclude_actions=exclude_actions if exclude_actions is not None else [],
                         output_model=output_model)
        self._register_custom_actions()

    def _register_custom_actions(self):
        """Register all custom browser actions"""

        @self.registry.action("Copy text to clipboard")
        def copy_to_clipboard(text: str):
            # Puts the text on the OS clipboard and echoes it back as the result.
            pyperclip.copy(text)
            return ActionResult(extracted_content=text)

        @self.registry.action("Paste text from clipboard")
        async def paste_from_clipboard(browser: BrowserContext):
            text = pyperclip.paste()
            # send text to browser: type into whatever element currently has focus
            page = await browser.get_current_page()
            await page.keyboard.type(text)

            return ActionResult(extracted_content=text)
src/utils/__init__.py ADDED
File without changes
src/utils/agent_state.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+
3
+
4
class AgentState:
    """Process-wide singleton tracking a cooperative stop flag and the last
    known-good browser state."""

    _instance = None

    def __new__(cls):
        # Lazily create the one shared instance on first construction.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        # __init__ runs on every AgentState() call; initialise only once.
        if not hasattr(self, '_stop_requested'):
            self._stop_requested = asyncio.Event()
            self.last_valid_state = None  # store the last valid browser state

    def request_stop(self):
        """Signal cooperating tasks that they should stop."""
        self._stop_requested.set()

    def clear_stop(self):
        """Reset the stop flag and discard the stored browser state."""
        self._stop_requested.clear()
        self.last_valid_state = None

    def is_stop_requested(self):
        """Return True while a stop request is pending."""
        return self._stop_requested.is_set()

    def set_last_valid_state(self, state):
        """Remember *state* as the most recent valid browser state."""
        self.last_valid_state = state

    def get_last_valid_state(self):
        """Return the most recently stored browser state (None if cleared)."""
        return self.last_valid_state
src/utils/deep_research.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdb
2
+
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+ import asyncio
7
+ import os
8
+ import sys
9
+ import logging
10
+ from pprint import pprint
11
+ from uuid import uuid4
12
+ from src.utils import utils
13
+ from src.agent.custom_agent import CustomAgent
14
+ import json
15
+ import re
16
+ from browser_use.agent.service import Agent
17
+ from browser_use.browser.browser import BrowserConfig, Browser
18
+ from browser_use.agent.views import ActionResult
19
+ from browser_use.browser.context import BrowserContext
20
+ from browser_use.controller.service import Controller, DoneAction
21
+ from main_content_extractor import MainContentExtractor
22
+ from langchain_core.messages import (
23
+ AIMessage,
24
+ BaseMessage,
25
+ HumanMessage,
26
+ ToolMessage,
27
+ SystemMessage
28
+ )
29
+ from json_repair import repair_json
30
+ from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
31
+ from src.controller.custom_controller import CustomController
32
+ from src.browser.custom_browser import CustomBrowser
33
+ from src.browser.custom_context import BrowserContextConfig, BrowserContext
34
+ from browser_use.browser.context import (
35
+ BrowserContextConfig,
36
+ BrowserContextWindowSize,
37
+ )
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+
42
async def deep_research(task, llm, agent_state=None, **kwargs):
    """Iteratively plan searches, run browser agents, record findings, and
    finally write a Markdown report.

    Args:
        task: The user's research instruction.
        llm: A LangChain-style chat model (must support .invoke; may expose
            a DeepSeek-style ``reasoning_content`` attribute on replies).
        agent_state: Optional AgentState singleton; checked between phases
            for cooperative cancellation.
        **kwargs: Tuning knobs — save_dir, max_query_num, use_own_browser,
            chrome_cdp, headless, disable_security, max_search_iterations,
            use_vision, max_steps.

    Returns:
        (report_content, report_file_path) from generate_final_report.
    """
    task_id = str(uuid4())
    save_dir = kwargs.get("save_dir", os.path.join(f"./tmp/deep_research/{task_id}"))
    logger.info(f"Save Deep Research at: {save_dir}")
    os.makedirs(save_dir, exist_ok=True)

    # max query num per iteration
    max_query_num = kwargs.get("max_query_num", 3)

    use_own_browser = kwargs.get("use_own_browser", False)
    extra_chromium_args = []

    if use_own_browser:
        cdp_url = os.getenv("CHROME_CDP", kwargs.get("chrome_cdp", None))
        # TODO: if use own browser, max query num must be 1 per iter, how to solve it?
        max_query_num = 1
        chrome_path = os.getenv("CHROME_PATH", None)
        if chrome_path == "":
            chrome_path = None
        chrome_user_data = os.getenv("CHROME_USER_DATA", None)
        if chrome_user_data:
            extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]

        browser = CustomBrowser(
            config=BrowserConfig(
                headless=kwargs.get("headless", False),
                cdp_url=cdp_url,
                disable_security=kwargs.get("disable_security", True),
                chrome_instance_path=chrome_path,
                extra_chromium_args=extra_chromium_args,
            )
        )
        browser_context = await browser.new_context()
    else:
        # No shared browser: each agent below creates its own, enabling
        # parallel execution of queries.
        browser = None
        browser_context = None

    controller = CustomController()

    @controller.registry.action(
        'Extract page content to get the pure markdown.',
    )
    async def extract_content(browser: BrowserContext):
        page = await browser.get_current_page()
        # use jina reader
        url = page.url

        jina_url = f"https://r.jina.ai/{url}"
        await page.goto(jina_url)
        output_format = 'markdown'
        content = MainContentExtractor.extract(  # type: ignore
            html=await page.content(),
            output_format=output_format,
        )
        # go back to org url
        await page.go_back()
        msg = f'Extracted page content:\n{content}\n'
        logger.info(msg)
        return ActionResult(extracted_content=msg)

    # System prompt for the query-planning LLM: produce a plan plus up to
    # max_query_num search queries as JSON. ({{ }} escape literal braces.)
    search_system_prompt = f"""
    You are a **Deep Researcher**, an AI agent specializing in in-depth information gathering and research using a web browser with **automated execution capabilities**. Your expertise lies in formulating comprehensive research plans and executing them meticulously to fulfill complex user requests. You will analyze user instructions, devise a detailed research plan, and determine the necessary search queries to gather the required information.

    **Your Task:**

    Given a user's research topic, you will:

    1. **Develop a Research Plan:** Outline the key aspects and subtopics that need to be investigated to thoroughly address the user's request. This plan should be a high-level overview of the research direction.
    2. **Generate Search Queries:** Based on your research plan, generate a list of specific search queries to be executed in a web browser. These queries should be designed to efficiently gather relevant information for each aspect of your plan.

    **Output Format:**

    Your output will be a JSON object with the following structure:

    ```json
    {{
    "plan": "A concise, high-level research plan outlining the key areas to investigate.",
      "queries": [
        "search query 1",
        "search query 2",
        //... up to a maximum of {max_query_num} search queries
      ]
    }}
    ```

    **Important:**

    * Limit your output to a **maximum of {max_query_num}** search queries.
    * Make the search queries to help the automated agent find the needed information. Consider what keywords are most likely to lead to useful results.
    * If you have gathered for all the information you want and no further search queries are required, output queries with an empty list: `[]`
    * Make sure output search queries are different from the history queries.

    **Inputs:**

    1. **User Instruction:** The original instruction given by the user.
    2. **Previous Queries:** History Queries.
    3. **Previous Search Results:** Textual data gathered from prior search queries. If there are no previous search results this string will be empty.
    """
    search_messages = [SystemMessage(content=search_system_prompt)]

    # System prompt for the information-recording LLM: summarize new findings
    # into a JSON list of {url, title, summary_content, thinking} entries.
    # Plain (non-f) string so the literal JSON braces need no escaping.
    record_system_prompt = """
    You are an expert information recorder. Your role is to process user instructions, current search results, and previously recorded information to extract, summarize, and record new, useful information that helps fulfill the user's request. Your output will be a JSON formatted list, where each element represents a piece of extracted information and follows the structure: `{"url": "source_url", "title": "source_title", "summary_content": "concise_summary", "thinking": "reasoning"}`.

    **Important Considerations:**

    1. **Minimize Information Loss:** While concise, prioritize retaining important details and nuances from the sources. Aim for a summary that captures the essence of the information without over-simplification. **Crucially, ensure to preserve key data and figures within the `summary_content`. This is essential for later stages, such as generating tables and reports.**

    2. **Avoid Redundancy:** Do not record information that is already present in the Previous Recorded Information. Check for semantic similarity, not just exact matches. However, if the same information is expressed differently in a new source and this variation adds valuable context or clarity, it should be included.

    3. **Source Information:** Extract and include the source title and URL for each piece of information summarized. This is crucial for verification and context. **The Current Search Results are provided in a specific format, where each item starts with "Title:", followed by the title, then "URL Source:", followed by the URL, and finally "Markdown Content:", followed by the content. Please extract the title and URL from this structure.** If a piece of information cannot be attributed to a specific source from the provided search results, use `"url": "unknown"` and `"title": "unknown"`.

    4. **Thinking and Report Structure:** For each extracted piece of information, add a `"thinking"` key. This field should contain your assessment of how this information could be used in a report, which section it might belong to (e.g., introduction, background, analysis, conclusion, specific subtopics), and any other relevant thoughts about its significance or connection to other information.

    **Output Format:**

    Provide your output as a JSON formatted list. Each item in the list must adhere to the following format:

    ```json
    [
      {
        "url": "source_url_1",
        "title": "source_title_1",
        "summary_content": "Concise summary of content. Remember to include key data and figures here.",
        "thinking": "This could be used in the introduction to set the context. It also relates to the section on the history of the topic."
      },
      // ... more entries
      {
        "url": "unknown",
        "title": "unknown",
        "summary_content": "concise_summary_of_content_without_clear_source",
        "thinking": "This might be useful background information, but I need to verify its accuracy. Could be used in the methodology section to explain how data was collected."
      }
    ]
    ```

    **Inputs:**

    1. **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful and how to structure your thinking.
    2. **Previous Recorded Information:** Textual data gathered and recorded from previous searches and processing, represented as a single text string.
    3. **Current Search Plan:** Research plan for current search.
    4. **Current Search Query:** The current search query.
    5. **Current Search Results:** Textual data gathered from the most recent search query.
    """
    record_messages = [SystemMessage(content=record_system_prompt)]

    search_iteration = 0
    max_search_iterations = kwargs.get("max_search_iterations", 10)  # Limit search iterations to prevent infinite loop
    use_vision = kwargs.get("use_vision", False)

    history_query = []  # all queries issued so far (dedup hint for the planner)
    history_infos = []  # accumulated recorded findings (dicts)
    try:
        while search_iteration < max_search_iterations:
            search_iteration += 1
            logger.info(f"Start {search_iteration}th Search...")
            history_query_ = json.dumps(history_query, indent=4)
            history_infos_ = json.dumps(history_infos, indent=4)
            query_prompt = f"This is search {search_iteration} of {max_search_iterations} maximum searches allowed.\n User Instruction:{task} \n Previous Queries:\n {history_query_} \n Previous Search Results:\n {history_infos_}\n"
            search_messages.append(HumanMessage(content=query_prompt))
            # Send only the system message plus the latest human message to
            # keep the planner's context bounded across iterations.
            ai_query_msg = llm.invoke(search_messages[:1] + search_messages[1:][-1:])
            search_messages.append(ai_query_msg)
            if hasattr(ai_query_msg, "reasoning_content"):
                logger.info("🤯 Start Search Deep Thinking: ")
                logger.info(ai_query_msg.reasoning_content)
                logger.info("🤯 End Search Deep Thinking")
            # Strip markdown code fences, then repair/parse the JSON reply.
            ai_query_content = ai_query_msg.content.replace("```json", "").replace("```", "")
            ai_query_content = repair_json(ai_query_content)
            ai_query_content = json.loads(ai_query_content)
            query_plan = ai_query_content["plan"]
            logger.info(f"Current Iteration {search_iteration} Planing:")
            logger.info(query_plan)
            query_tasks = ai_query_content["queries"]
            if not query_tasks:
                # Planner signalled it has enough information.
                break
            else:
                query_tasks = query_tasks[:max_query_num]
                history_query.extend(query_tasks)
                logger.info("Query tasks:")
                logger.info(query_tasks)

            # 2. Perform Web Search and Auto exec
            # Parallel BU agents
            add_infos = "1. Please click on the most relevant link to get information and go deeper, instead of just staying on the search page. \n" \
                        "2. When opening a PDF file, please remember to extract the content using extract_content instead of simply opening it for the user to view.\n"
            if use_own_browser:
                # Single shared browser: run only the first query serially.
                agent = CustomAgent(
                    task=query_tasks[0],
                    llm=llm,
                    add_infos=add_infos,
                    browser=browser,
                    browser_context=browser_context,
                    use_vision=use_vision,
                    system_prompt_class=CustomSystemPrompt,
                    agent_prompt_class=CustomAgentMessagePrompt,
                    max_actions_per_step=5,
                    controller=controller
                )
                agent_result = await agent.run(max_steps=kwargs.get("max_steps", 10))
                query_results = [agent_result]
                # Manually close all tab (open a fresh one first so the
                # context never ends up with zero pages)
                session = await browser_context.get_session()
                pages = session.context.pages
                await browser_context.create_new_tab()
                for page_id, page in enumerate(pages):
                    await page.close()

            else:
                # One agent per query, each with its own browser, run in parallel.
                agents = [CustomAgent(
                    task=task,
                    llm=llm,
                    add_infos=add_infos,
                    browser=browser,
                    browser_context=browser_context,
                    use_vision=use_vision,
                    system_prompt_class=CustomSystemPrompt,
                    agent_prompt_class=CustomAgentMessagePrompt,
                    max_actions_per_step=5,
                    controller=controller,
                ) for task in query_tasks]
                query_results = await asyncio.gather(
                    *[agent.run(max_steps=kwargs.get("max_steps", 10)) for agent in agents])

            if agent_state and agent_state.is_stop_requested():
                # Stop
                break
            # 3. Summarize Search Result
            query_result_dir = os.path.join(save_dir, "query_results")
            os.makedirs(query_result_dir, exist_ok=True)
            for i in range(len(query_tasks)):
                query_result = query_results[i].final_result()
                if not query_result:
                    continue
                querr_save_path = os.path.join(query_result_dir, f"{search_iteration}-{i}.md")
                logger.info(f"save query: {query_tasks[i]} at {querr_save_path}")
                with open(querr_save_path, "w", encoding="utf-8") as fw:
                    fw.write(f"Query: {query_tasks[i]}\n")
                    fw.write(query_result)
                # split query result in case the content is too long
                query_results_split = query_result.split("Extracted page content:")
                for qi, query_result_ in enumerate(query_results_split):
                    if not query_result_:
                        continue
                    else:
                        # TODO: limit content length: 128k tokens, ~3 chars per token
                        query_result_ = query_result_[:128000 * 3]
                    history_infos_ = json.dumps(history_infos, indent=4)
                    record_prompt = f"User Instruction:{task}. \nPrevious Recorded Information:\n {history_infos_}\n Current Search Iteration: {search_iteration}\n Current Search Plan:\n{query_plan}\n Current Search Query:\n {query_tasks[i]}\n Current Search Results: {query_result_}\n "
                    record_messages.append(HumanMessage(content=record_prompt))
                    # Again: system message + latest human message only.
                    ai_record_msg = llm.invoke(record_messages[:1] + record_messages[-1:])
                    record_messages.append(ai_record_msg)
                    if hasattr(ai_record_msg, "reasoning_content"):
                        logger.info("🤯 Start Record Deep Thinking: ")
                        logger.info(ai_record_msg.reasoning_content)
                        logger.info("🤯 End Record Deep Thinking")
                    record_content = ai_record_msg.content
                    record_content = repair_json(record_content)
                    new_record_infos = json.loads(record_content)
                    history_infos.extend(new_record_infos)
            if agent_state and agent_state.is_stop_requested():
                # Stop
                break

        logger.info("\nFinish Searching, Start Generating Report...")

        # 5. Report Generation in Markdown (or JSON if you prefer)
        return await generate_final_report(task, history_infos, save_dir, llm)

    except Exception as e:
        logger.error(f"Deep research Error: {e}")
        # Still emit a partial report from whatever was collected.
        return await generate_final_report(task, history_infos, save_dir, llm, str(e))
    finally:
        if browser:
            await browser.close()
        if browser_context:
            await browser_context.close()
        logger.info("Browser closed.")
318
+
319
+
320
async def generate_final_report(task, history_infos, save_dir, llm, error_msg=None):
    """Generate report from collected information with error handling.

    Args:
        task: The user's original research instruction.
        history_infos: List of recorded findings (dicts) gathered so far.
        save_dir: Directory where record_infos.json and final_report.md are written.
        llm: Chat model used to write the report.
        error_msg: When set, the report is prefixed with a partial-results warning.

    Returns:
        (report_content, report_file_path), or (error string, None) on failure.
    """
    try:
        logger.info("\nAttempting to generate final report from collected data...")

        writer_system_prompt = """
    You are a **Deep Researcher** and a professional report writer tasked with creating polished, high-quality reports that fully meet the user's needs, based on the user's instructions and the relevant information provided. You will write the report using Markdown format, ensuring it is both informative and visually appealing.

    **Specific Instructions:**

    * **Structure for Impact:** The report must have a clear, logical, and impactful structure. Begin with a compelling introduction that immediately grabs the reader's attention. Develop well-structured body paragraphs that flow smoothly and logically, and conclude with a concise and memorable conclusion that summarizes key takeaways and leaves a lasting impression.
    * **Engaging and Vivid Language:** Employ precise, vivid, and descriptive language to make the report captivating and enjoyable to read. Use stylistic techniques to enhance engagement. Tailor your tone, vocabulary, and writing style to perfectly suit the subject matter and the intended audience to maximize impact and readability.
    * **Accuracy, Credibility, and Citations:** Ensure that all information presented is meticulously accurate, rigorously truthful, and robustly supported by the available data. **Cite sources exclusively using bracketed sequential numbers within the text (e.g., [1], [2], etc.). If no references are used, omit citations entirely.** These numbers must correspond to a numbered list of references at the end of the report.
    * **Publication-Ready Formatting:** Adhere strictly to Markdown formatting for excellent readability and a clean, highly professional visual appearance. Pay close attention to formatting details like headings, lists, emphasis, and spacing to optimize the visual presentation and reader experience. The report should be ready for immediate publication upon completion, requiring minimal to no further editing for style or format.
    * **Conciseness and Clarity (Unless Specified Otherwise):** When the user does not provide a specific length, prioritize concise and to-the-point writing, maximizing information density while maintaining clarity.
    * **Data-Driven Comparisons with Tables:** **When appropriate and beneficial for enhancing clarity and impact, present data comparisons in well-structured Markdown tables. This is especially encouraged when dealing with numerical data or when a visual comparison can significantly improve the reader's understanding.**
    * **Length Adherence:** When the user specifies a length constraint, meticulously stay within reasonable bounds of that specification, ensuring the content is appropriately scaled without sacrificing quality or completeness.
    * **Comprehensive Instruction Following:** Pay meticulous attention to all details and nuances provided in the user instructions. Strive to fulfill every aspect of the user's request with the highest degree of accuracy and attention to detail, creating a report that not only meets but exceeds expectations for quality and professionalism.
    * **Reference List Formatting:** The reference list at the end must be formatted as follows:
        `[1] Title (URL, if available)`
        **Each reference must be separated by a blank line to ensure proper spacing.** For example:

        ```
        [1] Title 1 (URL1, if available)

        [2] Title 2 (URL2, if available)
        ```
        **Furthermore, ensure that the reference list is free of duplicates. Each unique source should be listed only once, regardless of how many times it is cited in the text.**
    * **ABSOLUTE FINAL OUTPUT RESTRICTION:** **Your output must contain ONLY the finished, publication-ready Markdown report. Do not include ANY extraneous text, phrases, preambles, meta-commentary, or markdown code indicators (e.g., "```markdown```"). The report should begin directly with the title and introductory paragraph, and end directly after the conclusion and the reference list (if applicable).** **Your response will be deemed a failure if this instruction is not followed precisely.**

    **Inputs:**

    1. **User Instruction:** The original instruction given by the user. This helps you determine what kind of information will be useful and how to structure your thinking.
    2. **Search Information:** Information gathered from the search queries.
    """

        history_infos_ = json.dumps(history_infos, indent=4)
        # Persist the raw recorded findings alongside the report for auditing.
        record_json_path = os.path.join(save_dir, "record_infos.json")
        logger.info(f"save All recorded information at {record_json_path}")
        with open(record_json_path, "w") as fw:
            json.dump(history_infos, fw, indent=4)
        report_prompt = f"User Instruction:{task} \n Search Information:\n {history_infos_}"
        report_messages = [SystemMessage(content=writer_system_prompt),
                           HumanMessage(content=report_prompt)]  # New context for report generation
        ai_report_msg = llm.invoke(report_messages)
        if hasattr(ai_report_msg, "reasoning_content"):
            logger.info("🤯 Start Report Deep Thinking: ")
            logger.info(ai_report_msg.reasoning_content)
            logger.info("🤯 End Report Deep Thinking")
        report_content = ai_report_msg.content
        # Strip any leading/trailing markdown code fences the model emitted
        # despite the prompt's restriction.
        report_content = re.sub(r"^```\s*markdown\s*|^\s*```|```\s*$", "", report_content, flags=re.MULTILINE)
        report_content = report_content.strip()

        # Add error notification to the report
        if error_msg:
            report_content = f"## ⚠️ Research Incomplete - Partial Results\n" \
                             f"**The research process was interrupted by an error:** {error_msg}\n\n" \
                             f"{report_content}"

        report_file_path = os.path.join(save_dir, "final_report.md")
        with open(report_file_path, "w", encoding="utf-8") as f:
            f.write(report_content)
        logger.info(f"Save Report at: {report_file_path}")
        return report_content, report_file_path

    except Exception as report_error:
        logger.error(f"Failed to generate partial report: {report_error}")
        return f"Error generating report: {str(report_error)}", None
src/utils/llm.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ import pdb
3
+ from langchain_openai import ChatOpenAI
4
+ from langchain_core.globals import get_llm_cache
5
+ from langchain_core.language_models.base import (
6
+ BaseLanguageModel,
7
+ LangSmithParams,
8
+ LanguageModelInput,
9
+ )
10
+ from langchain_core.load import dumpd, dumps
11
+ from langchain_core.messages import (
12
+ AIMessage,
13
+ SystemMessage,
14
+ AnyMessage,
15
+ BaseMessage,
16
+ BaseMessageChunk,
17
+ HumanMessage,
18
+ convert_to_messages,
19
+ message_chunk_to_message,
20
+ )
21
+ from langchain_core.outputs import (
22
+ ChatGeneration,
23
+ ChatGenerationChunk,
24
+ ChatResult,
25
+ LLMResult,
26
+ RunInfo,
27
+ )
28
+ from langchain_ollama import ChatOllama
29
+ from langchain_core.output_parsers.base import OutputParserLike
30
+ from langchain_core.runnables import Runnable, RunnableConfig
31
+ from langchain_core.tools import BaseTool
32
+
33
+ from typing import (
34
+ TYPE_CHECKING,
35
+ Any,
36
+ Callable,
37
+ Literal,
38
+ Optional,
39
+ Union,
40
+ cast, List,
41
+ )
42
+
43
+
44
class DeepSeekR1ChatOpenAI(ChatOpenAI):
    """ChatOpenAI variant that calls an OpenAI-compatible endpoint directly so
    the non-standard ``reasoning_content`` field (DeepSeek-R1 style) on the
    response message can be surfaced on the returned AIMessage."""

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        # Raw OpenAI client used instead of the LangChain machinery, because
        # the latter drops the provider-specific `reasoning_content` field.
        self.client = OpenAI(
            base_url=kwargs.get("base_url"),
            api_key=kwargs.get("api_key")
        )

    async def ainvoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        """Async invoke.

        NOTE(review): despite being async, this issues a *synchronous* HTTP
        call via self.client (no await), blocking the event loop for the
        duration of the request. `config`/`stop`/`kwargs` are accepted but
        ignored.
        """
        # Convert LangChain messages to the OpenAI chat "role" dict format.
        message_history = []
        for input_ in input:
            if isinstance(input_, SystemMessage):
                message_history.append({"role": "system", "content": input_.content})
            elif isinstance(input_, AIMessage):
                message_history.append({"role": "assistant", "content": input_.content})
            else:
                # Anything else (HumanMessage etc.) is treated as a user turn.
                message_history.append({"role": "user", "content": input_.content})

        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=message_history
        )

        # `reasoning_content` is a DeepSeek-specific extension — presumably
        # only present on R1-style endpoints; verify for other providers.
        reasoning_content = response.choices[0].message.reasoning_content
        content = response.choices[0].message.content
        return AIMessage(content=content, reasoning_content=reasoning_content)

    def invoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        """Synchronous invoke — same behavior as ainvoke (which is not truly
        async); `config`/`stop`/`kwargs` are accepted but ignored."""
        message_history = []
        for input_ in input:
            if isinstance(input_, SystemMessage):
                message_history.append({"role": "system", "content": input_.content})
            elif isinstance(input_, AIMessage):
                message_history.append({"role": "assistant", "content": input_.content})
            else:
                message_history.append({"role": "user", "content": input_.content})

        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=message_history
        )

        reasoning_content = response.choices[0].message.reasoning_content
        content = response.choices[0].message.content
        return AIMessage(content=content, reasoning_content=reasoning_content)
104
+
105
+
106
class DeepSeekR1ChatOllama(ChatOllama):
    """ChatOllama wrapper that splits DeepSeek-R1 "<think>...</think>" output
    into a ``reasoning_content`` attribute and the final answer content."""

    @staticmethod
    def _split_reasoning(org_content: str) -> AIMessage:
        """Separate the <think> section from the answer.

        Fix: the original indexed ``split("</think>")[1]`` and raised
        IndexError whenever the model omitted the think block; treat the
        whole output as the answer in that case. (partition also keeps any
        text after a *second* "</think>" in the answer, unlike split[1].)

        Returns:
            AIMessage whose ``content`` is the answer text and whose
            ``reasoning_content`` holds the think-section text ('' if absent).
        """
        reasoning_content, sep, content = org_content.partition("</think>")
        if sep:
            reasoning_content = reasoning_content.replace("<think>", "")
        else:
            # No closing tag: no reasoning section, everything is the answer.
            reasoning_content, content = "", org_content
        # Some models prefix the final JSON with this marker; keep only the tail.
        if "**JSON Response:**" in content:
            content = content.split("**JSON Response:**")[-1]
        return AIMessage(content=content, reasoning_content=reasoning_content)

    async def ainvoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        """Async invoke; `config`/`stop`/`kwargs` are accepted but ignored,
        matching the original behavior."""
        org_ai_message = await super().ainvoke(input=input)
        return self._split_reasoning(org_ai_message.content)

    def invoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        """Synchronous invoke; `config`/`stop`/`kwargs` are accepted but
        ignored, matching the original behavior."""
        org_ai_message = super().invoke(input=input)
        return self._split_reasoning(org_ai_message.content)
src/utils/utils.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+ import time
4
+ from pathlib import Path
5
+ from typing import Dict, Optional
6
+ import requests
7
+ import json
8
+ import gradio as gr
9
+ import uuid
10
+
11
+ from langchain_anthropic import ChatAnthropic
12
+ from langchain_mistralai import ChatMistralAI
13
+ from langchain_google_genai import ChatGoogleGenerativeAI
14
+ from langchain_ollama import ChatOllama
15
+ from langchain_openai import AzureChatOpenAI, ChatOpenAI
16
+
17
+ from .llm import DeepSeekR1ChatOpenAI, DeepSeekR1ChatOllama
18
+
19
# Human-readable provider names for UI-facing messages (see MissingAPIKeyError).
PROVIDER_DISPLAY_NAMES = {
    "openai": "OpenAI",
    "azure_openai": "Azure OpenAI",
    "anthropic": "Anthropic",
    "deepseek": "DeepSeek",
    "google": "Google",
    "alibaba": "Alibaba",
    "moonshot": "MoonShot",
    "unbound": "Unbound AI"
}
29
+
30
+
31
def get_llm_model(provider: str, **kwargs):
    """Instantiate a LangChain chat model for the given provider.

    :param provider: provider id — one of "anthropic", "mistral", "openai",
        "deepseek", "google", "ollama", "azure_openai", "alibaba", "moonshot",
        "unbound", "siliconflow"
    :param kwargs: optional overrides: ``model_name``, ``temperature``,
        ``base_url``, ``api_key``, ``num_ctx``, ``num_predict``, ``api_version``
    :return: a configured LangChain chat model instance
    :raises MissingAPIKeyError: if the provider needs an API key and none is
        supplied via kwargs or the environment
    :raises ValueError: if the provider id is not recognised
    """
    # Every provider except local Ollama needs an API key: prefer the caller's
    # value, fall back to the <PROVIDER>_API_KEY environment variable.
    if provider not in ["ollama"]:
        env_var = f"{provider.upper()}_API_KEY"
        api_key = kwargs.get("api_key", "") or os.getenv(env_var, "")
        if not api_key and provider == "siliconflow":
            # The .env template names this key with mixed case
            # (SiliconFLOW_API_KEY); the upper-cased lookup above misses it on
            # case-sensitive platforms, so fall back to the mixed-case name.
            api_key = os.getenv("SiliconFLOW_API_KEY", "")
        if not api_key:
            raise MissingAPIKeyError(provider, env_var)
        kwargs["api_key"] = api_key

    if provider == "anthropic":
        if not kwargs.get("base_url", ""):
            base_url = "https://api.anthropic.com"
        else:
            base_url = kwargs.get("base_url")

        return ChatAnthropic(
            model=kwargs.get("model_name", "claude-3-5-sonnet-20241022"),
            temperature=kwargs.get("temperature", 0.0),
            base_url=base_url,
            api_key=api_key,
        )
    elif provider == 'mistral':
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("MISTRAL_ENDPOINT", "https://api.mistral.ai/v1")
        else:
            base_url = kwargs.get("base_url")
        if not kwargs.get("api_key", ""):
            api_key = os.getenv("MISTRAL_API_KEY", "")
        else:
            api_key = kwargs.get("api_key")

        return ChatMistralAI(
            model=kwargs.get("model_name", "mistral-large-latest"),
            temperature=kwargs.get("temperature", 0.0),
            base_url=base_url,
            api_key=api_key,
        )
    elif provider == "openai":
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1")
        else:
            base_url = kwargs.get("base_url")

        return ChatOpenAI(
            model=kwargs.get("model_name", "gpt-4o"),
            temperature=kwargs.get("temperature", 0.0),
            base_url=base_url,
            api_key=api_key,
        )
    elif provider == "deepseek":
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("DEEPSEEK_ENDPOINT", "")
        else:
            base_url = kwargs.get("base_url")

        # The reasoner model needs the wrapper that extracts reasoning_content.
        if kwargs.get("model_name", "deepseek-chat") == "deepseek-reasoner":
            return DeepSeekR1ChatOpenAI(
                model=kwargs.get("model_name", "deepseek-reasoner"),
                temperature=kwargs.get("temperature", 0.0),
                base_url=base_url,
                api_key=api_key,
            )
        else:
            return ChatOpenAI(
                model=kwargs.get("model_name", "deepseek-chat"),
                temperature=kwargs.get("temperature", 0.0),
                base_url=base_url,
                api_key=api_key,
            )
    elif provider == "google":
        return ChatGoogleGenerativeAI(
            model=kwargs.get("model_name", "gemini-2.0-flash-exp"),
            temperature=kwargs.get("temperature", 0.0),
            api_key=api_key,
        )
    elif provider == "ollama":
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434")
        else:
            base_url = kwargs.get("base_url")

        # R1 models need the <think>-splitting wrapper.
        if "deepseek-r1" in kwargs.get("model_name", "qwen2.5:7b"):
            return DeepSeekR1ChatOllama(
                model=kwargs.get("model_name", "deepseek-r1:14b"),
                temperature=kwargs.get("temperature", 0.0),
                num_ctx=kwargs.get("num_ctx", 32000),
                base_url=base_url,
            )
        else:
            return ChatOllama(
                model=kwargs.get("model_name", "qwen2.5:7b"),
                temperature=kwargs.get("temperature", 0.0),
                num_ctx=kwargs.get("num_ctx", 32000),
                num_predict=kwargs.get("num_predict", 1024),
                base_url=base_url,
            )
    elif provider == "azure_openai":
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("AZURE_OPENAI_ENDPOINT", "")
        else:
            base_url = kwargs.get("base_url")
        api_version = kwargs.get("api_version", "") or os.getenv("AZURE_OPENAI_API_VERSION", "2025-01-01-preview")
        return AzureChatOpenAI(
            model=kwargs.get("model_name", "gpt-4o"),
            temperature=kwargs.get("temperature", 0.0),
            api_version=api_version,
            azure_endpoint=base_url,
            api_key=api_key,
        )
    elif provider == "alibaba":
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("ALIBABA_ENDPOINT", "https://dashscope.aliyuncs.com/compatible-mode/v1")
        else:
            base_url = kwargs.get("base_url")

        return ChatOpenAI(
            model=kwargs.get("model_name", "qwen-plus"),
            temperature=kwargs.get("temperature", 0.0),
            base_url=base_url,
            api_key=api_key,
        )
    elif provider == "moonshot":
        return ChatOpenAI(
            model=kwargs.get("model_name", "moonshot-v1-32k-vision-preview"),
            temperature=kwargs.get("temperature", 0.0),
            base_url=os.getenv("MOONSHOT_ENDPOINT"),
            # Use the resolved key so a UI-provided key is honored; the
            # environment fallback above already reads MOONSHOT_API_KEY.
            api_key=api_key,
        )
    elif provider == "unbound":
        return ChatOpenAI(
            model=kwargs.get("model_name", "gpt-4o-mini"),
            temperature=kwargs.get("temperature", 0.0),
            base_url=os.getenv("UNBOUND_ENDPOINT", "https://api.getunbound.ai"),
            api_key=api_key,
        )
    elif provider == "siliconflow":
        if not kwargs.get("api_key", ""):
            api_key = os.getenv("SiliconFLOW_API_KEY", "")
        else:
            api_key = kwargs.get("api_key")
        if not kwargs.get("base_url", ""):
            base_url = os.getenv("SiliconFLOW_ENDPOINT", "")
        else:
            base_url = kwargs.get("base_url")
        return ChatOpenAI(
            api_key=api_key,
            base_url=base_url,
            model_name=kwargs.get("model_name", "Qwen/QwQ-32B"),
            temperature=kwargs.get("temperature", 0.0),
        )
    else:
        raise ValueError(f"Unsupported provider: {provider}")
188
+
189
+
190
# Predefined model names per provider, used to populate the model dropdown in
# the UI (see update_model_dropdown). Providers not listed here fall back to a
# free-text dropdown.
model_names = {
    "anthropic": ["claude-3-5-sonnet-20241022", "claude-3-5-sonnet-20240620", "claude-3-opus-20240229"],
    "openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo", "o3-mini"],
    "deepseek": ["deepseek-chat", "deepseek-reasoner"],
    "google": ["gemini-2.0-flash", "gemini-2.0-flash-thinking-exp", "gemini-1.5-flash-latest",
               "gemini-1.5-flash-8b-latest", "gemini-2.0-flash-thinking-exp-01-21", "gemini-2.0-pro-exp-02-05"],
    "ollama": ["qwen2.5:7b", "qwen2.5:14b", "qwen2.5:32b", "qwen2.5-coder:14b", "qwen2.5-coder:32b", "llama2:7b",
               "deepseek-r1:14b", "deepseek-r1:32b"],
    "azure_openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"],
    "mistral": ["pixtral-large-latest", "mistral-large-latest", "mistral-small-latest", "ministral-8b-latest"],
    "alibaba": ["qwen-plus", "qwen-max", "qwen-turbo", "qwen-long"],
    "moonshot": ["moonshot-v1-32k-vision-preview", "moonshot-v1-8k-vision-preview"],
    "unbound": ["gemini-2.0-flash", "gpt-4o-mini", "gpt-4o", "gpt-4.5-preview"],
    "siliconflow": [
        "deepseek-ai/DeepSeek-R1",
        "deepseek-ai/DeepSeek-V3",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "deepseek-ai/DeepSeek-V2.5",
        "deepseek-ai/deepseek-vl2",
        "Qwen/Qwen2.5-72B-Instruct-128K",
        "Qwen/Qwen2.5-72B-Instruct",
        "Qwen/Qwen2.5-32B-Instruct",
        "Qwen/Qwen2.5-14B-Instruct",
        "Qwen/Qwen2.5-7B-Instruct",
        "Qwen/Qwen2.5-Coder-32B-Instruct",
        "Qwen/Qwen2.5-Coder-7B-Instruct",
        "Qwen/Qwen2-7B-Instruct",
        "Qwen/Qwen2-1.5B-Instruct",
        "Qwen/QwQ-32B-Preview",
        "Qwen/Qwen2-VL-72B-Instruct",
        "Qwen/Qwen2.5-VL-32B-Instruct",
        "Qwen/Qwen2.5-VL-72B-Instruct",
        "TeleAI/TeleChat2",
        "THUDM/glm-4-9b-chat",
        "Vendor-A/Qwen/Qwen2.5-72B-Instruct",
        "internlm/internlm2_5-7b-chat",
        "internlm/internlm2_5-20b-chat",
        "Pro/Qwen/Qwen2.5-7B-Instruct",
        "Pro/Qwen/Qwen2-7B-Instruct",
        "Pro/Qwen/Qwen2-1.5B-Instruct",
        "Pro/THUDM/chatglm3-6b",
        "Pro/THUDM/glm-4-9b-chat",
    ],
}
238
+
239
+
240
# Callback to update the model name dropdown based on the selected provider.
def update_model_dropdown(llm_provider, api_key=None, base_url=None):
    """Return a gr.Dropdown populated with the provider's predefined models.

    ``api_key`` and ``base_url`` are accepted for signature compatibility with
    the Gradio change-callback wiring but are not needed to list the static
    model names, so the dead environment lookups were removed.  Providers
    without a predefined list get an empty dropdown that allows custom values.
    """
    # gradio is already imported at module level as `gr`; the redundant
    # function-local import was dropped.
    if llm_provider in model_names:
        return gr.Dropdown(choices=model_names[llm_provider], value=model_names[llm_provider][0], interactive=True)
    else:
        return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True)
257
+
258
+
259
class MissingAPIKeyError(Exception):
    """Raised when no API key is configured for the selected LLM provider."""

    def __init__(self, provider: str, env_var: str):
        display_name = PROVIDER_DISPLAY_NAMES.get(provider, provider.upper())
        message = (
            f"💥 {display_name} API key not found! 🔑 Please set the "
            f"`{env_var}` environment variable or provide it in the UI."
        )
        super().__init__(message)
266
+
267
+
268
def encode_image(img_path):
    """Return the base64-encoded contents of *img_path*, or None for no path."""
    if not img_path:
        return None
    raw_bytes = Path(img_path).read_bytes()
    return base64.b64encode(raw_bytes).decode("utf-8")
274
+
275
+
276
def get_latest_files(directory: str, file_types=None) -> Dict[str, Optional[str]]:
    """Get the latest recording and trace files.

    Args:
        directory: Root folder to scan recursively; created if missing.
        file_types: File extensions to look for; defaults to ['.webm', '.zip'].

    Returns:
        Mapping of extension -> newest matching file path, or None when no
        finished file of that type exists.  Files modified within the last
        second are skipped because they may still be being written.
    """
    # Default assigned inside the body to avoid a shared mutable default arg.
    if file_types is None:
        file_types = ['.webm', '.zip']
    latest_files: Dict[str, Optional[str]] = {ext: None for ext in file_types}

    if not os.path.exists(directory):
        os.makedirs(directory, exist_ok=True)
        return latest_files

    for file_type in file_types:
        try:
            matches = list(Path(directory).rglob(f"*{file_type}"))
            if matches:
                latest = max(matches, key=lambda p: p.stat().st_mtime)
                # Only return files that are complete (not being written)
                if time.time() - latest.stat().st_mtime > 1.0:
                    latest_files[file_type] = str(latest)
        except Exception as e:
            print(f"Error getting latest {file_type} file: {e}")

    return latest_files
296
+
297
+
298
async def capture_screenshot(browser_context):
    """Capture a JPEG screenshot of the active page, base64-encoded.

    Returns the encoded string, or None when there is no usable browser
    context/page or the screenshot call itself fails.
    """
    # Extract the Playwright browser instance from the wrapper object.
    playwright_browser = browser_context.browser.playwright_browser  # Ensure this is correct.

    # Check if the browser instance is valid and if an existing context can be reused
    if playwright_browser and playwright_browser.contexts:
        playwright_context = playwright_browser.contexts[0]
    else:
        return None

    # Access pages in the context
    pages = None
    if playwright_context:
        pages = playwright_context.pages

    # Use an existing page; bail out when the context has none.
    if pages:
        active_page = pages[0]
        # No break here: the loop keeps the LAST non-blank page it sees.
        for page in pages:
            if page.url != "about:blank":
                active_page = page
    else:
        return None

    # Take screenshot
    try:
        screenshot = await active_page.screenshot(
            type='jpeg',
            quality=75,
            scale="css"
        )
        encoded = base64.b64encode(screenshot).decode('utf-8')
        return encoded
    except Exception as e:
        # Best-effort capture: any Playwright failure is reported as None.
        return None
334
+
335
+
336
class ConfigManager:
    """Tracks registered Gradio components so their current values can be
    saved to, and restored from, JSON configuration files."""

    def __init__(self):
        self.components = {}
        self.component_order = []

    def register_component(self, name: str, component):
        """Register *component* under *name* and return it.

        Re-registering an existing name replaces the component but keeps its
        original position in the ordering.
        """
        self.components[name] = component
        if name not in self.component_order:
            self.component_order.append(name)
        return component

    def save_current_config(self):
        """Snapshot every registered component's value and persist it."""
        snapshot = {
            name: getattr(self.components[name], "value", None)
            for name in self.component_order
        }
        return save_config_to_file(snapshot)

    def update_ui_from_config(self, config_file):
        """Build per-component gr.update() objects from a loaded config file.

        Returns one update per registered component plus a trailing status
        message string.
        """
        if config_file is None:
            return [gr.update() for _ in self.component_order] + ["No file selected."]

        loaded_config = load_config_from_file(config_file.name)
        if not isinstance(loaded_config, dict):
            return [gr.update() for _ in self.component_order] + ["Error: Invalid configuration file."]

        updates = [
            gr.update(value=loaded_config[name]) if name in loaded_config else gr.update()
            for name in self.component_order
        ]
        updates.append("Configuration loaded successfully.")
        return updates

    def get_all_components(self):
        """Return all registered components in registration order."""
        return [self.components[name] for name in self.component_order]
382
+
383
+
384
def load_config_from_file(config_file):
    """Load settings from a JSON config file.

    Returns the parsed dict on success, or an error message string on failure
    (callers distinguish the two cases with an isinstance check).
    """
    try:
        with open(config_file, 'r') as fh:
            return json.load(fh)
    except Exception as e:
        return f"Error loading configuration: {str(e)}"
392
+
393
+
394
def save_config_to_file(settings, save_dir="./tmp/webui_settings"):
    """Write *settings* as pretty-printed JSON to a fresh UUID-named file.

    Creates *save_dir* if needed and returns a human-readable status message
    containing the path written.
    """
    os.makedirs(save_dir, exist_ok=True)
    target = os.path.join(save_dir, f"{uuid.uuid4()}.json")
    with open(target, 'w') as fh:
        json.dump(settings, fh, indent=2)
    return f"Configuration saved to {target}"
supervisord.conf ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
; Supervisor process tree for the containerised web UI:
;   xvfb -> vnc_setup -> x11vnc (+ log tail) -> novnc -> persistent_browser -> webui
; Startup ordering is driven by the priority values (lower starts first)
; plus the sleep/startsecs delays inside the commands.
; NOTE(review): `depends_on` is not a documented supervisord program option —
; confirm the supervisor version in use honors it; otherwise ordering relies
; solely on priority/startsecs/sleep.

[supervisord]
user=root
nodaemon=true
logfile=/dev/stdout
logfile_maxbytes=0
loglevel=debug

[program:xvfb]
; Virtual framebuffer X server on display :99 at the configured resolution.
command=Xvfb :99 -screen 0 %(ENV_RESOLUTION)s -ac +extension GLX +render -noreset
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
priority=100
startsecs=3
stopsignal=TERM
stopwaitsecs=10

[program:vnc_setup]
; One-shot job: write the VNC password file before x11vnc starts.
command=bash -c "mkdir -p ~/.vnc && echo '%(ENV_VNC_PASSWORD)s' | vncpasswd -f > ~/.vnc/passwd && chmod 600 ~/.vnc/passwd && ls -la ~/.vnc/passwd"
autorestart=false
startsecs=0
priority=150
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0

[program:x11vnc]
; VNC server exposing display :99 on port 5901, password-protected.
command=bash -c "mkdir -p /var/log && touch /var/log/x11vnc.log && chmod 666 /var/log/x11vnc.log && sleep 5 && DISPLAY=:99 x11vnc -display :99 -forever -shared -rfbauth /root/.vnc/passwd -rfbport 5901 -o /var/log/x11vnc.log"
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
priority=200
startretries=10
startsecs=10
stopsignal=TERM
stopwaitsecs=10
depends_on=vnc_setup,xvfb

[program:x11vnc_log]
; Mirrors the x11vnc log file to the container's stdout.
command=bash -c "mkdir -p /var/log && touch /var/log/x11vnc.log && tail -f /var/log/x11vnc.log"
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
priority=250
stopsignal=TERM
stopwaitsecs=5
depends_on=x11vnc

[program:novnc]
; Browser-accessible VNC client: proxies localhost:5901 to HTTP port 6080.
command=bash -c "sleep 5 && cd /opt/novnc && ./utils/novnc_proxy --vnc localhost:5901 --listen 0.0.0.0:6080 --web /opt/novnc"
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
priority=300
startretries=5
startsecs=3
depends_on=x11vnc

[program:persistent_browser]
; Long-lived Chromium instance with remote debugging exposed on port 9222.
environment=START_URL="data:text/html,<html><body><h1>Browser Ready</h1></body></html>"
command=bash -c "mkdir -p /app/data/chrome_data && sleep 8 && $(find /ms-playwright/chromium-*/chrome-linux -name chrome) --user-data-dir=/app/data/chrome_data --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --disable-software-rasterizer --disable-setuid-sandbox --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=9222 --remote-debugging-address=0.0.0.0 \"$START_URL\""
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
priority=350
startretries=5
startsecs=10
stopsignal=TERM
stopwaitsecs=15
depends_on=novnc

[program:webui]
; The Gradio web application itself.
command=python webui.py --ip 0.0.0.0 --port 7788
directory=/app
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
priority=400
startretries=3
startsecs=3
stopsignal=TERM
stopwaitsecs=10
depends_on=persistent_browser
tests/test_browser_use.py ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdb
2
+
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+ import sys
7
+
8
+ sys.path.append(".")
9
+ import asyncio
10
+ import os
11
+ import sys
12
+ from pprint import pprint
13
+
14
+ from browser_use import Agent
15
+ from browser_use.agent.views import AgentHistoryList
16
+
17
+ from src.utils import utils
18
+
19
+
20
async def test_browser_use_org():
    """Smoke-test the stock browser-use Agent on a simple Google search task.

    Manual/integration test: requires a running local Ollama server (or one of
    the commented-out providers) and a display, since the browser is launched
    non-headless.
    """
    from browser_use.browser.browser import Browser, BrowserConfig
    from browser_use.browser.context import (
        BrowserContextConfig,
        BrowserContextWindowSize,
    )

    # llm = utils.get_llm_model(
    #     provider="azure_openai",
    #     model_name="gpt-4o",
    #     temperature=0.8,
    #     base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
    #     api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
    # )

    # llm = utils.get_llm_model(
    #     provider="deepseek",
    #     model_name="deepseek-chat",
    #     temperature=0.8
    # )

    llm = utils.get_llm_model(
        provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
    )

    window_w, window_h = 1920, 1080
    use_vision = False
    use_own_browser = False
    if use_own_browser:
        # Empty CHROME_PATH means "let Playwright pick its bundled Chromium".
        chrome_path = os.getenv("CHROME_PATH", None)
        if chrome_path == "":
            chrome_path = None
    else:
        chrome_path = None

    tool_calling_method = "json_schema"  # set to json_schema when using ollama

    browser = Browser(
        config=BrowserConfig(
            headless=False,
            disable_security=True,
            chrome_instance_path=chrome_path,
            extra_chromium_args=[f"--window-size={window_w},{window_h}"],
        )
    )
    async with await browser.new_context(
        config=BrowserContextConfig(
            trace_path="./tmp/traces",
            save_recording_path="./tmp/record_videos",
            no_viewport=False,
            browser_window_size=BrowserContextWindowSize(
                width=window_w, height=window_h
            ),
        )
    ) as browser_context:
        agent = Agent(
            task="go to google.com and type 'OpenAI' click search and give me the first url",
            llm=llm,
            browser_context=browser_context,
            use_vision=use_vision,
            tool_calling_method=tool_calling_method
        )
        history: AgentHistoryList = await agent.run(max_steps=10)

        print("Final Result:")
        pprint(history.final_result(), indent=4)

        print("\nErrors:")
        pprint(history.errors(), indent=4)

        # e.g. xPaths the model clicked on
        print("\nModel Outputs:")
        pprint(history.model_actions(), indent=4)

        print("\nThoughts:")
        pprint(history.model_thoughts(), indent=4)
    # close browser
    await browser.close()
98
+
99
+
100
async def test_browser_use_custom():
    """Manual integration test for CustomAgent on a multi-tab task.

    Requires Azure OpenAI credentials by default (see the commented-out
    alternatives for other providers) and, with use_own_browser enabled, a
    local Chrome pointed to by CHROME_PATH.
    """
    from browser_use.browser.context import BrowserContextWindowSize
    from browser_use.browser.browser import BrowserConfig
    from playwright.async_api import async_playwright

    from src.agent.custom_agent import CustomAgent
    from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
    from src.browser.custom_browser import CustomBrowser
    from src.browser.custom_context import BrowserContextConfig
    from src.controller.custom_controller import CustomController

    window_w, window_h = 1280, 1100

    # llm = utils.get_llm_model(
    #     provider="openai",
    #     model_name="gpt-4o",
    #     temperature=0.8,
    #     base_url=os.getenv("OPENAI_ENDPOINT", ""),
    #     api_key=os.getenv("OPENAI_API_KEY", ""),
    # )

    llm = utils.get_llm_model(
        provider="azure_openai",
        model_name="gpt-4o",
        temperature=0.5,
        base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
        api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
    )

    # llm = utils.get_llm_model(
    #     provider="google",
    #     model_name="gemini-2.0-flash",
    #     temperature=0.6,
    #     api_key=os.getenv("GOOGLE_API_KEY", "")
    # )

    # llm = utils.get_llm_model(
    #     provider="deepseek",
    #     model_name="deepseek-reasoner",
    #     temperature=0.8
    # )

    # llm = utils.get_llm_model(
    #     provider="deepseek",
    #     model_name="deepseek-chat",
    #     temperature=0.8
    # )

    # llm = utils.get_llm_model(
    #     provider="ollama", model_name="qwen2.5:7b", temperature=0.5
    # )

    # llm = utils.get_llm_model(
    #     provider="ollama", model_name="deepseek-r1:14b", temperature=0.5
    # )

    controller = CustomController()
    use_own_browser = True
    disable_security = True
    use_vision = True  # Set to False when using DeepSeek

    max_actions_per_step = 10
    playwright = None
    browser = None
    browser_context = None

    try:
        extra_chromium_args = [f"--window-size={window_w},{window_h}"]
        if use_own_browser:
            # Empty CHROME_PATH means "use the bundled Chromium instead".
            chrome_path = os.getenv("CHROME_PATH", None)
            if chrome_path == "":
                chrome_path = None
            chrome_user_data = os.getenv("CHROME_USER_DATA", None)
            if chrome_user_data:
                extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
        else:
            chrome_path = None
        browser = CustomBrowser(
            config=BrowserConfig(
                headless=False,
                disable_security=disable_security,
                chrome_instance_path=chrome_path,
                extra_chromium_args=extra_chromium_args,
            )
        )
        browser_context = await browser.new_context(
            config=BrowserContextConfig(
                trace_path="./tmp/traces",
                save_recording_path="./tmp/record_videos",
                no_viewport=False,
                browser_window_size=BrowserContextWindowSize(
                    width=window_w, height=window_h
                ),
            )
        )
        agent = CustomAgent(
            task="open youtube in tab 1 , open google email in tab 2, open facebook in tab 3",
            add_infos="",  # some hints for llm to complete the task
            llm=llm,
            browser=browser,
            browser_context=browser_context,
            controller=controller,
            system_prompt_class=CustomSystemPrompt,
            agent_prompt_class=CustomAgentMessagePrompt,
            use_vision=use_vision,
            max_actions_per_step=max_actions_per_step,
            generate_gif=True
        )
        history: AgentHistoryList = await agent.run(max_steps=100)

        print("Final Result:")
        pprint(history.final_result(), indent=4)

        print("\nErrors:")
        pprint(history.errors(), indent=4)

        # e.g. xPaths the model clicked on
        print("\nModel Outputs:")
        pprint(history.model_actions(), indent=4)

        print("\nThoughts:")
        pprint(history.model_thoughts(), indent=4)


    except Exception:
        import traceback

        traceback.print_exc()
    finally:
        # Explicitly close the persistent browser context
        if browser_context:
            await browser_context.close()

        # Stop the Playwright object (never started here; kept for symmetry)
        if playwright:
            await playwright.stop()
        if browser:
            await browser.close()
238
+
239
+
240
async def test_browser_use_parallel():
    """Run several stock Agents concurrently against one shared browser.

    Manual integration test: requires GOOGLE_API_KEY and a display (the
    browser runs non-headless).
    """
    from browser_use.browser.context import BrowserContextWindowSize
    from browser_use.browser.browser import BrowserConfig
    from playwright.async_api import async_playwright
    from browser_use.browser.browser import Browser
    from src.agent.custom_agent import CustomAgent
    from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
    from src.browser.custom_browser import CustomBrowser
    from src.browser.custom_context import BrowserContextConfig
    from src.controller.custom_controller import CustomController

    window_w, window_h = 1920, 1080

    # get_llm_model dispatches on "google" (there is no "gemini" branch in
    # utils.get_llm_model), so "gemini" raised ValueError here.
    llm = utils.get_llm_model(
        provider="google",
        model_name="gemini-2.0-flash-exp",
        temperature=1.0,
        api_key=os.getenv("GOOGLE_API_KEY", "")
    )

    controller = CustomController()
    use_own_browser = True
    disable_security = True
    use_vision = True  # Set to False when using DeepSeek

    max_actions_per_step = 1
    playwright = None
    browser = None
    browser_context = None

    browser = Browser(
        config=BrowserConfig(
            disable_security=True,
            headless=False,
            new_context_config=BrowserContextConfig(save_recording_path='./tmp/recordings'),
        )
    )

    try:
        agents = [
            Agent(task=task, llm=llm, browser=browser)
            for task in [
                'Search Google for weather in Tokyo',
                'Check Reddit front page title',
                'Find NASA image of the day',
                'Check top story on CNN',
            ]
        ]

        # asyncio.gather returns one AgentHistoryList per agent; the original
        # called history.final_result() on the list itself (AttributeError)
        # and left a pdb.set_trace() breakpoint in — both fixed here.
        histories = await asyncio.gather(*[agent.run() for agent in agents])
        for history in histories:
            print("Final Result:")
            pprint(history.final_result(), indent=4)

            print("\nErrors:")
            pprint(history.errors(), indent=4)

            # e.g. xPaths the model clicked on
            print("\nModel Outputs:")
            pprint(history.model_actions(), indent=4)

            print("\nThoughts:")
            pprint(history.model_thoughts(), indent=4)
    except Exception:
        import traceback

        traceback.print_exc()
    finally:
        # Explicitly close the persistent browser context
        if browser_context:
            await browser_context.close()

        # Stop the Playwright object (never started here; kept for symmetry)
        if playwright:
            await playwright.stop()
        if browser:
            await browser.close()
359
+
360
+
361
if __name__ == "__main__":
    # Pick exactly one scenario; the alternatives are kept for manual runs.
    # asyncio.run(test_browser_use_org())
    # asyncio.run(test_browser_use_parallel())
    asyncio.run(test_browser_use_custom())
tests/test_deep_research.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+ import sys
7
+
8
+ sys.path.append(".")
9
+
10
async def test_deep_research():
    """Manual integration test for the deep_research pipeline.

    Requires GOOGLE_API_KEY; runs one search iteration with up to three
    queries using the project-managed browser.
    """
    from src.utils.deep_research import deep_research
    from src.utils import utils

    task = "write a report about DeepSeek-R1, get its pdf"
    # get_llm_model dispatches on "google" (there is no "gemini" branch in
    # utils.get_llm_model), so "gemini" raised ValueError before any research ran.
    llm = utils.get_llm_model(
        provider="google",
        model_name="gemini-2.0-flash-thinking-exp-01-21",
        temperature=1.0,
        api_key=os.getenv("GOOGLE_API_KEY", "")
    )

    report_content, report_file_path = await deep_research(task=task, llm=llm, agent_state=None,
                                                           max_search_iterations=1,
                                                           max_query_num=3,
                                                           use_own_browser=False)


if __name__ == "__main__":
    asyncio.run(test_deep_research())
tests/test_llm_api.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import pdb
import sys
from dataclasses import dataclass
from typing import Optional

from dotenv import load_dotenv
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_ollama import ChatOllama

load_dotenv()

sys.path.append(".")
14
+
15
@dataclass
class LLMConfig:
    """Configuration bundle for instantiating a chat LLM under test.

    Attributes:
        provider: Provider key understood by ``utils.get_llm_model`` (e.g. "openai").
        model_name: Provider-specific model identifier.
        temperature: Sampling temperature forwarded to the model.
        base_url: Optional endpoint override; resolved from env vars when None.
        api_key: Optional API key override; resolved from env vars when None.
    """
    provider: str
    model_name: str
    temperature: float = 0.8
    # Fields defaulting to None must be annotated Optional — a bare
    # `str = None` is an implicit-Optional violation (PEP 484).
    base_url: Optional[str] = None
    api_key: Optional[str] = None
23
def create_message_content(text, image_path=None):
    """Build a multimodal message payload: one text part plus an optional inline image."""
    parts = [{"type": "text", "text": text}]
    if image_path:
        from src.utils import utils

        # PNG keeps its own MIME subtype; anything else is labelled JPEG.
        image_format = "png" if image_path.endswith(".png") else "jpeg"
        encoded = utils.encode_image(image_path)
        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/{image_format};base64,{encoded}"}
        }
        parts.append(image_part)
    return parts
34
+
35
def get_env_value(key, provider):
    """Resolve the env-var value backing ``key`` ("api_key"/"base_url") for a provider.

    Returns "" when the provider or key has no known mapping, or when the
    mapped environment variable is unset.
    """
    env_mappings = {
        "openai": {"api_key": "OPENAI_API_KEY", "base_url": "OPENAI_ENDPOINT"},
        "azure_openai": {"api_key": "AZURE_OPENAI_API_KEY", "base_url": "AZURE_OPENAI_ENDPOINT"},
        "google": {"api_key": "GOOGLE_API_KEY"},
        "deepseek": {"api_key": "DEEPSEEK_API_KEY", "base_url": "DEEPSEEK_ENDPOINT"},
        "mistral": {"api_key": "MISTRAL_API_KEY", "base_url": "MISTRAL_ENDPOINT"},
        "alibaba": {"api_key": "ALIBABA_API_KEY", "base_url": "ALIBABA_ENDPOINT"},
        "moonshot": {"api_key": "MOONSHOT_API_KEY", "base_url": "MOONSHOT_ENDPOINT"},
    }

    env_name = env_mappings.get(provider, {}).get(key)
    if env_name is None:
        return ""
    return os.getenv(env_name, "")
49
+
50
def test_llm(config, query, image_path=None, system_message=None):
    """Send ``query`` (optionally with an image and a system prompt) to the
    model described by ``config`` and print the response.

    Args:
        config: LLMConfig describing provider/model/credentials.
        query: User prompt text.
        image_path: Optional path of an image to attach to the prompt.
        system_message: Optional system prompt.

    Fix: removed two leftover ``pdb.set_trace()`` breakpoints (and the debug
    ``print(llm.model_name)``) that dropped unattended runs into an
    interactive debugger.
    """
    from src.utils import utils

    # Special handling for Ollama-based models: invoked with the raw query
    # string instead of a structured message list.
    if config.provider == "ollama":
        if "deepseek-r1" in config.model_name:
            from src.utils.llm import DeepSeekR1ChatOllama
            llm = DeepSeekR1ChatOllama(model=config.model_name)
        else:
            llm = ChatOllama(model=config.model_name)

        ai_msg = llm.invoke(query)
        print(ai_msg.content)
        return

    # For other providers, use the standard configuration; explicit config
    # values win over environment-variable fallbacks.
    llm = utils.get_llm_model(
        provider=config.provider,
        model_name=config.model_name,
        temperature=config.temperature,
        base_url=config.base_url or get_env_value("base_url", config.provider),
        api_key=config.api_key or get_env_value("api_key", config.provider)
    )

    # Prepare messages for non-Ollama models
    messages = []
    if system_message:
        messages.append(SystemMessage(content=create_message_content(system_message)))
    messages.append(HumanMessage(content=create_message_content(query, image_path)))
    ai_msg = llm.invoke(messages)

    # Reasoning models (e.g. deepseek-reasoner) expose their chain of thought
    # separately; print it when present.
    if hasattr(ai_msg, "reasoning_content"):
        print(ai_msg.reasoning_content)
    print(ai_msg.content)
92
def test_openai_model():
    """Vision smoke test against OpenAI gpt-4o."""
    test_llm(LLMConfig(provider="openai", model_name="gpt-4o"), "Describe this image", "assets/examples/test.png")


def test_google_model():
    """Vision smoke test against Gemini."""
    # Enable your API key first if you haven't: https://ai.google.dev/palm_docs/oauth_quickstart
    test_llm(LLMConfig(provider="google", model_name="gemini-2.0-flash-exp"), "Describe this image", "assets/examples/test.png")


def test_azure_openai_model():
    """Vision smoke test against Azure-hosted gpt-4o."""
    test_llm(LLMConfig(provider="azure_openai", model_name="gpt-4o"), "Describe this image", "assets/examples/test.png")


def test_deepseek_model():
    """Text-only smoke test against DeepSeek chat."""
    test_llm(LLMConfig(provider="deepseek", model_name="deepseek-chat"), "Who are you?")


def test_deepseek_r1_model():
    """Reasoning-model smoke test (deepseek-reasoner) with a system prompt."""
    test_llm(LLMConfig(provider="deepseek", model_name="deepseek-reasoner"),
             "Which is greater, 9.11 or 9.8?", system_message="You are a helpful AI assistant.")


def test_ollama_model():
    """Text-only smoke test against a local Ollama model."""
    test_llm(LLMConfig(provider="ollama", model_name="qwen2.5:7b"), "Sing a ballad of LangChain.")


def test_deepseek_r1_ollama_model():
    """Reasoning-model smoke test against a local deepseek-r1 via Ollama."""
    test_llm(LLMConfig(provider="ollama", model_name="deepseek-r1:14b"), "How many 'r's are in the word 'strawberry'?")


def test_mistral_model():
    """Vision smoke test against Mistral's pixtral."""
    test_llm(LLMConfig(provider="mistral", model_name="pixtral-large-latest"), "Describe this image", "assets/examples/test.png")


def test_moonshot_model():
    """Vision smoke test against Moonshot's vision preview model."""
    test_llm(LLMConfig(provider="moonshot", model_name="moonshot-v1-32k-vision-preview"), "Describe this image", "assets/examples/test.png")
128
+
129
# Manual entry point — uncomment exactly the provider smoke test you want to run.
if __name__ == "__main__":
    # test_openai_model()
    # test_google_model()
    # test_azure_openai_model()
    # test_deepseek_model()
    # test_ollama_model()
    test_deepseek_r1_model()
    # test_deepseek_r1_ollama_model()
    # test_mistral_model()
tests/test_playwright.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdb
2
+ from dotenv import load_dotenv
3
+
4
+ load_dotenv()
5
+
6
+
7
def test_connect_browser():
    """Launch a persistent, headed Chrome profile via Playwright, open Gmail,
    and keep the window alive until the operator presses Enter.

    Reads CHROME_PATH and CHROME_USER_DATA from the environment.
    """
    import os
    from playwright.sync_api import sync_playwright

    executable = os.getenv("CHROME_PATH", "")
    profile_dir = os.getenv("CHROME_USER_DATA", "")

    with sync_playwright() as playwright:
        # launch_persistent_context returns a BrowserContext bound to the
        # given user-data directory (real profile, cookies included).
        context = playwright.chromium.launch_persistent_context(
            user_data_dir=profile_dir,
            executable_path=executable,
            headless=False  # Keep browser window visible
        )

        page = context.new_page()
        page.goto("https://mail.google.com/mail/u/0/#inbox")
        page.wait_for_load_state()

        input("Press the Enter key to close the browser...")

        context.close()
28
+
29
+
30
# Manual entry point: opens a real (headed) Chrome session — not for CI.
if __name__ == '__main__':
    test_connect_browser()
webui.py ADDED
@@ -0,0 +1,1203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdb
2
+ import logging
3
+
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+ import os
8
+ import glob
9
+ import asyncio
10
+ import argparse
11
+ import os
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ import gradio as gr
16
+ import inspect
17
+ from functools import wraps
18
+
19
+ from browser_use.agent.service import Agent
20
+ from playwright.async_api import async_playwright
21
+ from browser_use.browser.browser import Browser, BrowserConfig
22
+ from browser_use.browser.context import (
23
+ BrowserContextConfig,
24
+ BrowserContextWindowSize,
25
+ )
26
+ from langchain_ollama import ChatOllama
27
+ from playwright.async_api import async_playwright
28
+ from src.utils.agent_state import AgentState
29
+
30
+ from src.utils import utils
31
+ from src.agent.custom_agent import CustomAgent
32
+ from src.browser.custom_browser import CustomBrowser
33
+ from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
34
+ from src.browser.custom_context import BrowserContextConfig, CustomBrowserContext
35
+ from src.controller.custom_controller import CustomController
36
+ from gradio.themes import Citrus, Default, Glass, Monochrome, Ocean, Origin, Soft, Base
37
+ from src.utils.utils import update_model_dropdown, get_latest_files, capture_screenshot, MissingAPIKeyError
38
+ from src.utils import utils
39
+
40
# Global variables for persistence: browser/context/agent singletons reused
# across Gradio callbacks so a browser session can survive multiple tasks
# (see the keep_browser_open handling in the run_* functions below).
_global_browser = None
_global_browser_context = None
_global_agent = None

# Create the global agent state instance (shared stop flag for deep research)
_global_agent_state = AgentState()

# webui config: persists and restores values of registered UI components
webui_config_manager = utils.ConfigManager()
50
+
51
+
52
def scan_and_register_components(blocks):
    """Walk a Gradio Blocks tree and register every interactive component
    (buttons excluded) with the global config manager."""
    global webui_config_manager

    def _walk(node, prefix=""):
        count = 0
        if not hasattr(node, "children"):
            return count
        for index, child in enumerate(node.children):
            if isinstance(child, gr.components.Component):
                # Skip buttons and anything non-interactive.
                if not getattr(child, "interactive", False) or isinstance(child, gr.Button):
                    continue
                name = f"{prefix}component_{index}"
                if hasattr(child, "label") and child.label:
                    # Prefer the component's label as its registered name.
                    label = child.label
                    name = f"{prefix}{label}"
                logger.debug(f"Registering component: {name}")
                webui_config_manager.register_component(name, child)
                count += 1
            elif hasattr(child, "children"):
                # Recurse into nested containers.
                count += _walk(child, f"{prefix}block_{index}_")
        return count

    total = _walk(blocks)
    logger.info(f"Total registered components: {total}")
82
+
83
+
84
def save_current_config():
    # Thin wrapper so Gradio callbacks don't reference the manager directly.
    return webui_config_manager.save_current_config()


def update_ui_from_config(config_file):
    # Delegate: load a saved config file and push its values back into the UI.
    return webui_config_manager.update_ui_from_config(config_file)
90
+
91
+
92
def resolve_sensitive_env_variables(text):
    """Expand $SENSITIVE_* placeholders in ``text`` with environment values.

    Only names beginning with SENSITIVE_ are considered; placeholders whose
    variable is unset are left untouched. Falsy input is returned unchanged.
    """
    if not text:
        return text

    import re

    # All $SENSITIVE_* tokens present in the text.
    placeholders = re.findall(r'\$SENSITIVE_[A-Za-z0-9_]*', text)

    resolved = text
    for placeholder in placeholders:
        env_value = os.getenv(placeholder[1:])  # strip the leading "$"
        if env_value is not None:
            resolved = resolved.replace(placeholder, env_value)

    return resolved
115
+
116
+
117
async def stop_agent():
    """Request the running agent to stop and reflect that in the UI.

    Returns a tuple of Gradio updates for (stop_button, run_button).

    Fix: the original implicitly returned None when no agent was running
    (the `if` had no else and the try fell through), leaving Gradio without
    updates for its two outputs; now every path returns a full tuple.
    """
    global _global_agent

    try:
        if _global_agent is not None:
            # Request stop; the agent halts at its next safe checkpoint.
            _global_agent.stop()
            message = "Stop requested - the agent will halt at the next safe point"
            logger.info(f"🛑 {message}")

            return (
                gr.update(value="Stopping...", interactive=False),  # stop_button
                gr.update(interactive=False),  # run_button
            )

        # No agent running: nothing to stop, keep both buttons usable.
        return (
            gr.update(value="Stop", interactive=True),
            gr.update(interactive=True)
        )
    except Exception as e:
        error_msg = f"Error during stop: {str(e)}"
        logger.error(error_msg)
        return (
            gr.update(value="Stop", interactive=True),
            gr.update(interactive=True)
        )
141
+
142
+
143
async def stop_research_agent():
    """Flag the shared deep-research agent state to stop; return button updates."""
    global _global_agent_state

    try:
        # Raise the stop flag; the research loop checks it between steps.
        _global_agent_state.request_stop()

        message = "Stop requested - the agent will halt at the next safe point"
        logger.info(f"🛑 {message}")

        return (
            gr.update(value="Stopping...", interactive=False),  # stop_button
            gr.update(interactive=False),  # run_button
        )
    except Exception as exc:
        logger.error(f"Error during stop: {str(exc)}")
        return (
            gr.update(value="Stop", interactive=True),
            gr.update(interactive=True)
        )
167
+
168
+
169
async def run_browser_agent(
        agent_type,
        llm_provider,
        llm_model_name,
        llm_num_ctx,
        llm_temperature,
        llm_base_url,
        llm_api_key,
        use_own_browser,
        keep_browser_open,
        headless,
        disable_security,
        window_w,
        window_h,
        save_recording_path,
        save_agent_history_path,
        save_trace_path,
        enable_recording,
        task,
        add_infos,
        max_steps,
        use_vision,
        max_actions_per_step,
        tool_calling_method,
        chrome_cdp,
        max_input_tokens
):
    """Dispatch a browser task to the selected agent implementation ("org" or
    "custom") and collect outputs for the Gradio UI.

    Returns a 9-tuple:
        (final_result, errors, model_actions, model_thoughts, gif_path,
         trace_file, history_file, stop_button_update, run_button_update)
    """
    try:
        # Disable recording if the checkbox is unchecked
        if not enable_recording:
            save_recording_path = None

        # Ensure the recording directory exists if recording is enabled
        if save_recording_path:
            os.makedirs(save_recording_path, exist_ok=True)

        # Get the list of existing videos before the agent runs
        # NOTE(review): existing_videos is computed but never used — the
        # "latest video" diffing below is commented out. Dead setup code.
        existing_videos = set()
        if save_recording_path:
            existing_videos = set(
                glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4"))
                + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]"))
            )

        # Expand $SENSITIVE_* placeholders before handing the task to the LLM.
        task = resolve_sensitive_env_variables(task)

        # Run the agent
        llm = utils.get_llm_model(
            provider=llm_provider,
            model_name=llm_model_name,
            num_ctx=llm_num_ctx,
            temperature=llm_temperature,
            base_url=llm_base_url,
            api_key=llm_api_key,
        )
        if agent_type == "org":
            final_result, errors, model_actions, model_thoughts, trace_file, history_file = await run_org_agent(
                llm=llm,
                use_own_browser=use_own_browser,
                keep_browser_open=keep_browser_open,
                headless=headless,
                disable_security=disable_security,
                window_w=window_w,
                window_h=window_h,
                save_recording_path=save_recording_path,
                save_agent_history_path=save_agent_history_path,
                save_trace_path=save_trace_path,
                task=task,
                max_steps=max_steps,
                use_vision=use_vision,
                max_actions_per_step=max_actions_per_step,
                tool_calling_method=tool_calling_method,
                chrome_cdp=chrome_cdp,
                max_input_tokens=max_input_tokens
            )
        elif agent_type == "custom":
            final_result, errors, model_actions, model_thoughts, trace_file, history_file = await run_custom_agent(
                llm=llm,
                use_own_browser=use_own_browser,
                keep_browser_open=keep_browser_open,
                headless=headless,
                disable_security=disable_security,
                window_w=window_w,
                window_h=window_h,
                save_recording_path=save_recording_path,
                save_agent_history_path=save_agent_history_path,
                save_trace_path=save_trace_path,
                task=task,
                add_infos=add_infos,
                max_steps=max_steps,
                use_vision=use_vision,
                max_actions_per_step=max_actions_per_step,
                tool_calling_method=tool_calling_method,
                chrome_cdp=chrome_cdp,
                max_input_tokens=max_input_tokens
            )
        else:
            raise ValueError(f"Invalid agent type: {agent_type}")

        # Get the list of videos after the agent runs (if recording is enabled)
        # latest_video = None
        # if save_recording_path:
        #     new_videos = set(
        #         glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4"))
        #         + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]"))
        #     )
        #     if new_videos - existing_videos:
        #         latest_video = list(new_videos - existing_videos)[0]  # Get the first new video

        # The agent writes its animated run summary next to this script.
        gif_path = os.path.join(os.path.dirname(__file__), "agent_history.gif")

        return (
            final_result,
            errors,
            model_actions,
            model_thoughts,
            gif_path,
            trace_file,
            history_file,
            gr.update(value="Stop", interactive=True),  # Re-enable stop button
            gr.update(interactive=True)  # Re-enable run button
        )

    except MissingAPIKeyError as e:
        # Surface a clean, user-facing error in the UI without a traceback.
        logger.error(str(e))
        raise gr.Error(str(e), print_exception=False)

    except Exception as e:
        import traceback
        traceback.print_exc()
        errors = str(e) + "\n" + traceback.format_exc()
        return (
            '',  # final_result
            errors,  # errors
            '',  # model_actions
            '',  # model_thoughts
            None,  # recording_gif
            None,  # trace_file
            None,  # history_file
            gr.update(value="Stop", interactive=True),  # Re-enable stop button
            gr.update(interactive=True)  # Re-enable run button
        )
311
+
312
+
313
async def run_org_agent(
        llm,
        use_own_browser,
        keep_browser_open,
        headless,
        disable_security,
        window_w,
        window_h,
        save_recording_path,
        save_agent_history_path,
        save_trace_path,
        task,
        max_steps,
        use_vision,
        max_actions_per_step,
        tool_calling_method,
        chrome_cdp,
        max_input_tokens
):
    """Run the stock browser-use Agent on ``task``, reusing the module-level
    browser/context singletons when available.

    Returns (final_result, errors, model_actions, model_thoughts,
    trace_zip_path, history_file); on failure the first slot is '' and the
    error text (with traceback) is in the second.
    """
    try:
        global _global_browser, _global_browser_context, _global_agent

        extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"]
        cdp_url = chrome_cdp

        if use_own_browser:
            # Environment overrides for attaching to the user's own Chrome.
            cdp_url = os.getenv("CHROME_CDP", chrome_cdp)
            chrome_path = os.getenv("CHROME_PATH", None)
            if chrome_path == "":
                chrome_path = None
            chrome_user_data = os.getenv("CHROME_USER_DATA", None)
            if chrome_user_data:
                extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
        else:
            chrome_path = None

        # Lazily create the shared browser (kept alive across calls when
        # keep_browser_open is set — see the finally block).
        if _global_browser is None:
            _global_browser = Browser(
                config=BrowserConfig(
                    headless=headless,
                    cdp_url=cdp_url,
                    disable_security=disable_security,
                    chrome_instance_path=chrome_path,
                    extra_chromium_args=extra_chromium_args,
                )
            )

        if _global_browser_context is None:
            _global_browser_context = await _global_browser.new_context(
                config=BrowserContextConfig(
                    trace_path=save_trace_path if save_trace_path else None,
                    save_recording_path=save_recording_path if save_recording_path else None,
                    save_downloads_path="./tmp/downloads",
                    no_viewport=False,
                    browser_window_size=BrowserContextWindowSize(
                        width=window_w, height=window_h
                    ),
                )
            )

        if _global_agent is None:
            _global_agent = Agent(
                task=task,
                llm=llm,
                use_vision=use_vision,
                browser=_global_browser,
                browser_context=_global_browser_context,
                max_actions_per_step=max_actions_per_step,
                tool_calling_method=tool_calling_method,
                max_input_tokens=max_input_tokens,
                generate_gif=True
            )
        history = await _global_agent.run(max_steps=max_steps)

        # Persist the full step history for later inspection/download.
        history_file = os.path.join(save_agent_history_path, f"{_global_agent.state.agent_id}.json")
        _global_agent.save_history(history_file)

        final_result = history.final_result()
        errors = history.errors()
        model_actions = history.model_actions()
        model_thoughts = history.model_thoughts()

        # Most recent trace archive (if tracing was enabled).
        trace_file = get_latest_files(save_trace_path)

        return final_result, errors, model_actions, model_thoughts, trace_file.get('.zip'), history_file
    except Exception as e:
        import traceback
        traceback.print_exc()
        errors = str(e) + "\n" + traceback.format_exc()
        return '', errors, '', '', None, None
    finally:
        # The agent is always discarded; the browser/context survive only when
        # the user asked to keep the session open.
        _global_agent = None
        # Handle cleanup based on persistence configuration
        if not keep_browser_open:
            if _global_browser_context:
                await _global_browser_context.close()
                _global_browser_context = None

            if _global_browser:
                await _global_browser.close()
                _global_browser = None
414
+
415
+
416
async def run_custom_agent(
        llm,
        use_own_browser,
        keep_browser_open,
        headless,
        disable_security,
        window_w,
        window_h,
        save_recording_path,
        save_agent_history_path,
        save_trace_path,
        task,
        add_infos,
        max_steps,
        use_vision,
        max_actions_per_step,
        tool_calling_method,
        chrome_cdp,
        max_input_tokens
):
    """Run the project's CustomAgent (custom prompts/controller) on ``task``.

    Same contract as run_org_agent, plus ``add_infos`` — extra hints appended
    to the agent prompt.
    """
    try:
        global _global_browser, _global_browser_context, _global_agent

        extra_chromium_args = ["--accept_downloads=True", f"--window-size={window_w},{window_h}"]
        cdp_url = chrome_cdp
        if use_own_browser:
            cdp_url = os.getenv("CHROME_CDP", chrome_cdp)

            chrome_path = os.getenv("CHROME_PATH", None)
            if chrome_path == "":
                chrome_path = None
            chrome_user_data = os.getenv("CHROME_USER_DATA", None)
            if chrome_user_data:
                extra_chromium_args += [f"--user-data-dir={chrome_user_data}"]
        else:
            chrome_path = None

        controller = CustomController()

        # Initialize global browser if needed
        # if chrome_cdp not empty string nor None
        # A non-empty CDP URL forces a fresh browser even if one exists.
        if (_global_browser is None) or (cdp_url and cdp_url != "" and cdp_url != None):
            _global_browser = CustomBrowser(
                config=BrowserConfig(
                    headless=headless,
                    disable_security=disable_security,
                    cdp_url=cdp_url,
                    chrome_instance_path=chrome_path,
                    extra_chromium_args=extra_chromium_args,
                )
            )

        # NOTE(review): this condition tests `chrome_cdp` while the one above
        # tests `cdp_url` (which may come from CHROME_CDP) — looks
        # inconsistent; confirm which is intended.
        if _global_browser_context is None or (chrome_cdp and cdp_url != "" and cdp_url != None):
            _global_browser_context = await _global_browser.new_context(
                config=BrowserContextConfig(
                    trace_path=save_trace_path if save_trace_path else None,
                    save_recording_path=save_recording_path if save_recording_path else None,
                    no_viewport=False,
                    save_downloads_path="./tmp/downloads",
                    browser_window_size=BrowserContextWindowSize(
                        width=window_w, height=window_h
                    ),
                )
            )

        # Create and run agent
        if _global_agent is None:
            _global_agent = CustomAgent(
                task=task,
                add_infos=add_infos,
                use_vision=use_vision,
                llm=llm,
                browser=_global_browser,
                browser_context=_global_browser_context,
                controller=controller,
                system_prompt_class=CustomSystemPrompt,
                agent_prompt_class=CustomAgentMessagePrompt,
                max_actions_per_step=max_actions_per_step,
                tool_calling_method=tool_calling_method,
                max_input_tokens=max_input_tokens,
                generate_gif=True
            )
        history = await _global_agent.run(max_steps=max_steps)

        # Persist the full step history for later inspection/download.
        history_file = os.path.join(save_agent_history_path, f"{_global_agent.state.agent_id}.json")
        _global_agent.save_history(history_file)

        final_result = history.final_result()
        errors = history.errors()
        model_actions = history.model_actions()
        model_thoughts = history.model_thoughts()

        # Most recent trace archive (if tracing was enabled).
        trace_file = get_latest_files(save_trace_path)

        return final_result, errors, model_actions, model_thoughts, trace_file.get('.zip'), history_file
    except Exception as e:
        import traceback
        traceback.print_exc()
        errors = str(e) + "\n" + traceback.format_exc()
        return '', errors, '', '', None, None
    finally:
        _global_agent = None
        # Handle cleanup based on persistence configuration
        if not keep_browser_open:
            if _global_browser_context:
                await _global_browser_context.close()
                _global_browser_context = None

            if _global_browser:
                await _global_browser.close()
                _global_browser = None
527
+
528
+
529
async def run_with_stream(
        agent_type,
        llm_provider,
        llm_model_name,
        llm_num_ctx,
        llm_temperature,
        llm_base_url,
        llm_api_key,
        use_own_browser,
        keep_browser_open,
        headless,
        disable_security,
        window_w,
        window_h,
        save_recording_path,
        save_agent_history_path,
        save_trace_path,
        enable_recording,
        task,
        add_infos,
        max_steps,
        use_vision,
        max_actions_per_step,
        tool_calling_method,
        chrome_cdp,
        max_input_tokens
):
    """Generator wrapper around run_browser_agent for the Gradio UI.

    Headed mode: runs the agent to completion and yields one final update.
    Headless mode: runs the agent as a background task and periodically yields
    browser screenshots (as an <img> HTML block) until it finishes, then
    yields the final results. Each yield matches the 10 UI outputs:
    [html, final_result, errors, model_actions, model_thoughts,
     recording_gif, trace, history_file, stop_button, run_button].
    """
    global _global_agent

    # Stream viewport size in vw/vh, preserving the browser aspect ratio.
    stream_vw = 80
    stream_vh = int(80 * window_h // window_w)
    if not headless:
        result = await run_browser_agent(
            agent_type=agent_type,
            llm_provider=llm_provider,
            llm_model_name=llm_model_name,
            llm_num_ctx=llm_num_ctx,
            llm_temperature=llm_temperature,
            llm_base_url=llm_base_url,
            llm_api_key=llm_api_key,
            use_own_browser=use_own_browser,
            keep_browser_open=keep_browser_open,
            headless=headless,
            disable_security=disable_security,
            window_w=window_w,
            window_h=window_h,
            save_recording_path=save_recording_path,
            save_agent_history_path=save_agent_history_path,
            save_trace_path=save_trace_path,
            enable_recording=enable_recording,
            task=task,
            add_infos=add_infos,
            max_steps=max_steps,
            use_vision=use_vision,
            max_actions_per_step=max_actions_per_step,
            tool_calling_method=tool_calling_method,
            chrome_cdp=chrome_cdp,
            max_input_tokens=max_input_tokens
        )
        # Add HTML content at the start of the result array
        yield [gr.update(visible=False)] + list(result)
    else:
        try:
            # Run the browser agent in the background
            agent_task = asyncio.create_task(
                run_browser_agent(
                    agent_type=agent_type,
                    llm_provider=llm_provider,
                    llm_model_name=llm_model_name,
                    llm_num_ctx=llm_num_ctx,
                    llm_temperature=llm_temperature,
                    llm_base_url=llm_base_url,
                    llm_api_key=llm_api_key,
                    use_own_browser=use_own_browser,
                    keep_browser_open=keep_browser_open,
                    headless=headless,
                    disable_security=disable_security,
                    window_w=window_w,
                    window_h=window_h,
                    save_recording_path=save_recording_path,
                    save_agent_history_path=save_agent_history_path,
                    save_trace_path=save_trace_path,
                    enable_recording=enable_recording,
                    task=task,
                    add_infos=add_infos,
                    max_steps=max_steps,
                    use_vision=use_vision,
                    max_actions_per_step=max_actions_per_step,
                    tool_calling_method=tool_calling_method,
                    chrome_cdp=chrome_cdp,
                    max_input_tokens=max_input_tokens
                )
            )

            # Initialize values for streaming
            html_content = f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Using browser...</h1>"
            final_result = errors = model_actions = model_thoughts = ""
            recording_gif = trace = history_file = None

            # Periodically update the stream while the agent task is running
            while not agent_task.done():
                try:
                    encoded_screenshot = await capture_screenshot(_global_browser_context)
                    if encoded_screenshot is not None:
                        html_content = f'<img src="data:image/jpeg;base64,{encoded_screenshot}" style="width:{stream_vw}vw; height:{stream_vh}vh ; border:1px solid #ccc;">'
                    else:
                        html_content = f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Waiting for browser session...</h1>"
                except Exception as e:
                    # Screenshot failures are expected while the session spins up.
                    html_content = f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Waiting for browser session...</h1>"

                if _global_agent and _global_agent.state.stopped:
                    yield [
                        gr.HTML(value=html_content, visible=True),
                        final_result,
                        errors,
                        model_actions,
                        model_thoughts,
                        recording_gif,
                        trace,
                        history_file,
                        gr.update(value="Stopping...", interactive=False),  # stop_button
                        gr.update(interactive=False),  # run_button
                    ]
                    break
                else:
                    yield [
                        gr.HTML(value=html_content, visible=True),
                        final_result,
                        errors,
                        model_actions,
                        model_thoughts,
                        recording_gif,
                        trace,
                        history_file,
                        gr.update(),  # Re-enable stop button
                        gr.update()  # Re-enable run button
                    ]
                await asyncio.sleep(0.1)

            # Once the agent task completes, get the results
            # NOTE(review): in the two except branches below, stop_button and
            # run_button are never assigned, so the final yield would raise
            # NameError (and gr.Error leaves `errors` stale) — confirm and fix.
            try:
                result = await agent_task
                final_result, errors, model_actions, model_thoughts, recording_gif, trace, history_file, stop_button, run_button = result
            except gr.Error:
                final_result = ""
                model_actions = ""
                model_thoughts = ""
                recording_gif = trace = history_file = None

            except Exception as e:
                errors = f"Agent error: {str(e)}"

            yield [
                gr.HTML(value=html_content, visible=True),
                final_result,
                errors,
                model_actions,
                model_thoughts,
                recording_gif,
                trace,
                history_file,
                stop_button,
                run_button
            ]

        except Exception as e:
            import traceback
            yield [
                gr.HTML(
                    value=f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Waiting for browser session...</h1>",
                    visible=True),
                "",
                f"Error: {str(e)}\n{traceback.format_exc()}",
                "",
                "",
                None,
                None,
                None,
                gr.update(value="Stop", interactive=True),  # Re-enable stop button
                gr.update(interactive=True)  # Re-enable run button
            ]
710
+
711
+
712
# Define the theme map globally: maps the theme name selectable in the UI to
# an instantiated Gradio theme object (all instantiated once at import time).
theme_map = {
    "Default": Default(),
    "Soft": Soft(),
    "Monochrome": Monochrome(),
    "Glass": Glass(),
    "Origin": Origin(),
    "Citrus": Citrus(),
    "Ocean": Ocean(),
    "Base": Base()
}
723
+
724
+
725
async def close_global_browser():
    """Close the shared browser context and browser (if any), resetting the globals.

    The context is closed before the browser that owns it; each global is
    cleared immediately after closing so a failure mid-way cannot leave a
    dangling reference to a closed object.
    """
    global _global_browser, _global_browser_context

    if _global_browser_context:
        await _global_browser_context.close()
        _global_browser_context = None

    if _global_browser:
        await _global_browser.close()
        _global_browser = None
735
+
736
+
737
async def run_deep_search(research_task, max_search_iteration_input, max_query_per_iter_input, llm_provider,
                          llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, use_vision,
                          use_own_browser, headless, chrome_cdp):
    """Run the deep-research pipeline and return its report for the UI.

    Returns (markdown_content, report_file_path, stop_button_update,
    run_button_update). Uses the module-level _global_agent_state as the
    shared stop flag so stop_research_agent() can interrupt the run.
    """
    from src.utils.deep_research import deep_research
    global _global_agent_state

    # Clear any previous stop request
    _global_agent_state.clear_stop()

    llm = utils.get_llm_model(
        provider=llm_provider,
        model_name=llm_model_name,
        num_ctx=llm_num_ctx,
        temperature=llm_temperature,
        base_url=llm_base_url,
        api_key=llm_api_key,
    )
    markdown_content, file_path = await deep_research(research_task, llm, _global_agent_state,
                                                      max_search_iterations=max_search_iteration_input,
                                                      max_query_num=max_query_per_iter_input,
                                                      use_vision=use_vision,
                                                      headless=headless,
                                                      use_own_browser=use_own_browser,
                                                      chrome_cdp=chrome_cdp
                                                      )

    # Re-enable the Stop/Run buttons once the research has finished.
    return markdown_content, file_path, gr.update(value="Stop", interactive=True), gr.update(interactive=True)
764
+
765
+
766
def create_ui(theme_name="Ocean"):
    """Assemble the Gradio Blocks application for the Browser Use WebUI.

    Args:
        theme_name: Key into the module-level ``theme_map`` selecting the
            Gradio theme. Defaults to "Ocean".

    Returns:
        The fully wired ``gr.Blocks`` demo, ready for ``.launch()``.
    """
    css = """
    .gradio-container {
        width: 60vw !important;
        max-width: 60% !important;
        margin-left: auto !important;
        margin-right: auto !important;
        padding-top: 20px !important;
    }
    .header-text {
        text-align: center;
        margin-bottom: 30px;
    }
    .theme-section {
        margin-bottom: 20px;
        padding: 15px;
        border-radius: 10px;
    }
    """

    with gr.Blocks(
            title="Browser Use WebUI", theme=theme_map[theme_name], css=css
    ) as demo:
        with gr.Row():
            gr.Markdown(
                """
                # 🌐 Browser Use WebUI
                ### Control your browser with AI assistance
                """,
                elem_classes=["header-text"],
            )

        with gr.Tabs() as tabs:
            with gr.TabItem("⚙️ Agent Settings", id=1):
                with gr.Group():
                    agent_type = gr.Radio(
                        ["org", "custom"],
                        label="Agent Type",
                        value="custom",
                        info="Select the type of agent to use",
                        interactive=True
                    )
                    with gr.Column():
                        max_steps = gr.Slider(
                            minimum=1,
                            maximum=200,
                            value=100,
                            step=1,
                            label="Max Run Steps",
                            info="Maximum number of steps the agent will take",
                            interactive=True
                        )
                        max_actions_per_step = gr.Slider(
                            minimum=1,
                            maximum=100,
                            value=10,
                            step=1,
                            label="Max Actions per Step",
                            info="Maximum number of actions the agent will take per step",
                            interactive=True
                        )
                    with gr.Column():
                        use_vision = gr.Checkbox(
                            label="Use Vision",
                            value=True,
                            info="Enable visual processing capabilities",
                            interactive=True
                        )
                        max_input_tokens = gr.Number(
                            label="Max Input Tokens",
                            value=128000,
                            precision=0,
                            interactive=True
                        )
                        tool_calling_method = gr.Dropdown(
                            label="Tool Calling Method",
                            value="auto",
                            interactive=True,
                            allow_custom_value=True,  # Allow users to input custom method names
                            choices=["auto", "json_schema", "function_calling"],
                            info="Tool Calls Function Name",  # fixed typo: "Funtion"
                            visible=False
                        )

            with gr.TabItem("🔧 LLM Settings", id=2):
                with gr.Group():
                    llm_provider = gr.Dropdown(
                        choices=list(utils.model_names.keys()),
                        label="LLM Provider",
                        value="openai",
                        info="Select your preferred language model provider",
                        interactive=True
                    )
                    llm_model_name = gr.Dropdown(
                        label="Model Name",
                        choices=utils.model_names['openai'],
                        value="gpt-4o",
                        interactive=True,
                        allow_custom_value=True,  # Allow users to input custom model names
                        info="Select a model in the dropdown options or directly type a custom model name"
                    )
                    ollama_num_ctx = gr.Slider(
                        minimum=2 ** 8,
                        maximum=2 ** 16,
                        value=16000,
                        step=1,
                        label="Ollama Context Length",
                        info="Controls max context length model needs to handle (less = faster)",
                        visible=False,  # only shown when provider == "ollama"
                        interactive=True
                    )
                    llm_temperature = gr.Slider(
                        minimum=0.0,
                        maximum=2.0,
                        value=0.6,
                        step=0.1,
                        label="Temperature",
                        info="Controls randomness in model outputs",
                        interactive=True
                    )
                    with gr.Row():
                        llm_base_url = gr.Textbox(
                            label="Base URL",
                            value="",
                            info="API endpoint URL (if required)"
                        )
                        llm_api_key = gr.Textbox(
                            label="API Key",
                            type="password",
                            value="",
                            info="Your API key (leave blank to use .env)"
                        )

                # Show the Ollama context-length slider only for the "ollama" provider.
                def update_llm_num_ctx_visibility(llm_provider):
                    return gr.update(visible=llm_provider == "ollama")

                # Bind the change event of llm_provider to update the visibility
                # of the context-length slider.
                llm_provider.change(
                    fn=update_llm_num_ctx_visibility,
                    inputs=llm_provider,
                    outputs=ollama_num_ctx
                )

            with gr.TabItem("🌐 Browser Settings", id=3):
                with gr.Group():
                    with gr.Row():
                        use_own_browser = gr.Checkbox(
                            label="Use Own Browser",
                            value=False,
                            info="Use your existing browser instance",
                            interactive=True
                        )
                        keep_browser_open = gr.Checkbox(
                            label="Keep Browser Open",
                            value=False,
                            info="Keep Browser Open between Tasks",
                            interactive=True
                        )
                        headless = gr.Checkbox(
                            label="Headless Mode",
                            value=False,
                            info="Run browser without GUI",
                            interactive=True
                        )
                        disable_security = gr.Checkbox(
                            label="Disable Security",
                            value=True,
                            info="Disable browser security features",
                            interactive=True
                        )
                        enable_recording = gr.Checkbox(
                            label="Enable Recording",
                            value=True,
                            info="Enable saving browser recordings",
                            interactive=True
                        )

                    with gr.Row():
                        window_w = gr.Number(
                            label="Window Width",
                            value=1280,
                            info="Browser window width",
                            interactive=True
                        )
                        window_h = gr.Number(
                            label="Window Height",
                            value=1100,
                            info="Browser window height",
                            interactive=True
                        )

                    chrome_cdp = gr.Textbox(
                        label="CDP URL",
                        placeholder="http://localhost:9222",
                        value="",
                        info="CDP for google remote debugging",
                        interactive=True,
                    )

                    save_recording_path = gr.Textbox(
                        label="Recording Path",
                        placeholder="e.g. ./tmp/record_videos",
                        value="./tmp/record_videos",
                        info="Path to save browser recordings",
                        interactive=True,  # toggled off below when recording is disabled
                    )

                    save_trace_path = gr.Textbox(
                        label="Trace Path",
                        placeholder="e.g. ./tmp/traces",
                        value="./tmp/traces",
                        info="Path to save Agent traces",
                        interactive=True,
                    )

                    save_agent_history_path = gr.Textbox(
                        label="Agent History Save Path",
                        placeholder="e.g., ./tmp/agent_history",
                        value="./tmp/agent_history",
                        info="Specify the directory where agent history should be saved.",
                        interactive=True,
                    )

            with gr.TabItem("🤖 Run Agent", id=4):
                task = gr.Textbox(
                    label="Task Description",
                    lines=4,
                    placeholder="Enter your task here...",
                    value="go to google.com and type 'OpenAI' click search and give me the first url",
                    info="Describe what you want the agent to do",
                    interactive=True
                )
                add_infos = gr.Textbox(
                    label="Additional Information",
                    lines=3,
                    placeholder="Add any helpful context or instructions...",
                    info="Optional hints to help the LLM complete the task",
                    value="",
                    interactive=True
                )

                with gr.Row():
                    run_button = gr.Button("▶️ Run Agent", variant="primary", scale=2)
                    stop_button = gr.Button("⏹️ Stop", variant="stop", scale=1)

                with gr.Row():
                    browser_view = gr.HTML(
                        value="<h1 style='width:80vw; height:50vh'>Waiting for browser session...</h1>",
                        label="Live Browser View",
                        visible=False
                    )

                gr.Markdown("### Results")
                with gr.Row():
                    with gr.Column():
                        final_result_output = gr.Textbox(
                            label="Final Result", lines=3, show_label=True
                        )
                    with gr.Column():
                        errors_output = gr.Textbox(
                            label="Errors", lines=3, show_label=True
                        )
                with gr.Row():
                    with gr.Column():
                        model_actions_output = gr.Textbox(
                            label="Model Actions", lines=3, show_label=True, visible=False
                        )
                    with gr.Column():
                        model_thoughts_output = gr.Textbox(
                            label="Model Thoughts", lines=3, show_label=True, visible=False
                        )
                recording_gif = gr.Image(label="Result GIF", format="gif")
                trace_file = gr.File(label="Trace File")
                agent_history_file = gr.File(label="Agent History")

            with gr.TabItem("🧐 Deep Research", id=5):
                research_task_input = gr.Textbox(label="Research Task", lines=5,
                                                 value="Compose a report on the use of Reinforcement Learning for training Large Language Models, encompassing its origins, current advancements, and future prospects, substantiated with examples of relevant models and techniques. The report should reflect original insights and analysis, moving beyond mere summarization of existing literature.",
                                                 interactive=True)
                with gr.Row():
                    max_search_iteration_input = gr.Number(label="Max Search Iteration", value=3,
                                                           precision=0,
                                                           interactive=True)  # precision=0 ensures an integer
                    max_query_per_iter_input = gr.Number(label="Max Query per Iteration", value=1,
                                                         precision=0,
                                                         interactive=True)  # precision=0 ensures an integer
                with gr.Row():
                    research_button = gr.Button("▶️ Run Deep Research", variant="primary", scale=2)
                    stop_research_button = gr.Button("⏹ Stop", variant="stop", scale=1)
                markdown_output_display = gr.Markdown(label="Research Report")
                markdown_download = gr.File(label="Download Research Report")

            # Bind the stop button click event after errors_output is defined
            stop_button.click(
                fn=stop_agent,
                inputs=[],
                outputs=[stop_button, run_button],
            )

            # Run button click handler
            run_button.click(
                fn=run_with_stream,
                inputs=[
                    agent_type, llm_provider, llm_model_name, ollama_num_ctx, llm_temperature, llm_base_url,
                    llm_api_key,
                    use_own_browser, keep_browser_open, headless, disable_security, window_w, window_h,
                    save_recording_path, save_agent_history_path, save_trace_path,
                    enable_recording, task, add_infos, max_steps, use_vision, max_actions_per_step,
                    tool_calling_method, chrome_cdp, max_input_tokens
                ],
                outputs=[
                    browser_view,  # Browser view
                    final_result_output,  # Final result
                    errors_output,  # Errors
                    model_actions_output,  # Model actions
                    model_thoughts_output,  # Model thoughts
                    recording_gif,  # Latest recording
                    trace_file,  # Trace file
                    agent_history_file,  # Agent history file
                    stop_button,  # Stop button
                    run_button  # Run button
                ],
            )

            # Run Deep Research
            research_button.click(
                fn=run_deep_search,
                inputs=[research_task_input, max_search_iteration_input, max_query_per_iter_input, llm_provider,
                        llm_model_name, ollama_num_ctx, llm_temperature, llm_base_url, llm_api_key, use_vision,
                        use_own_browser, headless, chrome_cdp],
                outputs=[markdown_output_display, markdown_download, stop_research_button, research_button]
            )
            # Bind the stop button click event after errors_output is defined
            stop_research_button.click(
                fn=stop_research_agent,
                inputs=[],
                outputs=[stop_research_button, research_button],
            )

            with gr.TabItem("🎥 Recordings", id=7, visible=True):
                def list_recordings(save_recording_path):
                    """Return [(path, "N. filename")] gallery entries, oldest first."""
                    if not os.path.exists(save_recording_path):
                        return []

                    # Get all video files (mp4/webm, any capitalization)
                    recordings = glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) + glob.glob(
                        os.path.join(save_recording_path, "*.[wW][eE][bB][mM]"))

                    # Sort recordings by creation time (oldest first)
                    recordings.sort(key=os.path.getctime)

                    # Caption each recording with a 1-based index and its file name.
                    # (Previously the computed filename was unused and every caption
                    # was a literal placeholder.)
                    numbered_recordings = []
                    for idx, recording in enumerate(recordings, start=1):
                        filename = os.path.basename(recording)
                        numbered_recordings.append((recording, f"{idx}. {filename}"))

                    return numbered_recordings

                recordings_gallery = gr.Gallery(
                    label="Recordings",
                    columns=3,
                    height="auto",
                    object_fit="contain"
                )

                refresh_button = gr.Button("🔄 Refresh Recordings", variant="secondary")
                refresh_button.click(
                    fn=list_recordings,
                    inputs=save_recording_path,
                    outputs=recordings_gallery
                )

            with gr.TabItem("📁 UI Configuration", id=8):
                config_file_input = gr.File(
                    label="Load UI Settings from Config File",
                    file_types=[".json"],
                    interactive=True
                )
                with gr.Row():
                    load_config_button = gr.Button("Load Config", variant="primary")
                    save_config_button = gr.Button("Save UI Settings", variant="primary")

                config_status = gr.Textbox(
                    label="Status",
                    lines=2,
                    interactive=False
                )
                save_config_button.click(
                    fn=save_current_config,
                    inputs=[],  # no inputs needed
                    outputs=[config_status]
                )

        # Attach the callback to the LLM provider dropdown so the model list
        # follows the selected provider.
        llm_provider.change(
            lambda provider, api_key, base_url: update_model_dropdown(provider, api_key, base_url),
            inputs=[llm_provider, llm_api_key, llm_base_url],
            outputs=llm_model_name
        )

        # Recording path is only editable while recording is enabled.
        enable_recording.change(
            lambda enabled: gr.update(interactive=enabled),
            inputs=enable_recording,
            outputs=save_recording_path
        )

        # Changing browser-session options invalidates the shared browser.
        use_own_browser.change(fn=close_global_browser)
        keep_browser_open.change(fn=close_global_browser)

        # Register every component with the config manager so UI state can be
        # saved to / loaded from a JSON file.
        scan_and_register_components(demo)
        global webui_config_manager
        all_components = webui_config_manager.get_all_components()

        load_config_button.click(
            fn=update_ui_from_config,
            inputs=[config_file_input],
            outputs=all_components + [config_status]
        )
    return demo
1188
+
1189
+
1190
def main():
    """Parse CLI arguments and launch the WebUI server.

    Fix: ``--port`` was parsed but ignored — the launch call fell back to a
    hard-coded port 80 (which requires root) whenever the PORT environment
    variable was unset. The PORT env var (set by most hosted platforms) still
    takes precedence; ``--port`` is now the fallback.
    """
    parser = argparse.ArgumentParser(description="Gradio UI for Browser Agent")
    parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to")
    parser.add_argument("--port", type=int, default=7788, help="Port to listen on")
    parser.add_argument("--theme", type=str, default="Ocean", choices=theme_map.keys(), help="Theme to use for the UI")
    args = parser.parse_args()

    demo = create_ui(theme_name=args.theme)
    # NOTE(review): binding to 0.0.0.0 deliberately overrides --ip so the app
    # is reachable inside containers / hosted Spaces — confirm this is intended
    # before honoring --ip here.
    demo.launch(server_name="0.0.0.0", share=True, server_port=int(os.environ.get("PORT", args.port)))
1199
+
1200
+
1201
+
1202
# Script entry point: only launch the UI when run directly, not on import.
if __name__ == '__main__':
    main()