soiz1 committed on
Commit 9aaf513 · verified · 1 Parent(s): f201737

Upload 109 files

This view is limited to 50 files because it contains too many changes; see the raw diff for the rest.

Files changed (50)
  1. .dockerignore +10 -0
  2. .github/FUNDING.yml +13 -0
  3. .github/ISSUE_TEMPLATE/bug_report.md +11 -0
  4. .github/ISSUE_TEMPLATE/feature_request.md +10 -0
  5. .github/ISSUE_TEMPLATE/hallucination.md +12 -0
  6. .github/pull_request_template.md +5 -0
  7. .github/workflows/ci.yml +101 -0
  8. .github/workflows/publish-docker.yml +73 -0
  9. .gitignore +13 -0
  10. Dockerfile +34 -0
  11. Install.bat +21 -0
  12. Install.sh +18 -0
  13. LICENSE +201 -0
  14. README.md +134 -12
  15. app.py +368 -0
  16. backend/Dockerfile +36 -0
  17. backend/README.md +110 -0
  18. backend/__init__.py +0 -0
  19. backend/cache/cached_files_are_generated_here +0 -0
  20. backend/common/audio.py +36 -0
  21. backend/common/cache_manager.py +21 -0
  22. backend/common/compresser.py +58 -0
  23. backend/common/config_loader.py +25 -0
  24. backend/common/models.py +14 -0
  25. backend/configs/config.yaml +23 -0
  26. backend/db/__init__.py +0 -0
  27. backend/db/db_instance.py +42 -0
  28. backend/db/task/__init__.py +0 -0
  29. backend/db/task/dao.py +94 -0
  30. backend/db/task/models.py +174 -0
  31. backend/docker-compose.yaml +33 -0
  32. backend/main.py +92 -0
  33. backend/nginx/logs/logs_are_generated_here +0 -0
  34. backend/nginx/nginx.conf +23 -0
  35. backend/nginx/temp/temps_are_generated_here +0 -0
  36. backend/requirements-backend.txt +13 -0
  37. backend/routers/__init__.py +0 -0
  38. backend/routers/bgm_separation/__init__.py +0 -0
  39. backend/routers/bgm_separation/models.py +6 -0
  40. backend/routers/bgm_separation/router.py +119 -0
  41. backend/routers/task/__init__.py +0 -0
  42. backend/routers/task/router.py +130 -0
  43. backend/routers/transcription/__init__.py +0 -0
  44. backend/routers/transcription/router.py +123 -0
  45. backend/routers/vad/__init__.py +0 -0
  46. backend/routers/vad/router.py +101 -0
  47. backend/tests/__init__.py +0 -0
  48. backend/tests/test_backend_bgm_separation.py +59 -0
  49. backend/tests/test_backend_config.py +67 -0
  50. backend/tests/test_backend_transcription.py +50 -0
.dockerignore ADDED
@@ -0,0 +1,10 @@
+# from .gitignore
+modules/yt_tmp.wav
+**/venv/
+**/__pycache__/
+**/outputs/
+**/models/
+
+**/.idea
+**/.git
+**/.github
.github/FUNDING.yml ADDED
@@ -0,0 +1,13 @@
+# These are supported funding model platforms
+
+github: []
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: jhj0517
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
+custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
.github/ISSUE_TEMPLATE/bug_report.md ADDED
@@ -0,0 +1,11 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: bug
+assignees: jhj0517
+
+---
+
+**Which OS are you using?**
+- OS: [e.g. iOS or Windows. If you are using Google Colab, just write Colab.]
.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,10 @@
+---
+name: Feature request
+about: Any feature you want
+title: ''
+labels: enhancement
+assignees: jhj0517
+
+---
+
+
.github/ISSUE_TEMPLATE/hallucination.md ADDED
@@ -0,0 +1,12 @@
+---
+name: Hallucination
+about: Whisper hallucinations. (Repeating certain words, subtitles starting too
+  early, etc.)
+title: ''
+labels: hallucination
+assignees: jhj0517
+
+---
+
+**Download URL for sample audio**
+- Please provide a download URL for a sample audio file so I can test some settings for a better result. You can use https://easyupload.io/ or any other service to share it.
.github/pull_request_template.md ADDED
@@ -0,0 +1,5 @@
+## Related issues / PRs. Summarize issues.
+- #
+
+## Summarize Changes
+1.
.github/workflows/ci.yml ADDED
@@ -0,0 +1,101 @@
+name: CI
+
+on:
+  workflow_dispatch:
+
+  push:
+    branches:
+      - master
+      - intel-gpu
+  pull_request:
+    branches:
+      - master
+      - intel-gpu
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python: ["3.10", "3.11", "3.12"]
+
+    env:
+      DEEPL_API_KEY: ${{ secrets.DEEPL_API_KEY }}
+
+    steps:
+      - name: Clean up space for action
+        run: rm -rf /opt/hostedtoolcache
+
+      - uses: actions/checkout@v4
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Install git and ffmpeg
+        run: sudo apt-get update && sudo apt-get install -y git ffmpeg
+
+      - name: Install dependencies
+        run: pip install -r requirements.txt pytest jiwer
+
+      - name: Run test
+        run: python -m pytest -rs tests
+
+  test-backend:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python: ["3.10", "3.11", "3.12"]
+
+    env:
+      DEEPL_API_KEY: ${{ secrets.DEEPL_API_KEY }}
+      TEST_ENV: true
+
+    steps:
+      - name: Clean up space for action
+        run: rm -rf /opt/hostedtoolcache
+
+      - uses: actions/checkout@v4
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Install git and ffmpeg
+        run: sudo apt-get update && sudo apt-get install -y git ffmpeg
+
+      - name: Install dependencies
+        run: pip install -r backend/requirements-backend.txt pytest pytest-asyncio jiwer
+
+      - name: Run test
+        run: python -m pytest -rs backend/tests
+
+  test-shell-script:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python: [ "3.10", "3.11", "3.12" ]
+
+    steps:
+      - name: Clean up space for action
+        run: rm -rf /opt/hostedtoolcache
+
+      - uses: actions/checkout@v4
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Install git and ffmpeg
+        run: sudo apt-get update && sudo apt-get install -y git ffmpeg
+
+      - name: Execute Install.sh
+        run: |
+          chmod +x ./Install.sh
+          ./Install.sh
+
+      - name: Execute start-webui.sh
+        run: |
+          chmod +x ./start-webui.sh
+          timeout 60s ./start-webui.sh || true
+
.github/workflows/publish-docker.yml ADDED
@@ -0,0 +1,73 @@
+name: Publish to Docker Hub
+
+on:
+  push:
+    branches:
+      - master
+
+jobs:
+  build-and-push-webui:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Clean up space for action
+        run: rm -rf /opt/hostedtoolcache
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          tags: ${{ secrets.DOCKER_USERNAME }}/whisper-webui:latest
+
+      - name: Log out of Docker Hub
+        run: docker logout
+
+  build-and-push-backend:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Clean up space for action
+        run: rm -rf /opt/hostedtoolcache
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./backend/Dockerfile
+          push: true
+          tags: ${{ secrets.DOCKER_USERNAME }}/whisper-webui-backend:latest
+
+      - name: Log out of Docker Hub
+        run: docker logout
.gitignore ADDED
@@ -0,0 +1,13 @@
+*.wav
+*.png
+*.mp4
+*.mp3
+**/.env
+**/.idea/
+**/.pytest_cache/
+**/venv/
+**/__pycache__/
+outputs/
+models/
+modules/yt_tmp.wav
+configs/default_parameters.yaml
Dockerfile ADDED
@@ -0,0 +1,34 @@
+FROM debian:bookworm-slim AS builder
+
+RUN apt-get update && \
+    apt-get install -y curl git python3 python3-pip python3-venv && \
+    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* && \
+    mkdir -p /Whisper-WebUI
+
+WORKDIR /Whisper-WebUI
+
+COPY requirements.txt .
+
+RUN python3 -m venv venv && \
+    . venv/bin/activate && \
+    pip install -U -r requirements.txt
+
+
+FROM debian:bookworm-slim AS runtime
+
+RUN apt-get update && \
+    apt-get install -y curl ffmpeg python3 && \
+    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+WORKDIR /Whisper-WebUI
+
+COPY . .
+COPY --from=builder /Whisper-WebUI/venv /Whisper-WebUI/venv
+
+VOLUME [ "/Whisper-WebUI/models" ]
+VOLUME [ "/Whisper-WebUI/outputs" ]
+
+ENV PATH="/Whisper-WebUI/venv/bin:$PATH"
+ENV LD_LIBRARY_PATH=/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cublas/lib:/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cudnn/lib
+
+ENTRYPOINT [ "python", "app.py" ]
Install.bat ADDED
@@ -0,0 +1,21 @@
+@echo off
+
+if not exist "%~dp0\venv\Scripts" (
+    echo Creating venv...
+    python -m venv venv
+)
+echo Checked the venv folder. Now installing requirements..
+
+call "%~dp0\venv\scripts\activate"
+
+python -m pip install -U pip
+pip install -r requirements.txt
+
+if errorlevel 1 (
+    echo.
+    echo Requirements installation failed. Please remove the venv folder and run Install.bat again.
+) else (
+    echo.
+    echo Requirements installed successfully.
+)
+pause
Install.sh ADDED
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+if [ ! -d "venv" ]; then
+    echo "Creating virtual environment..."
+    python -m venv venv
+fi
+
+source venv/bin/activate
+
+python -m pip install -U pip
+pip install -r requirements.txt && echo "Requirements installed successfully." || {
+    echo ""
+    echo "Requirements installation failed. Please remove the venv folder and run the script again."
+    deactivate
+    exit 1
+}
+
+deactivate
LICENSE ADDED
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2023 jhj0517
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
README.md CHANGED
@@ -1,12 +1,134 @@
----
-title: Whisper WebUI
-emoji: 📈
-colorFrom: green
-colorTo: green
-sdk: gradio
-sdk_version: 5.13.1
-app_file: app.py
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Whisper-WebUI
+A Gradio-based browser interface for [Whisper](https://github.com/openai/whisper). You can use it as an easy subtitle generator!
+
+![screen](https://github.com/user-attachments/assets/caea3afd-a73c-40af-a347-8d57914b1d0f)
+
+
+
+## Notebook
+If you wish to try this on Colab, you can do so [here](https://colab.research.google.com/github/jhj0517/Whisper-WebUI/blob/master/notebook/whisper-webui.ipynb)!
+
+# Feature
+- Select the Whisper implementation you want to use between:
+  - [openai/whisper](https://github.com/openai/whisper)
+  - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) (used by default)
+  - [Vaibhavs10/insanely-fast-whisper](https://github.com/Vaibhavs10/insanely-fast-whisper)
+- Generate subtitles from various sources, including:
+  - Files
+  - YouTube
+  - Microphone
+- Currently supported subtitle formats:
+  - SRT
+  - WebVTT
+  - txt (plain text file without a timeline)
+- Speech to Text Translation
+  - From other languages to English. (This is Whisper's end-to-end speech-to-text translation feature.)
+- Text to Text Translation
+  - Translate subtitle files using Facebook NLLB models
+  - Translate subtitle files using the DeepL API
+- Pre-processing audio input with [Silero VAD](https://github.com/snakers4/silero-vad).
+- Pre-processing audio input to separate BGM with [UVR](https://github.com/Anjok07/ultimatevocalremovergui).
+- Post-processing with speaker diarization using the [pyannote](https://huggingface.co/pyannote/speaker-diarization-3.1) model.
+  - To download the pyannote model, you need a Hugging Face token, and you must manually accept the terms on the pages below.
+    1. https://huggingface.co/pyannote/speaker-diarization-3.1
+    2. https://huggingface.co/pyannote/segmentation-3.0
+
+### Pipeline Diagram
+![Transcription Pipeline](https://github.com/user-attachments/assets/1d8c63ac-72a4-4a0b-9db0-e03695dcf088)
+
+# Installation and Running
+
+- ## Running with Pinokio
+
+  The app can run with [Pinokio](https://github.com/pinokiocomputer/pinokio).
+
+  1. Install the [Pinokio Software](https://program.pinokio.computer/#/?id=install).
+  2. Open the software, search for Whisper-WebUI, and install it.
+  3. Start Whisper-WebUI and connect to `http://localhost:7860`.
+
+- ## Running with Docker
+
+  1. Install and launch [Docker-Desktop](https://www.docker.com/products/docker-desktop/).
+
+  2. Git clone the repository
+
+  ```sh
+  git clone https://github.com/jhj0517/Whisper-WebUI.git
+  ```
+
+  3. Build the image (the image is about 7 GB)
+
+  ```sh
+  docker compose build
+  ```
+
+  4. Run the container
+
+  ```sh
+  docker compose up
+  ```
+
+  5. Connect to the WebUI with your browser at `http://localhost:7860`
+
+  If needed, update [`docker-compose.yaml`](https://github.com/jhj0517/Whisper-WebUI/blob/master/docker-compose.yaml) to match your environment.
+
+- ## Run Locally
+
+  ### Prerequisite
+  To run this WebUI, you need `git`, Python `3.10`–`3.12`, and `FFmpeg`. <br>
+  And if you're not using an Nvidia GPU, or are using a `CUDA` version other than 12.4, edit [`requirements.txt`](https://github.com/jhj0517/Whisper-WebUI/blob/master/requirements.txt) to match your environment.
+
+  Please follow the links below to install the necessary software:
+  - git : [https://git-scm.com/downloads](https://git-scm.com/downloads)
+  - python : [https://www.python.org/downloads/](https://www.python.org/downloads/) **`3.10` ~ `3.12` is recommended.**
+  - FFmpeg : [https://ffmpeg.org/download.html](https://ffmpeg.org/download.html)
+  - CUDA : [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads)
+
+  After installing FFmpeg, **make sure to add the `FFmpeg/bin` folder to your system PATH!**
+
+  ### Installation Using the Script Files
+
+  1. git clone this repository
+  ```shell
+  git clone https://github.com/jhj0517/Whisper-WebUI.git
+  ```
+  2. Run `Install.bat` or `Install.sh` to install dependencies. (It will create a `venv` directory and install dependencies there.)
+  3. Start the WebUI with `start-webui.bat` or `start-webui.sh`. (It will run `python app.py` after activating the venv.)
+
+You can also run the project with command-line arguments; see the [wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments) for a guide, and the example below.
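+
+For example (illustrative values; these flags are defined in `app.py`'s argument parser):
+```sh
+# Launch on all interfaces at port 7860; values are illustrative.
+python app.py --server_name 0.0.0.0 --server_port 7860
+```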
+
+# VRAM Usages
+This project is integrated with [faster-whisper](https://github.com/guillaumekln/faster-whisper) by default for better VRAM usage and transcription speed.
+
+According to faster-whisper, the efficiency of the optimized whisper model is as follows:
+
+| Implementation | Precision | Beam size | Time  | Max. GPU memory | Max. CPU memory |
+|----------------|-----------|-----------|-------|-----------------|-----------------|
+| openai/whisper | fp16      | 5         | 4m30s | 11325MB         | 9439MB          |
+| faster-whisper | fp16      | 5         | 54s   | 4755MB          | 3244MB          |
+
+If you want to use an implementation other than faster-whisper, use the `--whisper_type` arg with the repository name, as in the example below.<br>
+Read the [wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments) for more info about CLI args.
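+
+For example, assuming `whisper` is the value accepted for the openai/whisper implementation:
+```sh
+# Illustrative; run `python app.py --help` to list the accepted choices.
+python app.py --whisper_type whisper
+```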
+
+If you want to use a fine-tuned model, manually place the model files in `models/Whisper/` corresponding to the implementation.
+
+Alternatively, if you enter a Hugging Face repo id (e.g., [deepdml/faster-whisper-large-v3-turbo-ct2](https://huggingface.co/deepdml/faster-whisper-large-v3-turbo-ct2)) in the "Model" dropdown, it will be automatically downloaded into the directory.
+
+![image](https://github.com/user-attachments/assets/76487a46-b0a5-4154-b735-ded73b2d83d4)
+
+# REST API
+If you're interested in deploying this app as a REST API, please check out [/backend](https://github.com/jhj0517/Whisper-WebUI/tree/master/backend).
+
+## TODO🗓
+
+- [x] Add DeepL API translation
+- [x] Add NLLB Model translation
+- [x] Integrate with faster-whisper
+- [x] Integrate with insanely-fast-whisper
+- [x] Integrate with whisperX (only the speaker diarization part)
+- [x] Add background music separation pre-processing with [UVR](https://github.com/Anjok07/ultimatevocalremovergui)
+- [x] Add FastAPI script
+- [ ] Add CLI usages
+- [ ] Support real-time transcription for microphone
+
+### Translation 🌐
+Any PRs that add a language to [translation.yaml](https://github.com/jhj0517/Whisper-WebUI/blob/master/configs/translation.yaml) would be greatly appreciated!
app.py ADDED
@@ -0,0 +1,368 @@
+import os
+import argparse
+import gradio as gr
+from gradio_i18n import Translate, gettext as _
+import yaml
+
+from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
+                                 INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
+                                 UVR_MODELS_DIR, I18N_YAML_PATH)
+from modules.utils.files_manager import load_yaml, MEDIA_EXTENSION
+from modules.whisper.whisper_factory import WhisperFactory
+from modules.translation.nllb_inference import NLLBInference
+from modules.ui.htmls import *
+from modules.utils.cli_manager import str2bool
+from modules.utils.youtube_manager import get_ytmetas
+from modules.translation.deepl_api import DeepLAPI
+from modules.whisper.data_classes import *
+
+
+class App:
+    def __init__(self, args):
+        self.args = args
+        self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600))
+        self.whisper_inf = WhisperFactory.create_whisper_inference(
+            whisper_type=self.args.whisper_type,
+            whisper_model_dir=self.args.whisper_model_dir,
+            faster_whisper_model_dir=self.args.faster_whisper_model_dir,
+            insanely_fast_whisper_model_dir=self.args.insanely_fast_whisper_model_dir,
+            uvr_model_dir=self.args.uvr_model_dir,
+            output_dir=self.args.output_dir,
+        )
+        self.nllb_inf = NLLBInference(
+            model_dir=self.args.nllb_model_dir,
+            output_dir=os.path.join(self.args.output_dir, "translations")
+        )
+        self.deepl_api = DeepLAPI(
+            output_dir=os.path.join(self.args.output_dir, "translations")
+        )
+        self.i18n = load_yaml(I18N_YAML_PATH)
+        self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
+        print(f"Use \"{self.args.whisper_type}\" implementation\n"
+              f"Device \"{self.whisper_inf.device}\" is detected")
+
+    def create_pipeline_inputs(self):
+        whisper_params = self.default_params["whisper"]
+        vad_params = self.default_params["vad"]
+        diarization_params = self.default_params["diarization"]
+        uvr_params = self.default_params["bgm_separation"]
+
+        with gr.Row():
+            dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
+                                   label=_("Model"), allow_custom_value=True)
+            dd_lang = gr.Dropdown(choices=self.whisper_inf.available_langs + [AUTOMATIC_DETECTION],
+                                  value=AUTOMATIC_DETECTION if whisper_params["lang"] == AUTOMATIC_DETECTION.unwrap()
+                                  else whisper_params["lang"], label=_("Language"))
+            dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt", "LRC"], value=whisper_params["file_format"], label=_("File Format"))
+        with gr.Row():
+            cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label=_("Translate to English?"),
+                                       interactive=True)
+        with gr.Row():
+            cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"],
+                                       label=_("Add a timestamp to the end of the filename"),
+                                       interactive=True)
+
+        with gr.Accordion(_("Advanced Parameters"), open=False):
+            whisper_inputs = WhisperParams.to_gradio_inputs(defaults=whisper_params, only_advanced=True,
+                                                            whisper_type=self.args.whisper_type,
+                                                            available_compute_types=self.whisper_inf.available_compute_types,
+                                                            compute_type=self.whisper_inf.current_compute_type)
+
+        with gr.Accordion(_("Background Music Remover Filter"), open=False):
+            uvr_inputs = BGMSeparationParams.to_gradio_input(defaults=uvr_params,
+                                                             available_models=self.whisper_inf.music_separator.available_models,
+                                                             available_devices=self.whisper_inf.music_separator.available_devices,
+                                                             device=self.whisper_inf.music_separator.device)
+
+        with gr.Accordion(_("Voice Detection Filter"), open=False):
+            vad_inputs = VadParams.to_gradio_inputs(defaults=vad_params)
+
+        with gr.Accordion(_("Diarization"), open=False):
+            diarization_inputs = DiarizationParams.to_gradio_inputs(defaults=diarization_params,
+                                                                    available_devices=self.whisper_inf.diarizer.available_device,
+                                                                    device=self.whisper_inf.diarizer.device)
+
+        pipeline_inputs = [dd_model, dd_lang, cb_translate] + whisper_inputs + vad_inputs + diarization_inputs + uvr_inputs
+
+        return (
+            pipeline_inputs,
+            dd_file_format,
+            cb_timestamp
+        )
+
+    def launch(self):
+        translation_params = self.default_params["translation"]
+        deepl_params = translation_params["deepl"]
+        nllb_params = translation_params["nllb"]
+        uvr_params = self.default_params["bgm_separation"]
+
+        with self.app:
+            lang = gr.Radio(choices=list(self.i18n.keys()),
+                            label=_("Language"), interactive=True,
+                            visible=False,  # Set it for development purposes.
+                            )
+            with Translate(I18N_YAML_PATH):
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown(MARKDOWN, elem_id="md_project")
+                with gr.Tabs():
+                    with gr.TabItem(_("File")):  # tab1
+                        with gr.Column():
+                            input_file = gr.Files(type="filepath", label=_("Upload File here"), file_types=MEDIA_EXTENSION)
+                            tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
+                                                         info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
+                                                              " Leave this field empty if you do not wish to use a local path.",
+                                                         visible=self.args.colab,
+                                                         value="")
+                            cb_include_subdirectory = gr.Checkbox(label="Include Subdirectory Files",
+                                                                  info="When using Input Folder Path above, whether to include all files in the subdirectory or not.",
+                                                                  visible=self.args.colab,
+                                                                  value=False)
+                            cb_save_same_dir = gr.Checkbox(label="Save outputs at same directory",
+                                                           info="When using Input Folder Path above, whether to save output in the same directory as inputs or not, in addition to the original"
+                                                                " output directory.",
+                                                           visible=self.args.colab,
+                                                           value=True)
+                        pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
+
+                        with gr.Row():
+                            btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
+                        with gr.Row():
+                            tb_indicator = gr.Textbox(label=_("Output"), scale=5)
+                            files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3, interactive=False)
+                            btn_openfolder = gr.Button('📂', scale=1)
+
+                        params = [input_file, tb_input_folder, cb_include_subdirectory, cb_save_same_dir,
+                                  dd_file_format, cb_timestamp]
+                        params = params + pipeline_params
+                        btn_run.click(fn=self.whisper_inf.transcribe_file,
+                                      inputs=params,
+                                      outputs=[tb_indicator, files_subtitles])
+                        btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
+
+                    with gr.TabItem(_("Youtube")):  # tab2
+                        with gr.Row():
+                            tb_youtubelink = gr.Textbox(label=_("Youtube Link"))
+                        with gr.Row(equal_height=True):
+                            with gr.Column():
+                                img_thumbnail = gr.Image(label=_("Youtube Thumbnail"))
+                            with gr.Column():
+                                tb_title = gr.Label(label=_("Youtube Title"))
+                                tb_description = gr.Textbox(label=_("Youtube Description"), max_lines=15)
+
+                        pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
+
+                        with gr.Row():
+                            btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
+                        with gr.Row():
+                            tb_indicator = gr.Textbox(label=_("Output"), scale=5)
+                            files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
+                            btn_openfolder = gr.Button('📂', scale=1)
+
+                        params = [tb_youtubelink, dd_file_format, cb_timestamp]
+
+                        btn_run.click(fn=self.whisper_inf.transcribe_youtube,
+                                      inputs=params + pipeline_params,
+                                      outputs=[tb_indicator, files_subtitles])
+                        tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
+                                              outputs=[img_thumbnail, tb_title, tb_description])
+                        btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
+
+                    with gr.TabItem(_("Mic")):  # tab3
+                        with gr.Row():
+                            mic_input = gr.Microphone(label=_("Record with Mic"), type="filepath", interactive=True,
+                                                      show_download_button=True)
+
+                        pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
+
+                        with gr.Row():
+                            btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
+                        with gr.Row():
+                            tb_indicator = gr.Textbox(label=_("Output"), scale=5)
+                            files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
+                            btn_openfolder = gr.Button('📂', scale=1)
+
+                        params = [mic_input, dd_file_format, cb_timestamp]
+
+                        btn_run.click(fn=self.whisper_inf.transcribe_mic,
+                                      inputs=params + pipeline_params,
+                                      outputs=[tb_indicator, files_subtitles])
+                        btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
+
+                    with gr.TabItem(_("T2T Translation")):  # tab 4
+                        with gr.Row():
+                            file_subs = gr.Files(type="filepath", label=_("Upload Subtitle Files to translate here"))
+
+                        with gr.TabItem(_("DeepL API")):  # sub tab1
+                            with gr.Row():
+                                tb_api_key = gr.Textbox(label=_("Your Auth Key (API KEY)"),
+                                                        value=deepl_params["api_key"])
+                            with gr.Row():
+                                dd_source_lang = gr.Dropdown(label=_("Source Language"),
+                                                             value=AUTOMATIC_DETECTION if deepl_params["source_lang"] == AUTOMATIC_DETECTION.unwrap()
+                                                             else deepl_params["source_lang"],
+                                                             choices=list(self.deepl_api.available_source_langs.keys()))
+                                dd_target_lang = gr.Dropdown(label=_("Target Language"),
+                                                             value=deepl_params["target_lang"],
+                                                             choices=list(self.deepl_api.available_target_langs.keys()))
+                            with gr.Row():
+                                cb_is_pro = gr.Checkbox(label=_("Pro User?"), value=deepl_params["is_pro"])
+                            with gr.Row():
+                                cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"],
+                                                           label=_("Add a timestamp to the end of the filename"),
+                                                           interactive=True)
+                            with gr.Row():
+                                btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary")
+                            with gr.Row():
+                                tb_indicator = gr.Textbox(label=_("Output"), scale=5)
+                                files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
+                                btn_openfolder = gr.Button('📂', scale=1)
+
+                            btn_run.click(fn=self.deepl_api.translate_deepl,
+                                          inputs=[tb_api_key, file_subs, dd_source_lang, dd_target_lang,
+                                                  cb_is_pro, cb_timestamp],
+                                          outputs=[tb_indicator, files_subtitles])
+
+                            btn_openfolder.click(
+                                fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
+                                inputs=None,
+                                outputs=None)
+
+                        with gr.TabItem(_("NLLB")):  # sub tab2
+                            with gr.Row():
+                                dd_model_size = gr.Dropdown(label=_("Model"), value=nllb_params["model_size"],
+                                                            choices=self.nllb_inf.available_models)
+                                dd_source_lang = gr.Dropdown(label=_("Source Language"),
+                                                             value=nllb_params["source_lang"],
+                                                             choices=self.nllb_inf.available_source_langs)
+                                dd_target_lang = gr.Dropdown(label=_("Target Language"),
+                                                             value=nllb_params["target_lang"],
+                                                             choices=self.nllb_inf.available_target_langs)
+                            with gr.Row():
+                                nb_max_length = gr.Number(label="Max Length Per Line", value=nllb_params["max_length"],
+                                                          precision=0)
+                            with gr.Row():
+                                cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"],
+                                                           label=_("Add a timestamp to the end of the filename"),
+                                                           interactive=True)
+                            with gr.Row():
+                                btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary")
+                            with gr.Row():
+                                tb_indicator = gr.Textbox(label=_("Output"), scale=5)
+                                files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
+                                btn_openfolder = gr.Button('📂', scale=1)
+                            with gr.Column():
+                                md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
+
+                            btn_run.click(fn=self.nllb_inf.translate_file,
+                                          inputs=[file_subs, dd_model_size, dd_source_lang, dd_target_lang,
+                                                  nb_max_length, cb_timestamp],
+                                          outputs=[tb_indicator, files_subtitles])
+
+                            btn_openfolder.click(
+                                fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
+                                inputs=None,
+                                outputs=None)
+
+                    with gr.TabItem(_("BGM Separation")):
+                        files_audio = gr.Files(type="filepath", label=_("Upload Audio Files to separate background music"))
+                        dd_uvr_device = gr.Dropdown(label=_("Device"), value=self.whisper_inf.music_separator.device,
+                                                    choices=self.whisper_inf.music_separator.available_devices)
+                        dd_uvr_model_size = gr.Dropdown(label=_("Model"), value=uvr_params["uvr_model_size"],
+                                                        choices=self.whisper_inf.music_separator.available_models)
+                        nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"],
+                                                        precision=0)
+                        cb_uvr_save_file = gr.Checkbox(label=_("Save separated files to output"),
+                                                       value=True, visible=False)
+                        btn_run = gr.Button(_("SEPARATE BACKGROUND MUSIC"), variant="primary")
+                        with gr.Column():
+                            with gr.Row():
+                                ad_instrumental = gr.Audio(label=_("Instrumental"), scale=8)
+                                btn_open_instrumental_folder = gr.Button('📂', scale=1)
+                            with gr.Row():
+                                ad_vocals = gr.Audio(label=_("Vocals"), scale=8)
+                                btn_open_vocals_folder = gr.Button('📂', scale=1)
+
+                        btn_run.click(fn=self.whisper_inf.music_separator.separate_files,
+                                      inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size,
+                                              cb_uvr_save_file],
+                                      outputs=[ad_instrumental, ad_vocals])
+                        btn_open_instrumental_folder.click(inputs=None,
+                                                           outputs=None,
+                                                           fn=lambda: self.open_folder(os.path.join(
+                                                               self.args.output_dir, "UVR", "instrumental"
+                                                           )))
+                        btn_open_vocals_folder.click(inputs=None,
+                                                     outputs=None,
+                                                     fn=lambda: self.open_folder(os.path.join(
+                                                         self.args.output_dir, "UVR", "vocals"
+                                                     )))
+
+        # Launch the app with optional gradio settings
+        args = self.args
+        self.app.queue(
+            api_open=args.api_open
+        ).launch(
+            share=args.share,
+            server_name=args.server_name,
+            server_port=args.server_port,
+            auth=(args.username, args.password) if args.username and args.password else None,
+            root_path=args.root_path,
+            inbrowser=args.inbrowser,
+            ssl_verify=args.ssl_verify,
+            ssl_keyfile=args.ssl_keyfile,
+            ssl_keyfile_password=args.ssl_keyfile_password,
+            ssl_certfile=args.ssl_certfile,
+            allowed_paths=eval(args.allowed_paths) if args.allowed_paths else None
+        )
+
+    @staticmethod
+    def open_folder(folder_path: str):
+        if os.path.exists(folder_path):
+            os.system(f"start {folder_path}")
+        else:
+            os.makedirs(folder_path, exist_ok=True)
+            print(f"The directory path {folder_path} has been newly created.")
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--whisper_type', type=str, default=WhisperImpl.FASTER_WHISPER.value,
+                    choices=[item.value for item in WhisperImpl],
+                    help='A type of the whisper implementation (Github repo name)')
+parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True, help='Gradio share value')
+parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
+parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
+parser.add_argument('--root_path', type=str, default=None, help='Gradio root path')
+parser.add_argument('--username', type=str, default=None, help='Gradio authentication username')
+parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
+parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
+parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True, help='Is colab user or not')
+parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True,
+                    help='Enable api or not in Gradio')
+parser.add_argument('--allowed_paths', type=str, default=None, help='Gradio allowed paths')
+parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True,
+                    help='Whether to automatically start Gradio app or not')
+parser.add_argument('--ssl_verify', type=str2bool, default=True, nargs='?', const=True,
+                    help='Whether to verify SSL or not')
+parser.add_argument('--ssl_keyfile', type=str, default=None, help='SSL Key file location')
+parser.add_argument('--ssl_keyfile_password', type=str, default=None, help='SSL Key file password')
+parser.add_argument('--ssl_certfile', type=str, default=None, help='SSL cert file location')
+parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
+                    help='Directory path of the whisper model')
+parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
+                    help='Directory path of the faster-whisper model')
+parser.add_argument('--insanely_fast_whisper_model_dir', type=str,
+                    default=INSANELY_FAST_WHISPER_MODELS_DIR,
+                    help='Directory path of the insanely-fast-whisper model')
+parser.add_argument('--diarization_model_dir', type=str, default=DIARIZATION_MODELS_DIR,
+                    help='Directory path of the diarization model')
+parser.add_argument('--nllb_model_dir', type=str, default=NLLB_MODELS_DIR,
+                    help='Directory path of the Facebook NLLB model')
+parser.add_argument('--uvr_model_dir', type=str, default=UVR_MODELS_DIR,
+                    help='Directory path of the UVR model')
+parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR, help='Directory path of the outputs')
+_args = parser.parse_args()
+
+if __name__ == "__main__":
+    app = App(args=_args)
+    app.launch()
backend/Dockerfile ADDED
@@ -0,0 +1,36 @@
+FROM debian:bookworm-slim AS builder
+
+RUN apt-get update && \
+    apt-get install -y curl git python3 python3-pip python3-venv && \
+    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* && \
+    mkdir -p /Whisper-WebUI
+
+WORKDIR /Whisper-WebUI
+
+COPY backend/ backend/
+COPY requirements.txt requirements.txt
+
+RUN python3 -m venv venv && \
+    . venv/bin/activate && \
+    pip install -U -r backend/requirements-backend.txt
+
+
+FROM debian:bookworm-slim AS runtime
+
+RUN apt-get update && \
+    apt-get install -y curl ffmpeg python3 && \
+    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+WORKDIR /Whisper-WebUI
+
+COPY . .
+COPY --from=builder /Whisper-WebUI/venv /Whisper-WebUI/venv
+
+VOLUME [ "/Whisper-WebUI/models" ]
+VOLUME [ "/Whisper-WebUI/outputs" ]
+VOLUME [ "/Whisper-WebUI/backend" ]
+
+ENV PATH="/Whisper-WebUI/venv/bin:$PATH"
+ENV LD_LIBRARY_PATH=/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cublas/lib:/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cudnn/lib
+
+ENTRYPOINT ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "8000"]
backend/README.md ADDED
@@ -0,0 +1,110 @@
+# Whisper-WebUI REST API
+REST API for Whisper-WebUI. Documentation is auto-generated upon deploying the app.
+<br>[Swagger UI](https://github.com/swagger-api/swagger-ui) is available at `app/docs`, or at the root URL via redirection. [Redoc](https://github.com/Redocly/redoc) is available at `app/redoc`.
+
+# Setup and Installation
+
+Installation assumes that you are in the root directory of Whisper-WebUI.
+
+1. Create a `.env` file at `backend/configs/.env`
+```
+HF_TOKEN="YOUR_HF_TOKEN FOR DIARIZATION MODEL (READ PERMISSION)"
+DB_URL="sqlite:///backend/records.db"
+```
+`HF_TOKEN` is used to download the diarization model, and `DB_URL` indicates where your DB file is located; it is stored in `backend/` by default.
+
+2. Install dependencies
+```
+pip install -r backend/requirements-backend.txt
+```
+
+3. Deploy the server with `uvicorn` or any other ASGI server.
+```
+uvicorn backend.main:app --host 0.0.0.0 --port 8000
+```
+
+### Deploy with your domain name
+You can deploy the server under your own domain name by setting up a reverse proxy with Nginx.
+
+1. Install Nginx if you don't already have it.
+ - Linux : https://nginx.org/en/docs/install.html
+ - Windows : https://nginx.org/en/docs/windows.html
+
+2. Edit [`nginx.conf`](https://github.com/jhj0517/Whisper-WebUI/blob/master/backend/nginx/nginx.conf) for your domain name.
+https://github.com/jhj0517/Whisper-WebUI/blob/895cafe400944396ad8be5b1cc793b54fecc8bbe/backend/nginx/nginx.conf#L12
+
+3. Add an A record for your public IPv4 address with your domain provider. (You can find your IP by searching "What is my IP" on Google.)
+
+4. Open a terminal, go to the location of [`nginx.conf`](https://github.com/jhj0517/Whisper-WebUI/blob/master/backend/nginx/nginx.conf), and start the nginx server there, so that you can manage the nginx-related logs in that directory.
+```shell
+cd backend/nginx
+nginx -c "/path/to/Whisper-WebUI/backend/nginx/nginx.conf"
+```
+
+5. Open another terminal in the root project location `/Whisper-WebUI`, and deploy the app with `uvicorn` or any other ASGI server. The app will now be available at your domain.
+```shell
+uvicorn backend.main:app --host 0.0.0.0 --port 8000
+```
+
+6. To shut down nginx, use `nginx -s stop`.
+```shell
+cd backend/nginx
+nginx -s stop -c "/path/to/Whisper-WebUI/backend/nginx/nginx.conf"
+```
+
+
+## Configuration
+You can set some server configurations in [config.yaml](https://github.com/jhj0517/Whisper-WebUI/blob/master/backend/configs/config.yaml).
+<br>For example, the initial model size for Whisper, or the cleanup frequency and TTL for cached files.
+<br>If an endpoint generates and saves files, all output files are stored in the `cache` directory; e.g., the separated vocal/instrumental files for `/bgm-separation` are saved there.
+
+## Docker
+The Dockerfile should be built from the root directory of Whisper-WebUI.
+
+1. git clone this repository
+```
+git clone https://github.com/jhj0517/Whisper-WebUI.git
+```
+2. Replace the volume paths with your local paths in `docker-compose.yaml`
+https://github.com/jhj0517/Whisper-WebUI/blob/1dd708ec3844dbf0c1f77de9ef5764e883dd4c78/backend/docker-compose.yaml#L12-L15
+3. Build the image
+```
+docker compose -f backend/docker-compose.yaml build
+```
+4. Run the container
+```
+docker compose -f backend/docker-compose.yaml up
+```
+
+5. Then you can read the docs at `localhost:8000` (the default port is set to `8000` in `docker-compose.yaml`) and run your own tests, e.g. the quick check below.
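+
+For a quick smoke test once the container is up, something like the following should work; the exact route paths are assumptions based on the router layout, so confirm them in the generated docs first:
+```sh
+# Hypothetical paths; confirm in the Swagger UI at localhost:8000.
+curl -X POST "http://localhost:8000/transcription" -F "file=@sample.wav"
+curl "http://localhost:8000/task/YOUR_IDENTIFIER"
+```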
+
+
+# Architecture
+
+![diagram](https://github.com/user-attachments/assets/37d2ab2d-4eb4-4513-bb7b-027d0d631971)
+
+The response can be obtained through [the polling API](https://docs.oracle.com/en/cloud/saas/marketing/responsys-develop/API/REST/Async/asyncApi-v1.3-requests-requestId-get.htm).
+Each task is stored in the DB whenever the task is queued or updated by the process.
+
+When the client first sends a `POST` request, the server returns an `identifier` that can be used to track the status of the task. The task status is updated by the processes, and once the task is completed, the client can finally obtain the result.
+
+The client needs to poll the API manually; here is an example for a Python client (a `fetch_task` helper is sketched after this block):
+```python
+def wait_for_task_completion(identifier: str,
+                             max_attempts: int = 20,
+                             frequency: int = 3) -> httpx.Response:
+    """
+    Polls the task status every `frequency` seconds until it is completed, failed, or `max_attempts` is reached.
+    """
+    attempts = 0
+    while attempts < max_attempts:
+        task = fetch_task(identifier)
+        status = task.json()["status"]
+        if status == "COMPLETED":
+            return task.json()["result"]
+        if status == "FAILED":
+            raise Exception("Task polling failed")
+        time.sleep(frequency)
+        attempts += 1
+    return None
+```
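+
+The `fetch_task` helper above is not shown; a minimal sketch, assuming the task status endpoint is `/task/{identifier}` (confirm the path in the generated docs), could be:
+```python
+import time
+import httpx
+
+def fetch_task(identifier: str) -> httpx.Response:
+    # Hypothetical helper: GET the task record by its identifier.
+    # The "/task/{identifier}" path is an assumption; check the Swagger docs.
+    return httpx.get(f"http://localhost:8000/task/{identifier}")
+```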
backend/__init__.py ADDED
File without changes
backend/cache/cached_files_are_generated_here ADDED
File without changes
backend/common/audio.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import BytesIO
2
+ import numpy as np
3
+ import httpx
4
+ import faster_whisper
5
+ from pydantic import BaseModel
6
+ from fastapi import (
7
+ HTTPException,
8
+ UploadFile,
9
+ )
10
+ from typing import Annotated, Any, BinaryIO, Literal, Generator, Union, Optional, List, Tuple
11
+
12
+
13
+ class AudioInfo(BaseModel):
14
+ duration: float
15
+
16
+
17
+ async def read_audio(
18
+ file: Optional[UploadFile] = None,
19
+ file_url: Optional[str] = None
20
+ ):
21
+ """Read audio from "UploadFile". This resamples sampling rates to 16000."""
22
+ if (file and file_url) or (not file and not file_url):
23
+ raise HTTPException(status_code=400, detail="Provide only one of file or file_url")
24
+
25
+ if file:
26
+ file_content = await file.read()
27
+ elif file_url:
28
+ async with httpx.AsyncClient() as client:
29
+ file_response = await client.get(file_url)
30
+ if file_response.status_code != 200:
31
+ raise HTTPException(status_code=422, detail="Could not download the file")
32
+ file_content = file_response.content
33
+ file_bytes = BytesIO(file_content)
34
+ audio = faster_whisper.audio.decode_audio(file_bytes)
35
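+ # decode_audio returns mono samples resampled to 16 kHz, so len(audio) / 16000 is the duration in seconds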
+ duration = len(audio) / 16000
36
+ return audio, AudioInfo(duration=duration)
backend/common/cache_manager.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import os
3
+ from typing import Optional
4
+
5
+ from modules.utils.paths import BACKEND_CACHE_DIR
6
+
7
+
8
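+ # Walks the cache directory and deletes files whose mtime is older than `ttl` seconds; the placeholder file is kept.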
+ def cleanup_old_files(cache_dir: str = BACKEND_CACHE_DIR, ttl: int = 60):
9
+ now = time.time()
10
+ place_holder_name = "cached_files_are_generated_here"
11
+ for root, dirs, files in os.walk(cache_dir):
12
+ for filename in files:
13
+ if filename == place_holder_name:
14
+ continue
15
+ filepath = os.path.join(root, filename)
16
+ if now - os.path.getmtime(filepath) > ttl:
17
+ try:
18
+ os.remove(filepath)
19
+ except Exception as e:
20
+ print(f"Error removing {filepath}")
21
+ raise
backend/common/compresser.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import zipfile
3
+ from typing import List, Optional
4
+ import hashlib
5
+
6
+
7
+ def compress_files(file_paths: List[str], output_zip_path: str) -> str:
8
+ """
9
+ Compress multiple files into a single zip file.
10
+
11
+ Args:
12
+ file_paths (List[str]): List of paths to files to be compressed.
13
+ output_zip_path (str): Path of the output zip file.
14
+
15
+ Raises:
16
+ FileNotFoundError: If any of the input files doesn't exist.
17
+ """
18
+ os.makedirs(os.path.dirname(output_zip_path), exist_ok=True)
19
+ compression = zipfile.ZIP_DEFLATED
20
+
21
+ with zipfile.ZipFile(output_zip_path, 'w', compression=compression) as zipf:
22
+ for file_path in file_paths:
23
+ if not os.path.exists(file_path):
24
+ raise FileNotFoundError(f"File not found: {file_path}")
25
+
26
+ file_name = os.path.basename(file_path)
27
+ zipf.write(file_path, file_name)
28
+ return output_zip_path
29
+
30
+
31
+ def get_file_hash(file_path: str) -> str:
32
+ """Generate the hash of a file using the specified hashing algorithm. It generates hash by content not path. """
33
+ hash_func = hashlib.new("sha256")
34
+ try:
35
+ with open(file_path, 'rb') as f:
36
+ for chunk in iter(lambda: f.read(4096), b""):
37
+ hash_func.update(chunk)
38
+ return hash_func.hexdigest()
39
+ except FileNotFoundError:
40
+ return f"File not found: {file_path}"
41
+ except Exception as e:
42
+ return f"An error occurred: {str(e)}"
43
+
44
+
45
+ def find_file_by_hash(dir_path: str, hash_str: str) -> Optional[str]:
46
+ """Get file path from the directory based on its hash"""
47
+ if not (os.path.exists(dir_path) and os.path.isdir(dir_path)):
48
+ raise ValueError(f"Directory {dir_path} does not exist")
49
+
50
+ files = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f))]
51
+
52
+ for f in files:
53
+ f_hash = get_file_hash(f)
54
+ if hash_str == f_hash:
55
+ return f
56
+ return None
57
+
58
+
backend/common/config_loader.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ import os
3
+ from modules.utils.paths import SERVER_CONFIG_PATH, SERVER_DOTENV_PATH
4
+ from modules.utils.files_manager import load_yaml, save_yaml
5
+
6
+ import functools
7
+
8
+
9
+ @functools.lru_cache
10
+ def load_server_config(config_path: str = SERVER_CONFIG_PATH) -> dict:
11
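+ # In the test environment, force the tiny model with float32 so the suite can run quickly on CPU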
+ if os.getenv("TEST_ENV", "false").lower() == "true":
12
+ server_config = load_yaml(config_path)
13
+ server_config["whisper"]["model_size"] = "tiny"
14
+ server_config["whisper"]["compute_type"] = "float32"
15
+ save_yaml(server_config, config_path)
16
+
17
+ return load_yaml(config_path)
18
+
19
+
20
+ @functools.lru_cache
21
+ def read_env(key: str, default: str = None, dotenv_path: str = SERVER_DOTENV_PATH):
22
+ load_dotenv(dotenv_path)
23
+ value = os.getenv(key, default)
24
+ return value
25
+
backend/common/models.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field, validator
2
+ from typing import List, Any, Optional
3
+ from backend.db.task.models import TaskStatus, ResultType, TaskType
4
+
5
+
6
+ class QueueResponse(BaseModel):
7
+ identifier: str = Field(..., description="Unique identifier for the queued task that can be used for tracking")
8
+ status: TaskStatus = Field(..., description="Current status of the task")
9
+ message: str = Field(..., description="Message providing additional information about the task")
10
+
11
+
12
+ class Response(BaseModel):
13
+ identifier: str
14
+ message: str
backend/configs/config.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ whisper:
2
+ # Default implementation is faster-whisper. This indicates the model name within `models/Whisper/faster-whisper`
3
+ model_size: large-v2
4
+ # Compute type. 'float16' for CUDA, 'float32' for CPU.
5
+ compute_type: float16
6
+
7
+ bgm_separation:
8
+ # UVR model size, one of ["UVR-MDX-NET-Inst_HQ_4", "UVR-MDX-NET-Inst_3"]
9
+ model_size: UVR-MDX-NET-Inst_HQ_4
10
+ # Whether to offload the model after inference. Should be true if your GPU has less than 16GB of VRAM.
11
+ enable_offload: true
12
+ # Device to load BGM separation model
13
+ device: cuda
14
+
15
+ # Settings for the `cache` directory. The output files for `/bgm-separation` are stored in the `cache` directory.
16
+ # (You can check out the actual generated files by testing `/bgm-separation`.)
17
+ # You can adjust the TTL and cleanup frequency of the files in the `cache` directory here.
18
+ cache:
19
+ # TTL (Time-To-Live) in seconds, defaults to 10 minutes
20
+ ttl: 600
21
+ # Cleanup frequency in seconds, defaults to 1 minute
22
+ frequency: 60
23
+
backend/db/__init__.py ADDED
File without changes
backend/db/db_instance.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import os
3
+ from sqlalchemy import create_engine
4
+ from sqlalchemy.orm import sessionmaker
5
+ from functools import wraps
6
+ from sqlalchemy.exc import SQLAlchemyError
7
+ from fastapi import HTTPException
8
+ from sqlmodel import SQLModel
9
+ from dotenv import load_dotenv
10
+
11
+ from backend.common.config_loader import read_env
12
+
13
+
14
+ @functools.lru_cache
15
+ def init_db():
16
+ db_url = read_env("DB_URL", "sqlite:///backend/records.db")
17
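+ # Note: `check_same_thread` is SQLite-specific; it lets sessions created here be used from FastAPI's worker threads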
+ engine = create_engine(db_url, connect_args={"check_same_thread": False})
18
+ SQLModel.metadata.create_all(engine)
19
+ return sessionmaker(autocommit=False, autoflush=False, bind=engine)
20
+
21
+
22
+ def get_db_session():
23
+ db_instance = init_db()
24
+ return db_instance()
25
+
26
+
27
+ def handle_database_errors(func):
28
+ @wraps(func)
29
+ def wrapper(*args, **kwargs):
30
+ session = None
31
+ try:
32
+ session = get_db_session()
33
+ kwargs['session'] = session
34
+
35
+ return func(*args, **kwargs)
36
+ except Exception as e:
37
+ print(f"Database error has occurred: {e}")
38
+ raise
39
+ finally:
40
+ if session:
41
+ session.close()
42
+ return wrapper
backend/db/task/__init__.py ADDED
File without changes
backend/db/task/dao.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any
2
+ from sqlalchemy.orm import Session
3
+ from fastapi import Depends
4
+
5
+ from ..db_instance import handle_database_errors, get_db_session
6
+ from .models import Task, TasksResult, TaskStatus
7
+
8
+
9
+ @handle_database_errors
10
+ def add_task_to_db(
11
+ session,
12
+ status=TaskStatus.QUEUED,
13
+ task_type=None,
14
+ language=None,
15
+ task_params=None,
16
+ file_name=None,
17
+ url=None,
18
+ audio_duration=None,
19
+ ):
20
+ """
21
+ Add task to the db
22
+ """
23
+ task = Task(
24
+ status=status,
25
+ language=language,
26
+ file_name=file_name,
27
+ url=url,
28
+ task_type=task_type,
29
+ task_params=task_params,
30
+ audio_duration=audio_duration,
31
+ )
32
+ session.add(task)
33
+ session.commit()
34
+ return task.uuid
35
+
36
+
37
+ @handle_database_errors
38
+ def update_task_status_in_db(
39
+ identifier: str,
40
+ update_data: Dict[str, Any],
41
+ session: Session,
42
+ ):
43
+ """
44
+ Update task status and attributes in the database.
45
+
46
+ Args:
47
+ identifier (str): Identifier of the task to be updated.
48
+ update_data (Dict[str, Any]): Dictionary containing the attributes to update along with their new values.
49
+ session (Session): Database session (injected by the `handle_database_errors` decorator).
50
+
51
+ Returns:
52
+ None
53
+ """
54
+ task = session.query(Task).filter_by(uuid=identifier).first()
55
+ if task:
56
+ for key, value in update_data.items():
57
+ setattr(task, key, value)
58
+ session.commit()
59
+
60
+
61
+ @handle_database_errors
62
+ def get_task_status_from_db(
63
+ identifier: str, session: Session
64
+ ):
65
+ """Retrieve task status from db"""
66
+ task = session.query(Task).filter(Task.uuid == identifier).first()
67
+ if task:
68
+ return task
69
+ else:
70
+ return None
71
+
72
+
73
+ @handle_database_errors
74
+ def get_all_tasks_status_from_db(session: Session):
75
+ """Get all tasks from db"""
76
+ columns = [Task.uuid, Task.status, Task.task_type]
77
+ query = session.query(*columns)
78
+ tasks = [task for task in query]
79
+ return TasksResult(tasks=tasks)
80
+
81
+
82
+ @handle_database_errors
83
+ def delete_task_from_db(identifier: str, session: Session):
84
+ """Delete task from db"""
85
+ task = session.query(Task).filter(Task.uuid == identifier).first()
86
+
87
+ if task:
88
+ # If the task exists, delete it from the database
89
+ session.delete(task)
90
+ session.commit()
91
+ return True
92
+ else:
93
+ # If the task does not exist, return False
94
+ return False
backend/db/task/models.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ported from https://github.com/pavelzbornik/whisperX-FastAPI/blob/main/app/models.py
2
+
3
+ from enum import Enum
4
+ from pydantic import BaseModel
5
+ from typing import Optional, List
6
+ from uuid import uuid4
7
+ from datetime import datetime
8
+ from sqlalchemy.types import Enum as SQLAlchemyEnum
9
+ from typing import Any
10
+ from sqlmodel import SQLModel, Field, JSON, Column
11
+
12
+
13
+ class ResultType(str, Enum):
14
+ JSON = "json"
15
+ FILEPATH = "filepath"
16
+
17
+
18
+ class TaskStatus(str, Enum):
19
+ PENDING = "pending"
20
+ IN_PROGRESS = "in_progress"
21
+ COMPLETED = "completed"
22
+ FAILED = "failed"
23
+ CANCELLED = "cancelled"
24
+ QUEUED = "queued"
25
+ PAUSED = "paused"
26
+ RETRYING = "retrying"
27
+
28
+ def __str__(self):
29
+ return self.value
30
+
31
+
32
+ class TaskType(str, Enum):
33
+ TRANSCRIPTION = "transcription"
34
+ VAD = "vad"
35
+ BGM_SEPARATION = "bgm_separation"
36
+
37
+ def __str__(self):
38
+ return self.value
39
+
40
+
41
+ class TaskStatusResponse(BaseModel):
42
+ """`TaskStatusResponse` is a wrapper class that hides sensitive information from `Task`"""
43
+ identifier: str = Field(..., description="Unique identifier for the queued task that can be used for tracking")
44
+ status: TaskStatus = Field(..., description="Current status of the task")
45
+ task_type: Optional[TaskType] = Field(
46
+ default=None,
47
+ description="Type/category of the task"
48
+ )
49
+ result_type: Optional[ResultType] = Field(
50
+ default=ResultType.JSON,
51
+ description="Result type whether it's a filepath or JSON"
52
+ )
53
+ result: Optional[Any] = Field(
54
+ default=None,
55
+ description="JSON data representing the result of the task"
56
+ )
57
+ task_params: Optional[dict] = Field(
58
+ default=None,
59
+ description="Parameters of the task"
60
+ )
61
+ error: Optional[str] = Field(
62
+ default=None,
63
+ description="Error message, if any, associated with the task"
64
+ )
65
+ duration: Optional[float] = Field(
66
+ default=None,
67
+ description="Duration of the task execution"
68
+ )
69
+
70
+
71
+ class Task(SQLModel, table=True):
72
+ """
73
+ Table to store tasks information.
74
+
75
+ Attributes:
76
+ - id: Unique identifier for each task (Primary Key).
77
+ - uuid: Universally unique identifier for each task.
78
+ - status: Current status of the task.
79
+ - result: JSON data representing the result of the task.
80
+ - result_type: Type of the data whether it is normal JSON data or filepath.
81
+ - file_name: Name of the file associated with the task.
82
+ - task_type: Type/category of the task.
83
+ - duration: Duration of the task execution.
84
+ - error: Error message, if any, associated with the task.
85
+ - created_at: Date and time of creation.
86
+ - updated_at: Date and time of last update.
87
+ """
88
+
89
+ __tablename__ = "tasks"
90
+
91
+ id: Optional[int] = Field(
92
+ default=None,
93
+ primary_key=True,
94
+ description="Unique identifier for each task (Primary Key)"
95
+ )
96
+ uuid: str = Field(
97
+ default_factory=lambda: str(uuid4()),
98
+ description="Universally unique identifier for each task"
99
+ )
100
+ status: Optional[TaskStatus] = Field(
101
+ default=None,
102
+ sa_column=Field(sa_column=SQLAlchemyEnum(TaskStatus)),
103
+ description="Current status of the task",
104
+ )
105
+ result: Optional[dict] = Field(
106
+ default_factory=dict,
107
+ sa_column=Column(JSON),
108
+ description="JSON data representing the result of the task"
109
+ )
110
+ result_type: Optional[ResultType] = Field(
111
+ default=ResultType.JSON,
112
+ sa_column=Field(sa_column=SQLAlchemyEnum(ResultType)),
113
+ description="Result type whether it's a filepath or JSON"
114
+ )
115
+ file_name: Optional[str] = Field(
116
+ default=None,
117
+ description="Name of the file associated with the task"
118
+ )
119
+ url: Optional[str] = Field(
120
+ default=None,
121
+ description="URL of the file associated with the task"
122
+ )
123
+ audio_duration: Optional[float] = Field(
124
+ default=None,
125
+ description="Duration of the audio in seconds"
126
+ )
127
+ language: Optional[str] = Field(
128
+ default=None,
129
+ description="Language of the file associated with the task"
130
+ )
131
+ task_type: Optional[TaskType] = Field(
132
+ default=None,
133
+ sa_column=Field(sa_column=SQLAlchemyEnum(TaskType)),
134
+ description="Type/category of the task"
135
+ )
136
+ task_params: Optional[dict] = Field(
137
+ default_factory=dict,
138
+ sa_column=Column(JSON),
139
+ description="Parameters of the task"
140
+ )
141
+ duration: Optional[float] = Field(
142
+ default=None,
143
+ description="Duration of the task execution"
144
+ )
145
+ error: Optional[str] = Field(
146
+ default=None,
147
+ description="Error message, if any, associated with the task"
148
+ )
149
+ created_at: datetime = Field(
150
+ default_factory=datetime.utcnow,
151
+ description="Date and time of creation"
152
+ )
153
+ updated_at: datetime = Field(
154
+ default_factory=datetime.utcnow,
155
+ sa_column_kwargs={"onupdate": datetime.utcnow},
156
+ description="Date and time of last update"
157
+ )
158
+
159
+ def to_response(self) -> "TaskStatusResponse":
160
+ return TaskStatusResponse(
161
+ identifier=self.uuid,
162
+ status=self.status,
163
+ task_type=self.task_type,
164
+ result_type=self.result_type,
165
+ result=self.result,
166
+ task_params=self.task_params,
167
+ error=self.error,
168
+ duration=self.duration
169
+ )
170
+
171
+
172
+ class TasksResult(BaseModel):
173
+ tasks: List[Task]
174
+
backend/docker-compose.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ app:
3
+ build:
4
+ dockerfile: backend/Dockerfile
5
+ context: ..
6
+ image: jhj0517/whisper-webui-backend:latest
7
+
8
+ volumes:
9
+ # You can mount the container's volume paths to directory paths on your local machine.
10
+ # Models will be stored in the `./models` directory on your machine.
11
+ # Similarly, all output files will be stored in the `./outputs` directory.
12
+ # The DB file is saved in /Whisper-WebUI/backend/records.db unless you edit it in /Whisper-WebUI/backend/configs/.env
13
+ - ./models:/Whisper-WebUI/models
14
+ - ./outputs:/Whisper-WebUI/outputs
15
+ - ./backend:/Whisper-WebUI/backend
16
+
17
+ ports:
18
+ - "8000:8000"
19
+
20
+ stdin_open: true
21
+ tty: true
22
+
23
+ entrypoint: ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "8000"]
24
+
25
+ # If you're not using an Nvidia GPU, update the device entry to match your hardware.
26
+ # See more info at : https://docs.docker.com/compose/compose-file/deploy/#driver
27
+ deploy:
28
+ resources:
29
+ reservations:
30
+ devices:
31
+ - driver: nvidia
32
+ count: all
33
+ capabilities: [ gpu ]
backend/main.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import asynccontextmanager
2
+ from fastapi import (
3
+ FastAPI,
4
+ )
5
+ from fastapi.responses import RedirectResponse
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ import os
8
+ import time
9
+ import threading
10
+
11
+ from backend.db.db_instance import init_db
12
+ from backend.routers.transcription.router import transcription_router, get_pipeline
13
+ from backend.routers.vad.router import get_vad_model, vad_router
14
+ from backend.routers.bgm_separation.router import get_bgm_separation_inferencer, bgm_separation_router
15
+ from backend.routers.task.router import task_router
16
+ from backend.common.config_loader import read_env, load_server_config
17
+ from backend.common.cache_manager import cleanup_old_files
18
+ from modules.utils.paths import SERVER_CONFIG_PATH, BACKEND_CACHE_DIR
19
+
20
+
21
+ def clean_cache_thread(ttl: int, frequency: int) -> threading.Thread:
22
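+ # Daemon loop: purge expired cache files, then sleep for `frequency` seconds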
+ def clean_cache(_ttl: int, _frequency: int):
23
+ while True:
24
+ cleanup_old_files(cache_dir=BACKEND_CACHE_DIR, ttl=_ttl)
25
+ time.sleep(_frequency)
26
+
27
+ return threading.Thread(
28
+ target=clean_cache,
29
+ args=(ttl, frequency),
30
+ daemon=True
31
+ )
32
+
33
+
34
+ @asynccontextmanager
35
+ async def lifespan(app: FastAPI):
36
+ # Basic setup initialization
37
+ server_config = load_server_config()
38
+ read_env("DB_URL") # Place .env file into /configs/.env
39
+ init_db()
40
+
41
+ # Inferencer initialization
42
+ transcription_pipeline = get_pipeline()
43
+ vad_inferencer = get_vad_model()
44
+ bgm_separation_inferencer = get_bgm_separation_inferencer()
45
+
46
+ # Thread initialization
47
+ cache_thread = clean_cache_thread(server_config["cache"]["ttl"], server_config["cache"]["frequency"])
48
+ cache_thread.start()
49
+
50
+ yield
51
+
52
+ # Release VRAM on server shutdown
53
+ transcription_pipeline = None
54
+ vad_inferencer = None
55
+ bgm_separation_inferencer = None
56
+
57
+
58
+ app = FastAPI(
59
+ title="Whisper-WebUI-Backend",
60
+ description=f"""
61
+ REST API for Whisper-WebUI. Swagger UI is available at /docs (the root URL redirects there). Redoc is available at /redoc.
62
+ """,
63
+ version="0.0.1",
64
+ lifespan=lifespan,
65
+ openapi_tags=[
66
+ {
67
+ "name": "BGM Separation",
68
+ "description": "Cached files for /bgm-separation are generated in the `backend/cache` directory,"
69
+ " you can set TLL for these files in `backend/configs/config.yaml`."
70
+ }
71
+ ]
72
+ )
73
+ app.add_middleware(
74
+ CORSMiddleware,
75
+ allow_origins=["*"],
76
+ allow_credentials=True,
77
+ allow_methods=["GET", "POST", "PUT", "PATCH", "OPTIONS"], # Disable DELETE
78
+ allow_headers=["*"],
79
+ )
80
+ app.include_router(transcription_router)
81
+ app.include_router(vad_router)
82
+ app.include_router(bgm_separation_router)
83
+ app.include_router(task_router)
84
+
85
+
86
+ @app.get("/", response_class=RedirectResponse, include_in_schema=False)
87
+ async def index():
88
+ """
89
+ Redirect to the documentation. Defaults to Swagger UI.
90
+ You can also check the /redoc with redoc style: https://github.com/Redocly/redoc
91
+ """
92
+ return "/docs"
backend/nginx/logs/logs_are_generated_here ADDED
File without changes
backend/nginx/nginx.conf ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ worker_processes 1;
2
+
3
+ events {
4
+ worker_connections 1024;
5
+ }
6
+
7
+ http {
8
+ server {
9
+ listen 80;
10
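+         # Raise nginx's default 1M body limit so large audio/video uploads can reach the backend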
+ client_max_body_size 4G;
11
+
12
+ server_name your-own-domain-name.com;
13
+
14
+ location / {
15
+ proxy_pass http://127.0.0.1:8000;
16
+ proxy_set_header Host $host;
17
+ proxy_set_header X-Real-IP $remote_addr;
18
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
19
+ proxy_set_header X-Forwarded-Proto $scheme;
20
+ }
21
+ }
22
+ }
23
+
backend/nginx/temp/temps_are_generated_here ADDED
File without changes
backend/requirements-backend.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Whisper-WebUI dependencies
2
+ -r ../requirements.txt
3
+
4
+ # Backend dependencies
5
+ python-dotenv
6
+ uvicorn
7
+ SQLAlchemy
8
+ sqlmodel
9
+ pydantic
10
+
11
+ # Test dependencies
12
+ # pytest
13
+ # pytest-asyncio
backend/routers/__init__.py ADDED
File without changes
backend/routers/bgm_separation/__init__.py ADDED
File without changes
backend/routers/bgm_separation/models.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+
3
+
4
+ class BGMSeparationResult(BaseModel):
5
+ instrumental_hash: str = Field(..., description="Instrumental file hash")
6
+ vocal_hash: str = Field(..., description="Vocal file hash")
backend/routers/bgm_separation/router.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import numpy as np
3
+ from fastapi import (
4
+ File,
5
+ UploadFile,
6
+ )
7
+ import gradio as gr
8
+ from fastapi import APIRouter, BackgroundTasks, Depends, Response, status
9
+ from fastapi.responses import FileResponse
10
+ from typing import List, Dict, Tuple
11
+ from datetime import datetime
12
+ import os
13
+
14
+ from modules.whisper.data_classes import *
15
+ from modules.uvr.music_separator import MusicSeparator
16
+ from modules.utils.paths import BACKEND_CACHE_DIR
17
+ from backend.common.audio import read_audio
18
+ from backend.common.models import QueueResponse
19
+ from backend.common.config_loader import load_server_config
20
+ from backend.common.compresser import get_file_hash, find_file_by_hash
21
+ from backend.db.task.models import TaskStatus, TaskType, ResultType
22
+ from backend.db.task.dao import add_task_to_db, update_task_status_in_db
23
+ from .models import BGMSeparationResult
24
+
25
+
26
+ bgm_separation_router = APIRouter(prefix="/bgm-separation", tags=["BGM Separation"])
27
+
28
+
29
+ @functools.lru_cache
30
+ def get_bgm_separation_inferencer() -> 'MusicSeparator':
31
+ config = load_server_config()["bgm_separation"]
32
+ inferencer = MusicSeparator(
33
+ output_dir=os.path.join(BACKEND_CACHE_DIR, "UVR")
34
+ )
35
+ inferencer.update_model(
36
+ model_name=config["model_size"],
37
+ device=config["device"]
38
+ )
39
+ return inferencer
40
+
41
+
42
+ def run_bgm_separation(
43
+ audio: np.ndarray,
44
+ params: BGMSeparationParams,
45
+ identifier: str,
46
+ ) -> Tuple[np.ndarray, np.ndarray]:
47
+ update_task_status_in_db(
48
+ identifier=identifier,
49
+ update_data={
50
+ "uuid": identifier,
51
+ "status": TaskStatus.IN_PROGRESS,
52
+ "updated_at": datetime.utcnow()
53
+ }
54
+ )
55
+
56
+ start_time = datetime.utcnow()
57
+ instrumental, vocal, filepaths = get_bgm_separation_inferencer().separate(
58
+ audio=audio,
59
+ model_name=params.uvr_model_size,
60
+ device=params.uvr_device,
61
+ segment_size=params.segment_size,
62
+ save_file=True,
63
+ progress=gr.Progress()
64
+ )
65
+ instrumental_path, vocal_path = filepaths
66
+ elapsed_time = (datetime.utcnow() - start_time).total_seconds()
67
+
68
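+     # The result stores content hashes rather than paths; /task/file/{identifier} later re-locates the cached files by hash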
+ update_task_status_in_db(
69
+ identifier=identifier,
70
+ update_data={
71
+ "uuid": identifier,
72
+ "status": TaskStatus.COMPLETED,
73
+ "result": BGMSeparationResult(
74
+ instrumental_hash=get_file_hash(instrumental_path),
75
+ vocal_hash=get_file_hash(vocal_path)
76
+ ).model_dump(),
77
+ "result_type": ResultType.FILEPATH,
78
+ "updated_at": datetime.utcnow(),
79
+ "duration": elapsed_time
80
+ }
81
+ )
82
+ return instrumental, vocal
83
+
84
+
85
+ @bgm_separation_router.post(
86
+ "/",
87
+ response_model=QueueResponse,
88
+ status_code=status.HTTP_201_CREATED,
89
+ summary="Separate Background BGM abd vocal",
90
+ description="Separate background music and vocal from an uploaded audio or video file.",
91
+ )
92
+ async def bgm_separation(
93
+ background_tasks: BackgroundTasks,
94
+ file: UploadFile = File(..., description="Audio or video file to separate background music."),
95
+ params: BGMSeparationParams = Depends()
96
+ ) -> QueueResponse:
97
+ if not isinstance(file, np.ndarray):
98
+ audio, info = await read_audio(file=file)
99
+ else:
100
+ audio, info = file, None
101
+
102
+ identifier = add_task_to_db(
103
+ status=TaskStatus.QUEUED,
104
+ file_name=file.filename,
105
+ audio_duration=info.duration if info else None,
106
+ task_type=TaskType.BGM_SEPARATION,
107
+ task_params=params.model_dump(),
108
+ )
109
+
110
+ background_tasks.add_task(
111
+ run_bgm_separation,
112
+ audio=audio,
113
+ params=params,
114
+ identifier=identifier
115
+ )
116
+
117
+ return QueueResponse(identifier=identifier, status=TaskStatus.QUEUED, message="BGM separation task has been queued")
118
+
119
+
backend/routers/task/__init__.py ADDED
File without changes
backend/routers/task/router.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends, HTTPException, status
2
+ from fastapi.responses import FileResponse
3
+ from sqlalchemy.orm import Session
4
+ import os
5
+
6
+ from backend.db.db_instance import get_db_session
7
+ from backend.db.task.dao import (
8
+ get_task_status_from_db,
9
+ get_all_tasks_status_from_db,
10
+ delete_task_from_db,
11
+ )
12
+ from backend.db.task.models import (
13
+ TasksResult,
14
+ Task,
15
+ TaskStatusResponse,
16
+ TaskType
17
+ )
18
+ from backend.common.models import (
19
+ Response,
20
+ )
21
+ from backend.common.compresser import compress_files, find_file_by_hash
22
+ from modules.utils.paths import BACKEND_CACHE_DIR
23
+
24
+ task_router = APIRouter(prefix="/task", tags=["Tasks"])
25
+
26
+
27
+ @task_router.get(
28
+ "/{identifier}",
29
+ response_model=TaskStatusResponse,
30
+ status_code=status.HTTP_200_OK,
31
+ summary="Retrieve Task by Identifier",
32
+ description="Retrieve the specific task by its identifier.",
33
+ )
34
+ async def get_task(
35
+ identifier: str,
36
+ session: Session = Depends(get_db_session),
37
+ ) -> TaskStatusResponse:
38
+ """
39
+ Retrieve the specific task by its identifier.
40
+ """
41
+ task = get_task_status_from_db(identifier=identifier, session=session)
42
+
43
+ if task is not None:
44
+ return task.to_response()
45
+ else:
46
+ raise HTTPException(status_code=404, detail="Identifier not found")
47
+
48
+
49
+ @task_router.get(
50
+ "/file/{identifier}",
51
+ status_code=status.HTTP_200_OK,
52
+ summary="Retrieve FileResponse Task by Identifier",
53
+ description="Retrieve the file response task by its identifier. You can use this endpoint if you need to download"
54
+ " The file as a response",
55
+ )
56
+ async def get_file_task(
57
+ identifier: str,
58
+ session: Session = Depends(get_db_session),
59
+ ) -> FileResponse:
60
+ """
61
+ Retrieve the downloadable file response of a specific task by its identifier.
62
+ The result is compressed into a single ZIP file.
63
+ """
64
+ task = get_task_status_from_db(identifier=identifier, session=session)
65
+
66
+ if task is not None:
67
+ if task.task_type == TaskType.BGM_SEPARATION:
68
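+             # The task result holds content hashes; locate the matching cached files and zip them for download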
+ output_zip_path = os.path.join(BACKEND_CACHE_DIR, f"{identifier}_bgm_separation.zip")
69
+ instrumental_path = find_file_by_hash(
70
+ os.path.join(BACKEND_CACHE_DIR, "UVR", "instrumental"),
71
+ task.result["instrumental_hash"]
72
+ )
73
+ vocal_path = find_file_by_hash(
74
+ os.path.join(BACKEND_CACHE_DIR, "UVR", "vocals"),
75
+ task.result["vocal_hash"]
76
+ )
77
+
78
+ output_zip_path = compress_files(
79
+ [instrumental_path, vocal_path],
80
+ output_zip_path
81
+ )
82
+ return FileResponse(
83
+ path=output_zip_path,
84
+ status_code=200,
85
+ filename=os.path.basename(output_zip_path),
86
+ media_type="application/zip"
87
+ )
88
+ else:
89
+ raise HTTPException(status_code=404, detail=f"File download is only supported for bgm separation."
90
+ f" The given type is {task.task_type}")
91
+ else:
92
+ raise HTTPException(status_code=404, detail="Identifier not found")
93
+
94
+
95
+ # Delete method, commented by default because this endpoint is likely to require special permissions
96
+ # @task_router.delete(
97
+ # "/{identifier}",
98
+ # response_model=Response,
99
+ # status_code=status.HTTP_200_OK,
100
+ # summary="Delete Task by Identifier",
101
+ # description="Delete a task from the system using its identifier.",
102
+ # )
103
+ async def delete_task(
104
+ identifier: str,
105
+ session: Session = Depends(get_db_session),
106
+ ) -> Response:
107
+ """
108
+ Delete a task by its identifier.
109
+ """
110
+ if delete_task_from_db(identifier, session):
111
+ return Response(identifier=identifier, message="Task deleted")
112
+ else:
113
+ raise HTTPException(status_code=404, detail="Task not found")
114
+
115
+
116
+ # Get All method, commented by default because this endpoint is likely to require special permissions
117
+ # @task_router.get(
118
+ # "/all",
119
+ # response_model=TasksResult,
120
+ # status_code=status.HTTP_200_OK,
121
+ # summary="Retrieve All Task Statuses",
122
+ # description="Retrieve the statuses of all tasks available in the system.",
123
+ # )
124
+ async def get_all_tasks_status(
125
+ session: Session = Depends(get_db_session),
126
+ ) -> TasksResult:
127
+ """
128
+ Retrieve all tasks.
129
+ """
130
+ return get_all_tasks_status_from_db(session=session)
backend/routers/transcription/__init__.py ADDED
File without changes
backend/routers/transcription/router.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import uuid
3
+ import numpy as np
4
+ from fastapi import (
5
+ File,
6
+ UploadFile,
7
+ )
8
+ import gradio as gr
9
+ from fastapi import APIRouter, BackgroundTasks, Depends, Response, status
10
+ from typing import List, Dict
11
+ from sqlalchemy.orm import Session
12
+ from datetime import datetime
13
+ from modules.whisper.data_classes import *
14
+ from modules.utils.paths import BACKEND_CACHE_DIR
15
+ from modules.whisper.faster_whisper_inference import FasterWhisperInference
16
+ from backend.common.audio import read_audio
17
+ from backend.common.models import QueueResponse
18
+ from backend.common.config_loader import load_server_config
19
+ from backend.db.task.dao import (
20
+ add_task_to_db,
21
+ get_db_session,
22
+ update_task_status_in_db
23
+ )
24
+ from backend.db.task.models import TaskStatus, TaskType
25
+
26
+ transcription_router = APIRouter(prefix="/transcription", tags=["Transcription"])
27
+
28
+
29
+ @functools.lru_cache
30
+ def get_pipeline() -> 'FasterWhisperInference':
31
+ config = load_server_config()["whisper"]
32
+ inferencer = FasterWhisperInference(
33
+ output_dir=BACKEND_CACHE_DIR
34
+ )
35
+ inferencer.update_model(
36
+ model_size=config["model_size"],
37
+ compute_type=config["compute_type"]
38
+ )
39
+ return inferencer
40
+
41
+
42
+ def run_transcription(
43
+ audio: np.ndarray,
44
+ params: TranscriptionPipelineParams,
45
+ identifier: str,
46
+ ) -> List[Segment]:
47
+ update_task_status_in_db(
48
+ identifier=identifier,
49
+ update_data={
50
+ "uuid": identifier,
51
+ "status": TaskStatus.IN_PROGRESS,
52
+ "updated_at": datetime.utcnow()
53
+ },
54
+ )
55
+
56
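+     # The positional "SRT" / False arguments are the pipeline's output-format and timestamp options; the result here is taken from the returned segments rather than the written file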
+ segments, elapsed_time = get_pipeline().run(
57
+ audio,
58
+ gr.Progress(),
59
+ "SRT",
60
+ False,
61
+ *params.to_list()
62
+ )
63
+ segments = [seg.model_dump() for seg in segments]
64
+
65
+ update_task_status_in_db(
66
+ identifier=identifier,
67
+ update_data={
68
+ "uuid": identifier,
69
+ "status": TaskStatus.COMPLETED,
70
+ "result": segments,
71
+ "updated_at": datetime.utcnow(),
72
+ "duration": elapsed_time
73
+ },
74
+ )
75
+ return segments
76
+
77
+
78
+ @transcription_router.post(
79
+ "/",
80
+ response_model=QueueResponse,
81
+ status_code=status.HTTP_201_CREATED,
82
+ summary="Transcribe Audio",
83
+ description="Process the provided audio or video file to generate a transcription.",
84
+ )
85
+ async def transcription(
86
+ background_tasks: BackgroundTasks,
87
+ file: UploadFile = File(..., description="Audio or video file to transcribe."),
88
+ whisper_params: WhisperParams = Depends(),
89
+ vad_params: VadParams = Depends(),
90
+ bgm_separation_params: BGMSeparationParams = Depends(),
91
+ diarization_params: DiarizationParams = Depends(),
92
+ ) -> QueueResponse:
93
+ if not isinstance(file, np.ndarray):
94
+ audio, info = await read_audio(file=file)
95
+ else:
96
+ audio, info = file, None
97
+
98
+ params = TranscriptionPipelineParams(
99
+ whisper=whisper_params,
100
+ vad=vad_params,
101
+ bgm_separation=bgm_separation_params,
102
+ diarization=diarization_params
103
+ )
104
+
105
+ identifier = add_task_to_db(
106
+ status=TaskStatus.QUEUED,
107
+ file_name=file.filename,
108
+ audio_duration=info.duration if info else None,
109
+ language=params.whisper.lang,
110
+ task_type=TaskType.TRANSCRIPTION,
111
+ task_params=params.to_dict(),
112
+ )
113
+
114
+ background_tasks.add_task(
115
+ run_transcription,
116
+ audio=audio,
117
+ params=params,
118
+ identifier=identifier,
119
+ )
120
+
121
+ return QueueResponse(identifier=identifier, status=TaskStatus.QUEUED, message="Transcription task has been queued")
122
+
123
+
backend/routers/vad/__init__.py ADDED
File without changes
backend/routers/vad/router.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import numpy as np
3
+ from faster_whisper.vad import VadOptions
4
+ from fastapi import (
5
+ File,
6
+ UploadFile,
7
+ )
8
+ from fastapi import APIRouter, BackgroundTasks, Depends, Response, status
9
+ from typing import List, Dict
10
+ from datetime import datetime
11
+
12
+ from modules.vad.silero_vad import SileroVAD
13
+ from modules.whisper.data_classes import VadParams
14
+ from backend.common.audio import read_audio
15
+ from backend.common.models import QueueResponse
16
+ from backend.db.task.dao import add_task_to_db, update_task_status_in_db
17
+ from backend.db.task.models import TaskStatus, TaskType
18
+
19
+ vad_router = APIRouter(prefix="/vad", tags=["Voice Activity Detection"])
20
+
21
+
22
+ @functools.lru_cache
23
+ def get_vad_model() -> SileroVAD:
24
+ inferencer = SileroVAD()
25
+ inferencer.update_model()
26
+ return inferencer
27
+
28
+
29
+ def run_vad(
30
+ audio: np.ndarray,
31
+ params: VadOptions,
32
+ identifier: str,
33
+ ) -> List[Dict]:
34
+ update_task_status_in_db(
35
+ identifier=identifier,
36
+ update_data={
37
+ "uuid": identifier,
38
+ "status": TaskStatus.IN_PROGRESS,
39
+ "updated_at": datetime.utcnow()
40
+ }
41
+ )
42
+
43
+ start_time = datetime.utcnow()
44
+ audio, speech_chunks = get_vad_model().run(
45
+ audio=audio,
46
+ vad_parameters=params
47
+ )
48
+ elapsed_time = (datetime.utcnow() - start_time).total_seconds()
49
+
50
+ update_task_status_in_db(
51
+ identifier=identifier,
52
+ update_data={
53
+ "uuid": identifier,
54
+ "status": TaskStatus.COMPLETED,
55
+ "updated_at": datetime.utcnow(),
56
+ "result": speech_chunks,
57
+ "duration": elapsed_time
58
+ }
59
+ )
60
+
61
+ return speech_chunks
62
+
63
+
64
+ @vad_router.post(
65
+ "/",
66
+ response_model=QueueResponse,
67
+ status_code=status.HTTP_201_CREATED,
68
+ summary="Voice Activity Detection",
69
+ description="Detect voice parts in the provided audio or video file to generate a timeline of speech segments.",
70
+ )
71
+ async def vad(
72
+ background_tasks: BackgroundTasks,
73
+ file: UploadFile = File(..., description="Audio or video file to detect voices."),
74
+ params: VadParams = Depends()
75
+ ) -> QueueResponse:
76
+ if not isinstance(file, np.ndarray):
77
+ audio, info = await read_audio(file=file)
78
+ else:
79
+ audio, info = file, None
80
+
81
+ vad_options = VadOptions(
82
+ threshold=params.threshold,
83
+ min_speech_duration_ms=params.min_speech_duration_ms,
84
+ max_speech_duration_s=params.max_speech_duration_s,
85
+ min_silence_duration_ms=params.min_silence_duration_ms,
86
+ speech_pad_ms=params.speech_pad_ms
87
+ )
88
+
89
+ identifier = add_task_to_db(
90
+ status=TaskStatus.QUEUED,
91
+ file_name=file.filename,
92
+ audio_duration=info.duration if info else None,
93
+ task_type=TaskType.VAD,
94
+ task_params=params.model_dump(),
95
+ )
96
+
97
+ background_tasks.add_task(run_vad, audio=audio, params=vad_options, identifier=identifier)
98
+
99
+ return QueueResponse(identifier=identifier, status=TaskStatus.QUEUED, message="VAD task has been queued")
100
+
101
+
backend/tests/__init__.py ADDED
File without changes
backend/tests/test_backend_bgm_separation.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from fastapi import UploadFile
3
+ from io import BytesIO
4
+ import os
5
+ import torch
6
+
7
+ from backend.db.task.models import TaskStatus
8
+ from backend.tests.test_task_status import wait_for_task_completion, fetch_file_response
9
+ from backend.tests.test_backend_config import (
10
+ get_client, setup_test_file, get_upload_file_instance, calculate_wer,
11
+ TEST_BGM_SEPARATION_PARAMS, TEST_ANSWER, TEST_BGM_SEPARATION_OUTPUT_PATH
12
+ )
13
+
14
+
15
+ @pytest.mark.skipif(not torch.cuda.is_available(), reason="Skip the test because CUDA is not available")
16
+ @pytest.mark.parametrize(
17
+ "bgm_separation_params",
18
+ [
19
+ TEST_BGM_SEPARATION_PARAMS
20
+ ]
21
+ )
22
+ def test_bgm_separation_endpoint(
23
+ get_upload_file_instance,
24
+ bgm_separation_params: dict
25
+ ):
26
+ client = get_client()
27
+ file_content = BytesIO(get_upload_file_instance.file.read())
28
+ get_upload_file_instance.file.seek(0)
29
+
30
+ response = client.post(
31
+ "/bgm-separation",
32
+ files={"file": (get_upload_file_instance.filename, file_content, "audio/mpeg")},
33
+ params=bgm_separation_params
34
+ )
35
+
36
+ assert response.status_code == 201
37
+ assert response.json()["status"] == TaskStatus.QUEUED
38
+ task_identifier = response.json()["identifier"]
39
+ assert isinstance(task_identifier, str) and task_identifier
40
+
41
+ completed_task = wait_for_task_completion(
42
+ identifier=task_identifier
43
+ )
44
+
45
+ assert completed_task is not None, f"Task with identifier {task_identifier} did not complete within the " \
46
+ f"expected time."
47
+
48
+ result = completed_task.json()["result"]
49
+ assert "instrumental_hash" in result and result["instrumental_hash"]
50
+ assert "vocal_hash" in result and result["vocal_hash"]
51
+
52
+ file_response = fetch_file_response(task_identifier)
53
+ assert file_response.status_code == 200, f"Fetching File Response has failed. Response is: {file_response}"
54
+
55
+ with open(TEST_BGM_SEPARATION_OUTPUT_PATH, "wb") as file:
56
+ file.write(file_response.content)
57
+
58
+ assert os.path.exists(TEST_BGM_SEPARATION_OUTPUT_PATH)
59
+
backend/tests/test_backend_config.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ from fastapi import FastAPI, UploadFile
3
+ from fastapi.testclient import TestClient
4
+ from starlette.datastructures import UploadFile as StarletteUploadFile
5
+ from io import BytesIO
6
+ import os
7
+ import requests
8
+ import pytest
9
+ import yaml
10
+ import jiwer
11
+
12
+ from backend.main import app
13
+ from modules.whisper.data_classes import *
14
+ from modules.utils.paths import *
15
+ from modules.utils.files_manager import load_yaml, save_yaml
16
+
17
+ TEST_PIPELINE_PARAMS = {**WhisperParams(model_size="tiny", compute_type="float32").model_dump(exclude_none=True),
18
+ **VadParams().model_dump(exclude_none=True),
19
+ **BGMSeparationParams().model_dump(exclude_none=True),
20
+ **DiarizationParams().model_dump(exclude_none=True)}
21
+ TEST_VAD_PARAMS = VadParams().model_dump()
22
+ TEST_BGM_SEPARATION_PARAMS = BGMSeparationParams().model_dump()
23
+ TEST_FILE_DOWNLOAD_URL = "https://github.com/jhj0517/whisper_flutter_new/raw/main/example/assets/jfk.wav"
24
+ TEST_FILE_PATH = os.path.join(WEBUI_DIR, "backend", "tests", "jfk.wav")
25
+ TEST_BGM_SEPARATION_OUTPUT_PATH = os.path.join(WEBUI_DIR, "backend", "tests", "separated_audio.zip")
26
+ TEST_ANSWER = "And so my fellow Americans ask not what your country can do for you ask what you can do for your country"
27
+ TEST_WHISPER_MODEL = "tiny"
28
+ TEST_COMPUTE_TYPE = "float32"
29
+
30
+
31
+ @pytest.fixture(autouse=True)
32
+ def setup_test_file():
33
+ @functools.lru_cache
34
+ def download_file(url=TEST_FILE_DOWNLOAD_URL, file_path=TEST_FILE_PATH):
35
+ if os.path.exists(file_path):
36
+ return
37
+
38
+ if not os.path.exists(os.path.dirname(file_path)):
39
+ os.makedirs(os.path.dirname(file_path))
40
+
41
+ response = requests.get(url)
42
+
43
+ with open(file_path, "wb") as file:
44
+ file.write(response.content)
45
+
46
+ print(f"File downloaded to: {file_path}")
47
+
48
+ download_file(TEST_FILE_DOWNLOAD_URL, TEST_FILE_PATH)
49
+
50
+
51
+ @pytest.fixture
52
+ @functools.lru_cache
53
+ def get_upload_file_instance(filepath: str = TEST_FILE_PATH) -> UploadFile:
54
+ with open(filepath, "rb") as f:
55
+ file_contents = BytesIO(f.read())
56
+ filename = os.path.basename(filepath)
57
+ upload_file = StarletteUploadFile(file=file_contents, filename=filename)
58
+ return upload_file
59
+
60
+
61
+ @functools.lru_cache
62
+ def get_client(app: FastAPI = app):
63
+ return TestClient(app)
64
+
65
+
66
+ def calculate_wer(answer, prediction):
67
+ return jiwer.wer(answer, prediction)
backend/tests/test_backend_transcription.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from fastapi import UploadFile
3
+ from io import BytesIO
4
+
5
+ from backend.db.task.models import TaskStatus
6
+ from backend.tests.test_task_status import wait_for_task_completion
7
+ from backend.tests.test_backend_config import (
8
+ get_client, setup_test_file, get_upload_file_instance, calculate_wer,
9
+ TEST_PIPELINE_PARAMS, TEST_ANSWER
10
+ )
11
+
12
+
13
+ @pytest.mark.parametrize(
14
+ "pipeline_params",
15
+ [
16
+ TEST_PIPELINE_PARAMS
17
+ ]
18
+ )
19
+ def test_transcription_endpoint(
20
+ get_upload_file_instance,
21
+ pipeline_params: dict
22
+ ):
23
+ client = get_client()
24
+ file_content = BytesIO(get_upload_file_instance.file.read())
25
+ get_upload_file_instance.file.seek(0)
26
+
27
+ response = client.post(
28
+ "/transcription",
29
+ files={"file": (get_upload_file_instance.filename, file_content, "audio/mpeg")},
30
+ params=pipeline_params
31
+ )
32
+
33
+ assert response.status_code == 201
34
+ assert response.json()["status"] == TaskStatus.QUEUED
35
+ task_identifier = response.json()["identifier"]
36
+ assert isinstance(task_identifier, str) and task_identifier
37
+
38
+ completed_task = wait_for_task_completion(
39
+ identifier=task_identifier
40
+ )
41
+
42
+ assert completed_task is not None, f"Task with identifier {task_identifier} did not complete within the " \
43
+ f"expected time."
44
+
45
+ result = completed_task.json()["result"]
46
+ assert result, "Transcription text is empty"
47
+
48
+ wer = calculate_wer(TEST_ANSWER, result[0]["text"].strip().replace(",", "").replace(".", ""))
49
+ assert wer < 0.1, f"WER is too high, it's {wer}"
50
+