diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..94cfb9d7a9324c457e117cf2b403fca79b88b302
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,10 @@
+# from .gitignore
+venv/
+ui/__pycache__/
+outputs/
+modules/__pycache__/
+models/
+modules/yt_tmp.wav
+
+.git
+.github
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e1e01f8cf68969bacc8af57d793bea9a4b0a3c6f
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1,13 @@
+# These are supported funding model platforms
+
+github: []
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: jhj0517
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
+custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000000000000000000000000000000000000..99e25a330c8f674a17c5431a323b146c95d265f8
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,11 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: bug
+assignees: jhj0517
+
+---
+
+**Which OS are you using?**
+ - OS: [e.g. iOS or Windows. If you are using Google Colab, just write Colab.]
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000000000000000000000000000000000000..74981022b47d3038fdb054c3cf338d93232012ba
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,10 @@
+---
+name: Feature request
+about: Suggest any feature you would like to see added
+title: ''
+labels: enhancement
+assignees: jhj0517
+
+---
+
+
diff --git a/.github/ISSUE_TEMPLATE/hallucination.md b/.github/ISSUE_TEMPLATE/hallucination.md
new file mode 100644
index 0000000000000000000000000000000000000000..ba43584f7765d84d41a565d96d8b76c3f187414e
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/hallucination.md
@@ -0,0 +1,12 @@
+---
+name: Hallucination
+about: Whisper hallucinations (repeating certain words, subtitles starting too early, etc.)
+title: ''
+labels: hallucination
+assignees: jhj0517
+
+---
+
+**Download URL for sample audio**
+- Please provide a download URL for a sample audio file so I can test different settings for a better result. You can use https://easyupload.io/ or any other file-sharing service.
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 0000000000000000000000000000000000000000..d33d497792bd3ca3415376b6b8daf32835b04692
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,5 @@
+## Related issues
+- #0
+
+## Changed
+1. Changes
diff --git a/.github/workflows/ci-shell.yml b/.github/workflows/ci-shell.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7f8e77a9f3b5a66445cae629225576a012fa69b4
--- /dev/null
+++ b/.github/workflows/ci-shell.yml
@@ -0,0 +1,43 @@
+name: CI-Shell Script
+
+on:
+ workflow_dispatch:
+
+ push:
+ branches:
+ - master
+ pull_request:
+ branches:
+ - master
+
+jobs:
+ test-shell-script:
+
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python: [ "3.10" ]
+
+ steps:
+ - name: Clean up space for action
+ run: rm -rf /opt/hostedtoolcache
+
+ - uses: actions/checkout@v4
+ - name: Setup Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python }}
+
+ - name: Install git and ffmpeg
+ run: sudo apt-get update && sudo apt-get install -y git ffmpeg
+
+ - name: Execute Install.sh
+ run: |
+ chmod +x ./Install.sh
+ ./Install.sh
+
+ - name: Execute start-webui.sh
+ run: |
+ chmod +x ./start-webui.sh
+ timeout 60s ./start-webui.sh || true
+
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..33a084802a8302a86f32c31ad57b4b480d59ed24
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,41 @@
+name: CI
+
+on:
+ workflow_dispatch:
+
+ push:
+ branches:
+ - master
+ pull_request:
+ branches:
+ - master
+
+jobs:
+ build:
+
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python: ["3.10"]
+
+ env:
+ DEEPL_API_KEY: ${{ secrets.DEEPL_API_KEY }}
+
+ steps:
+ - name: Clean up space for action
+ run: rm -rf /opt/hostedtoolcache
+
+ - uses: actions/checkout@v4
+ - name: Setup Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python }}
+
+ - name: Install git and ffmpeg
+ run: sudo apt-get update && sudo apt-get install -y git ffmpeg
+
+ - name: Install dependencies
+ run: pip install -r requirements.txt pytest
+
+ - name: Run test
+ run: python -m pytest -rs tests
\ No newline at end of file
diff --git a/.github/workflows/publish-docker.yml b/.github/workflows/publish-docker.yml
new file mode 100644
index 0000000000000000000000000000000000000000..99da1b6da36e42b5eebc872bf2a05d6118fc7e50
--- /dev/null
+++ b/.github/workflows/publish-docker.yml
@@ -0,0 +1,37 @@
+name: Publish to Docker Hub
+
+on:
+ push:
+ branches:
+ - master
+
+jobs:
+ build-and-push:
+ runs-on: ubuntu-latest
+
+ steps:
+ - name: Log in to Docker Hub
+ uses: docker/login-action@v2
+ with:
+ username: ${{ secrets.DOCKER_USERNAME }}
+ password: ${{ secrets.DOCKER_PASSWORD }}
+
+ - name: Checkout repository
+ uses: actions/checkout@v3
+
+ - name: Set up Docker Buildx
+ uses: docker/setup-buildx-action@v3
+
+ - name: Set up QEMU
+ uses: docker/setup-qemu-action@v3
+
+ - name: Build and push Docker image
+ uses: docker/build-push-action@v5
+ with:
+ context: .
+ file: ./Dockerfile
+ push: true
+ tags: ${{ secrets.DOCKER_USERNAME }}/whisper-webui:latest
+
+ - name: Log out of Docker Hub
+ run: docker logout
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..9cd9e3837fec930d3df2ff4430af26f40e672a82
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,13 @@
+*.wav
+*.png
+*.mp4
+*.mp3
+.idea/
+.pytest_cache/
+venv/
+modules/ui/__pycache__/
+outputs/
+modules/__pycache__/
+models/
+modules/yt_tmp.wav
+configs/default_parameters.yaml
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..5604def4a9c7cf253d1d2fcbbcf005615691cf69
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,34 @@
+FROM debian:bookworm-slim AS builder
+
+RUN apt-get update && \
+ apt-get install -y curl git python3 python3-pip python3-venv && \
+ rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* && \
+ mkdir -p /Whisper-WebUI
+
+WORKDIR /Whisper-WebUI
+
+COPY requirements.txt .
+
+RUN python3 -m venv venv && \
+ . venv/bin/activate && \
+ pip install --no-cache-dir -r requirements.txt
+
+
+FROM debian:bookworm-slim AS runtime
+
+RUN apt-get update && \
+ apt-get install -y curl ffmpeg python3 && \
+ rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+WORKDIR /Whisper-WebUI
+
+COPY . .
+COPY --from=builder /Whisper-WebUI/venv /Whisper-WebUI/venv
+
+VOLUME [ "/Whisper-WebUI/models" ]
+VOLUME [ "/Whisper-WebUI/outputs" ]
+
+ENV PATH="/Whisper-WebUI/venv/bin:$PATH"
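+# Make the cuBLAS / cuDNN libraries installed from the NVIDIA pip wheels visible at
+# runtime; faster-whisper (CTranslate2) loads them from these paths.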
+ENV LD_LIBRARY_PATH=/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cublas/lib:/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cudnn/lib
+
+ENTRYPOINT [ "python", "app.py" ]
diff --git a/Install.bat b/Install.bat
new file mode 100644
index 0000000000000000000000000000000000000000..7c3f496a2091ba89b3e6f8582cbfaa35d50b7b19
--- /dev/null
+++ b/Install.bat
@@ -0,0 +1,20 @@
+@echo off
+
+if not exist "%~dp0\venv\Scripts" (
+ echo Creating venv...
+ python -m venv venv
+)
+echo Checked the venv folder. Now installing requirements...
+
+call "%~dp0\venv\scripts\activate"
+
+pip install -r requirements.txt
+
+if errorlevel 1 (
+ echo.
+    echo Requirements installation failed. Please remove the venv folder and run Install.bat again.
+) else (
+ echo.
+ echo Requirements installed successfully.
+)
+pause
\ No newline at end of file
diff --git a/Install.sh b/Install.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6ba3148ebd904101496e7198040d65569c1fb6b5
--- /dev/null
+++ b/Install.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+if [ ! -d "venv" ]; then
+ echo "Creating virtual environment..."
+ python -m venv venv
+fi
+
+source venv/bin/activate
+
+pip install -r requirements.txt && echo "Requirements installed successfully." || {
+ echo ""
+ echo "Requirements installation failed. Please remove the venv folder and run the script again."
+ deactivate
+ exit 1
+}
+
+deactivate
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..dd84d44b86260eb3206817f7c184f0534c1bd5a8
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2023 jhj0517
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/README.md b/README.md
index 0f8816a69a7e9d6cc1402378fce440682bbfec29..af4b32a98373691f297ee0c1f2805c4b6f7a6f03 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,117 @@
----
-title: Whisper WebUI
-emoji: π
-colorFrom: red
-colorTo: pink
-sdk: gradio
-sdk_version: 5.5.0
-app_file: app.py
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Whisper-WebUI
+A Gradio-based browser interface for [Whisper](https://github.com/openai/whisper). You can use it as an Easy Subtitle Generator!
+
+
+
+## Notebook
+If you wish to try this on Colab, you can do it [here](https://colab.research.google.com/github/jhj0517/Whisper-WebUI/blob/master/notebook/whisper-webui.ipynb)!
+
+# Feature
+- Select the Whisper implementation you want to use from:
+  - [openai/whisper](https://github.com/openai/whisper)
+  - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) (used by default)
+  - [Vaibhavs10/insanely-fast-whisper](https://github.com/Vaibhavs10/insanely-fast-whisper)
+- Generate subtitles from various sources, including:
+  - Files
+  - Youtube
+  - Microphone
+- Currently supported subtitle formats:
+  - SRT
+  - WebVTT
+  - txt (plain text only, without timestamps)
+- Speech to Text Translation
+  - From other languages to English. (This is Whisper's end-to-end speech-to-text translation feature.)
+- Text to Text Translation
+ - Translate subtitle files using Facebook NLLB models
+ - Translate subtitle files using DeepL API
+- Pre-processing audio input with [Silero VAD](https://github.com/snakers4/silero-vad).
+- Pre-processing audio input to separate BGM with [UVR](https://github.com/Anjok07/ultimatevocalremovergui), [UVR-api](https://github.com/NextAudioGen/ultimatevocalremover_api).
+- Post-processing with speaker diarization using the [pyannote](https://huggingface.co/pyannote/speaker-diarization-3.1) model.
+ - To download the pyannote model, you need to have a Huggingface token and manually accept their terms in the pages below.
+ 1. https://huggingface.co/pyannote/speaker-diarization-3.1
+ 2. https://huggingface.co/pyannote/segmentation-3.0
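+
+If you want to run the diarization step programmatically (outside the WebUI), a minimal sketch using the bundled `Diarizer` class could look like the following. The audio path, token, and segment dicts are placeholders, and the segment shape and return values are assumed from the docstring in `modules/diarize/diarizer.py`:
+
+```python
+from modules.diarize.diarizer import Diarizer
+
+diarizer = Diarizer()  # models are cached in the default diarization model directory
+
+# Hypothetical transcription result: start/end timestamps in seconds plus text.
+segments = [
+    {"start": 0.0, "end": 3.2, "text": "Hello there."},
+    {"start": 3.2, "end": 6.0, "text": "How are you?"},
+]
+
+# The Huggingface token is only needed the first time the pyannote model is downloaded.
+diarized_segments, elapsed_time = diarizer.run(
+    audio="sample.wav",
+    transcribed_result=segments,
+    use_auth_token="YOUR_HF_TOKEN",
+)
+print(diarized_segments)
+```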
+
+# Installation and Running
+### Prerequisite
+To run this WebUI, you need `git`, `python` (version 3.8 ~ 3.10), and `FFmpeg`.
+If you are not using an Nvidia GPU, or you use a `CUDA` version other than 12.4, edit the [`requirements.txt`](https://github.com/jhj0517/Whisper-WebUI/blob/master/requirements.txt) to match your environment.
+
+Please follow the links below to install the necessary software:
+- git : [https://git-scm.com/downloads](https://git-scm.com/downloads)
+- python : [https://www.python.org/downloads/](https://www.python.org/downloads/) **(If your Python version is too new, torch will not install properly.)**
+- FFmpeg : [https://ffmpeg.org/download.html](https://ffmpeg.org/download.html)
+- CUDA : [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads)
+
+After installing FFmpeg, **make sure to add the `FFmpeg/bin` folder to your system PATH!**
+
+### Automatic Installation
+
+1. Download `Whisper-WebUI.zip` with the file corresponding to your OS from [v1.0.0](https://github.com/jhj0517/Whisper-WebUI/releases/tag/v1.0.0) and extract its contents.
+2. Run `Install.bat` or `Install.sh` to install dependencies. (This creates a `venv` directory and installs the dependencies there.)
+3. Start the WebUI with `start-webui.bat` or `start-webui.sh`.
+4. To update the WebUI, run `update.bat` or `update.sh`.
+
+You can also run the project with command-line arguments; see the [wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments) for a guide to the available arguments.
+
+## Running with Docker
+
+1. Install and launch [Docker-Desktop](https://www.docker.com/products/docker-desktop/).
+
+2. Git clone the repository
+
+```sh
+git clone https://github.com/jhj0517/Whisper-WebUI.git
+```
+
+3. Build the image (the image is about 7 GB)
+
+```sh
+docker compose build
+```
+
+4. Run the container
+
+```sh
+docker compose up
+```
+
+5. Connect to the WebUI with your browser at `http://localhost:7860`
+
+If needed, update the [`docker-compose.yaml`](https://github.com/jhj0517/Whisper-WebUI/blob/master/docker-compose.yaml) to match your environment.
+
+# VRAM Usage
+This project is integrated with [faster-whisper](https://github.com/guillaumekln/faster-whisper) by default for better VRAM usage and transcription speed.
+
+According to faster-whisper, the efficiency of the optimized whisper model is as follows:
+| Implementation | Precision | Beam size | Time | Max. GPU memory | Max. CPU memory |
+|-------------------|-----------|-----------|-------|-----------------|-----------------|
+| openai/whisper | fp16 | 5 | 4m30s | 11325MB | 9439MB |
+| faster-whisper | fp16 | 5 | 54s | 4755MB | 3244MB |
+
+If you want to use an implementation other than faster-whisper, set the `--whisper_type` argument to the repository name (e.g. `python app.py --whisper_type whisper`).
+Read the [wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments) for more info about the CLI arguments.
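+
+For reference, `app.py` maps this flag to an implementation through `WhisperFactory`; a simplified sketch of the same call, using the directory defaults that `app.py` itself falls back to:
+
+```python
+from modules.utils.paths import (WHISPER_MODELS_DIR, FASTER_WHISPER_MODELS_DIR,
+                                 INSANELY_FAST_WHISPER_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
+from modules.whisper.whisper_factory import WhisperFactory
+
+# "whisper" selects openai/whisper; "faster-whisper" and "insanely-fast-whisper" are the other options.
+whisper_inf = WhisperFactory.create_whisper_inference(
+    whisper_type="whisper",
+    whisper_model_dir=WHISPER_MODELS_DIR,
+    faster_whisper_model_dir=FASTER_WHISPER_MODELS_DIR,
+    insanely_fast_whisper_model_dir=INSANELY_FAST_WHISPER_MODELS_DIR,
+    uvr_model_dir=UVR_MODELS_DIR,
+    output_dir=OUTPUT_DIR,
+)
+```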
+
+## Available models
+This is Whisper's original VRAM usage table for models.
+
+| Size | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
+|:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:|
+| tiny | 39 M | `tiny.en` | `tiny` | ~1 GB | ~32x |
+| base | 74 M | `base.en` | `base` | ~1 GB | ~16x |
+| small | 244 M | `small.en` | `small` | ~2 GB | ~6x |
+| medium | 769 M | `medium.en` | `medium` | ~5 GB | ~2x |
+| large | 1550 M | N/A | `large` | ~10 GB | 1x |
+
+
+`.en` models are English-only. The `Translate to English` option is available only with the multilingual `large` models.
+
+## TODO
+
+- [x] Add DeepL API translation
+- [x] Add NLLB Model translation
+- [x] Integrate with faster-whisper
+- [x] Integrate with insanely-fast-whisper
+- [x] Integrate with whisperX (speaker diarization only)
+- [x] Add background music separation pre-processing with [UVR](https://github.com/Anjok07/ultimatevocalremovergui)
+- [ ] Add FastAPI script
+- [ ] Support real-time transcription for microphone
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..46bb0e9febf8039f84f2d4483ac9b5fc15fb8c4a
--- /dev/null
+++ b/app.py
@@ -0,0 +1,359 @@
+import os
+import argparse
+import gradio as gr
+import yaml
+
+from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
+ INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
+ UVR_MODELS_DIR)
+from modules.utils.files_manager import load_yaml
+from modules.whisper.whisper_factory import WhisperFactory
+from modules.whisper.faster_whisper_inference import FasterWhisperInference
+from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
+from modules.translation.nllb_inference import NLLBInference
+from modules.ui.htmls import *
+from modules.utils.cli_manager import str2bool
+from modules.utils.youtube_manager import get_ytmetas
+from modules.translation.deepl_api import DeepLAPI
+from modules.whisper.whisper_parameter import *
+
+### Device info ###
+import torch
+import torchaudio
+import torch.cuda as cuda
+import platform
+from transformers import __version__ as transformers_version
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+num_gpus = cuda.device_count() if torch.cuda.is_available() else 0
+cuda_version = torch.version.cuda if torch.cuda.is_available() else "N/A"
+cudnn_version = torch.backends.cudnn.version() if torch.cuda.is_available() else "N/A"
+os_info = platform.system() + " " + platform.release() + " " + platform.machine()
+
+# Get the available VRAM for each GPU (if available)
+vram_info = []
+if torch.cuda.is_available():
+ for i in range(cuda.device_count()):
+ gpu_properties = cuda.get_device_properties(i)
+ vram_info.append(f"**GPU {i}: {gpu_properties.total_memory / 1024**3:.2f} GB**")
+
+pytorch_version = torch.__version__
+torchaudio_version = torchaudio.__version__ if 'torchaudio' in dir() else "N/A"
+
+device_info = f"""Running on: **{device}**
+
+ Number of GPUs available: **{num_gpus}**
+
+ CUDA version: **{cuda_version}**
+
+ CuDNN version: **{cudnn_version}**
+
+ PyTorch version: **{pytorch_version}**
+
+ Torchaudio version: **{torchaudio_version}**
+
+ Transformers version: **{transformers_version}**
+
+ Operating system: **{os_info}**
+
+ Available VRAM:
+ \t {', '.join(vram_info) if vram_info else '**N/A**'}
+"""
+### End Device info ###
+
+class App:
+ def __init__(self, args):
+ self.args = args
+ #self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600))
+ self.app = gr.Blocks(css=CSS, theme=gr.themes.Ocean(), delete_cache=(60, 3600))
+ self.whisper_inf = WhisperFactory.create_whisper_inference(
+ whisper_type=self.args.whisper_type,
+ whisper_model_dir=self.args.whisper_model_dir,
+ faster_whisper_model_dir=self.args.faster_whisper_model_dir,
+ insanely_fast_whisper_model_dir=self.args.insanely_fast_whisper_model_dir,
+ uvr_model_dir=self.args.uvr_model_dir,
+ output_dir=self.args.output_dir,
+ )
+ self.nllb_inf = NLLBInference(
+ model_dir=self.args.nllb_model_dir,
+ output_dir=os.path.join(self.args.output_dir, "translations")
+ )
+ self.deepl_api = DeepLAPI(
+ output_dir=os.path.join(self.args.output_dir, "translations")
+ )
+ self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
+ print(f"Use \"{self.args.whisper_type}\" implementation")
+ print(f"Device \"{self.whisper_inf.device}\" is detected")
+
+ def create_whisper_parameters(self):
+
+ whisper_params = self.default_params["whisper"]
+ diarization_params = self.default_params["diarization"]
+ vad_params = self.default_params["vad"]
+ uvr_params = self.default_params["bgm_separation"]
+
+ with gr.Row():
+ dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],label="Model")
+ dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,value=whisper_params["lang"], label="Language")
+ #dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
+ dd_file_format = gr.Dropdown(choices=["SRT", "txt"], value="SRT", label="Output format")
+
+ with gr.Row():
+ cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add timestamp to output file",interactive=True)
+ cb_diarize = gr.Checkbox(label="Speaker diarization", value=diarization_params["is_diarize"])
+ cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label="Translate to English",interactive=True)
+
+ with gr.Accordion("Diarization options", open=False):
+ tb_hf_token = gr.Text(label="HuggingFace Token", value=diarization_params["hf_token"],
+ info="This is only needed the first time you download the model. If you already have"
+ " models, you don't need to enter. To download the model, you must manually go "
+ "to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to"
+ " their requirement.")
+ dd_diarization_device = gr.Dropdown(label="Device",
+ choices=self.whisper_inf.diarizer.get_available_device(),
+ value=self.whisper_inf.diarizer.get_device())
+
+ with gr.Accordion("Advanced options", open=False):
+ nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True,
+ info="Beam size to use for decoding.")
+ nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=whisper_params["log_prob_threshold"], interactive=True,
+ info="If the average log probability over sampled tokens is below this value, treat as failed.")
+ nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=whisper_params["no_speech_threshold"], interactive=True,
+ info="If the no speech probability is higher than this value AND the average log probability over sampled tokens is below 'Log Prob Threshold', consider the segment as silent.")
+ dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types,
+ value=self.whisper_inf.current_compute_type, interactive=True,
+ allow_custom_value=True,
+ info="Select the type of computation to perform.")
+ nb_best_of = gr.Number(label="Best Of", value=whisper_params["best_of"], interactive=True,
+ info="Number of candidates when sampling with non-zero temperature.")
+ nb_patience = gr.Number(label="Patience", value=whisper_params["patience"], interactive=True,
+ info="Beam search patience factor.")
+ cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=whisper_params["condition_on_previous_text"],
+ interactive=True,
+ info="Condition on previous text during decoding.")
+ sld_prompt_reset_on_temperature = gr.Slider(label="Prompt Reset On Temperature", value=whisper_params["prompt_reset_on_temperature"],
+ minimum=0, maximum=1, step=0.01, interactive=True,
+ info="Resets prompt if temperature is above this value."
+ " Arg has effect only if 'Condition On Previous Text' is True.")
+ tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True,
+ info="Initial prompt to use for decoding.")
+ sd_temperature = gr.Slider(label="Temperature", value=whisper_params["temperature"], minimum=0.0,
+ step=0.01, maximum=1.0, interactive=True,
+ info="Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `Compression Ratio Threshold` or `Log Prob Threshold`.")
+ nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=whisper_params["compression_ratio_threshold"],
+ interactive=True,
+ info="If the gzip compression ratio is above this value, treat as failed.")
+ nb_chunk_length = gr.Number(label="Chunk Length (s)", value=lambda: whisper_params["chunk_length"],
+ precision=0,
+ info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
+ with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
+ nb_length_penalty = gr.Number(label="Length Penalty", value=whisper_params["length_penalty"],
+ info="Exponential length penalty constant.")
+ nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=whisper_params["repetition_penalty"],
+ info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
+ nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=whisper_params["no_repeat_ngram_size"],
+ precision=0,
+ info="Prevent repetitions of n-grams with this size (set 0 to disable).")
+ tb_prefix = gr.Textbox(label="Prefix", value=lambda: whisper_params["prefix"],
+ info="Optional text to provide as a prefix for the first window.")
+ cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=whisper_params["suppress_blank"],
+ info="Suppress blank outputs at the beginning of the sampling.")
+ tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value=whisper_params["suppress_tokens"],
+ info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
+ nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=whisper_params["max_initial_timestamp"],
+ info="The initial timestamp cannot be later than this.")
+ cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=whisper_params["word_timestamps"],
+ info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
+ tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value=whisper_params["prepend_punctuations"],
+ info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
+ tb_append_punctuations = gr.Textbox(label="Append Punctuations", value=whisper_params["append_punctuations"],
+ info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
+ nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: whisper_params["max_new_tokens"],
+ precision=0,
+ info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
+ nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
+ value=lambda: whisper_params["hallucination_silence_threshold"],
+ info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
+ tb_hotwords = gr.Textbox(label="Hotwords", value=lambda: whisper_params["hotwords"],
+ info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
+ nb_language_detection_threshold = gr.Number(label="Language Detection Threshold", value=lambda: whisper_params["language_detection_threshold"],
+ info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
+ nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=lambda: whisper_params["language_detection_segments"],
+ precision=0,
+ info="Number of segments to consider for the language detection.")
+ with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
+ nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
+
+ with gr.Accordion("Background Music Remover Filter", open=False):
+ cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
+ interactive=True,
+ info="Enabling this will remove background music by submodel before"
+ " transcribing ")
+ dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
+ choices=self.whisper_inf.music_separator.available_devices)
+ dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
+ choices=self.whisper_inf.music_separator.available_models)
+ nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
+ cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"])
+ cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",
+ value=uvr_params["enable_offload"])
+
+ with gr.Accordion("Voice Detection Filter", open=False):
+ cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
+ interactive=True,
+ info="Enable this to transcribe only detected voice parts by submodel.")
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
+ value=vad_params["threshold"],
+ info="Lower it to be more sensitive to small sounds.")
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
+ value=vad_params["min_speech_duration_ms"],
+ info="Final speech chunks shorter than this time are thrown out")
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
+ value=vad_params["max_speech_duration_s"],
+ info="Maximum duration of speech chunks in \"seconds\".")
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
+ value=vad_params["min_silence_duration_ms"],
+ info="In the end of each speech chunk wait for this time"
+ " before separating it")
+ nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
+ info="Final speech chunks are padded by this time each side")
+
+ dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
+
+ return (
+ WhisperParameters(
+ model_size=dd_model, lang=dd_lang, is_translate=cb_translate, beam_size=nb_beam_size,
+ log_prob_threshold=nb_log_prob_threshold, no_speech_threshold=nb_no_speech_threshold,
+ compute_type=dd_compute_type, best_of=nb_best_of, patience=nb_patience,
+ condition_on_previous_text=cb_condition_on_previous_text, initial_prompt=tb_initial_prompt,
+ temperature=sd_temperature, compression_ratio_threshold=nb_compression_ratio_threshold,
+ vad_filter=cb_vad_filter, threshold=sd_threshold, min_speech_duration_ms=nb_min_speech_duration_ms,
+ max_speech_duration_s=nb_max_speech_duration_s, min_silence_duration_ms=nb_min_silence_duration_ms,
+ speech_pad_ms=nb_speech_pad_ms, chunk_length=nb_chunk_length, batch_size=nb_batch_size,
+ is_diarize=cb_diarize, hf_token=tb_hf_token, diarization_device=dd_diarization_device,
+ length_penalty=nb_length_penalty, repetition_penalty=nb_repetition_penalty,
+ no_repeat_ngram_size=nb_no_repeat_ngram_size, prefix=tb_prefix, suppress_blank=cb_suppress_blank,
+ suppress_tokens=tb_suppress_tokens, max_initial_timestamp=nb_max_initial_timestamp,
+ word_timestamps=cb_word_timestamps, prepend_punctuations=tb_prepend_punctuations,
+ append_punctuations=tb_append_punctuations, max_new_tokens=nb_max_new_tokens,
+ hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
+ language_detection_threshold=nb_language_detection_threshold,
+ language_detection_segments=nb_language_detection_segments,
+ prompt_reset_on_temperature=sld_prompt_reset_on_temperature, is_bgm_separate=cb_bgm_separation,
+ uvr_device=dd_uvr_device, uvr_model_size=dd_uvr_model_size, uvr_segment_size=nb_uvr_segment_size,
+ uvr_save_file=cb_uvr_save_file, uvr_enable_offload=cb_uvr_enable_offload
+ ),
+ dd_file_format,
+ cb_timestamp
+ )
+
+ def launch(self):
+ translation_params = self.default_params["translation"]
+ deepl_params = translation_params["deepl"]
+ nllb_params = translation_params["nllb"]
+ uvr_params = self.default_params["bgm_separation"]
+
+ with self.app:
+ with gr.Row():
+ with gr.Column():
+ gr.Markdown(MARKDOWN, elem_id="md_project")
+ with gr.Tabs():
+ with gr.TabItem("Audio"): # tab1
+ with gr.Column():
+ #input_file = gr.Files(type="filepath", label="Upload File here")
+ input_file = gr.Audio(type='filepath', elem_id="audio_input")
+ tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
+ info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
+ " Leave this field empty if you do not wish to use a local path.",
+ visible=self.args.colab,
+ value="")
+
+ whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
+
+ with gr.Row():
+ btn_run = gr.Button("Transcribe", variant="primary")
+ btn_reset = gr.Button(value="Reset")
+ btn_reset.click(None,js="window.location.reload()")
+ with gr.Row():
+ with gr.Column(scale=3):
+ tb_indicator = gr.Textbox(label="Output result")
+ with gr.Column(scale=1):
+ tb_info = gr.Textbox(label="Output info", interactive=False, scale=3)
+ files_subtitles = gr.Files(label="Output file", interactive=False, scale=2)
+ # btn_openfolder = gr.Button('π', scale=1)
+
+ params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
+ btn_run.click(fn=self.whisper_inf.transcribe_file,
+ inputs=params + whisper_params.as_list(),
+ outputs=[tb_indicator, files_subtitles, tb_info])
+ # btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
+
+ with gr.TabItem("Device info"): # tab2
+ with gr.Column():
+ gr.Markdown(device_info, label="Hardware info & installed packages")
+
+ # Launch the app with optional gradio settings
+ args = self.args
+
+ self.app.queue(
+ api_open=args.api_open
+ ).launch(
+ share=args.share,
+ server_name=args.server_name,
+ server_port=args.server_port,
+ auth=(args.username, args.password) if args.username and args.password else None,
+ root_path=args.root_path,
+ inbrowser=args.inbrowser
+ )
+
+ @staticmethod
+ def open_folder(folder_path: str):
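+        # Note: "start" is a Windows shell command, so this helper only opens the folder on Windows hosts.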
+ if os.path.exists(folder_path):
+ os.system(f"start {folder_path}")
+ else:
+ os.makedirs(folder_path, exist_ok=True)
+            print(f"The directory path {folder_path} has been newly created.")
+
+ @staticmethod
+ def on_change_models(model_size: str):
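+        # The end-to-end "Translate to English" option is only shown for the multilingual
+        # large models; for other model sizes the checkbox is hidden.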
+ translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
+ if model_size not in translatable_model:
+ return gr.Checkbox(visible=False, value=False, interactive=False)
+ #return gr.Checkbox(visible=True, value=False, label="Translate to English (large models only)", interactive=False)
+ else:
+ return gr.Checkbox(visible=True, value=False, label="Translate to English", interactive=True)
+
+
+# Create the parser for command-line arguments
+parser = argparse.ArgumentParser()
+parser.add_argument('--whisper_type', type=str, default="faster-whisper",
+ help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]')
+parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True, help='Gradio share value')
+parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
+parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
+parser.add_argument('--root_path', type=str, default=None, help='Gradio root path')
+parser.add_argument('--username', type=str, default=None, help='Gradio authentication username')
+parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
+parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
+parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True, help='Is colab user or not')
+parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True, help='Enable api or not in Gradio')
+parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True, help='Whether to automatically start Gradio app or not')
+parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
+ help='Directory path of the whisper model')
+parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
+ help='Directory path of the faster-whisper model')
+parser.add_argument('--insanely_fast_whisper_model_dir', type=str,
+ default=INSANELY_FAST_WHISPER_MODELS_DIR,
+ help='Directory path of the insanely-fast-whisper model')
+parser.add_argument('--diarization_model_dir', type=str, default=DIARIZATION_MODELS_DIR,
+ help='Directory path of the diarization model')
+parser.add_argument('--nllb_model_dir', type=str, default=NLLB_MODELS_DIR,
+ help='Directory path of the Facebook NLLB model')
+parser.add_argument('--uvr_model_dir', type=str, default=UVR_MODELS_DIR,
+ help='Directory path of the UVR model')
+parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR, help='Directory path of the outputs')
+_args = parser.parse_args()
+
+if __name__ == "__main__":
+ app = App(args=_args)
+ app.launch()
diff --git a/configs/default_parameters.yaml b/configs/default_parameters.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8eace9295fd6f6ccece85ecfe53573e9b70367f9
--- /dev/null
+++ b/configs/default_parameters.yaml
@@ -0,0 +1,64 @@
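+# Default parameters loaded by app.py via load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH).
+# Edit these values to change the defaults that are pre-filled in the WebUI.
+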
+whisper:
+ model_size: "large-v3"
+ lang: "Automatic Detection"
+ is_translate: false
+ beam_size: 5
+ log_prob_threshold: -1
+ no_speech_threshold: 0.6
+ best_of: 5
+ patience: 1
+ condition_on_previous_text: true
+ prompt_reset_on_temperature: 0.5
+ initial_prompt: null
+ temperature: 0
+ compression_ratio_threshold: 2.4
+ chunk_length: 30
+ batch_size: 24
+ length_penalty: 1
+ repetition_penalty: 1
+ no_repeat_ngram_size: 0
+ prefix: null
+ suppress_blank: true
+ suppress_tokens: "[-1]"
+ max_initial_timestamp: 1
+ word_timestamps: false
+ prepend_punctuations: "\"'βΒΏ([{-"
+ append_punctuations: "\"'.γ,οΌ!οΌ?οΌ:οΌβ)]}γ"
+ max_new_tokens: null
+ hallucination_silence_threshold: null
+ hotwords: null
+ language_detection_threshold: null
+ language_detection_segments: 1
+ add_timestamp: false
+
+vad:
+ vad_filter: false
+ threshold: 0.5
+ min_speech_duration_ms: 250
+ max_speech_duration_s: 9999
+ min_silence_duration_ms: 1000
+ speech_pad_ms: 2000
+
+diarization:
+ is_diarize: false
+ hf_token: ""
+
+bgm_separation:
+ is_separate_bgm: false
+ model_size: "UVR-MDX-NET-Inst_HQ_4"
+ segment_size: 256
+ save_file: false
+ enable_offload: true
+
+translation:
+ deepl:
+ api_key: ""
+ is_pro: false
+ source_lang: "Automatic Detection"
+ target_lang: "English"
+ nllb:
+ model_size: "facebook/nllb-200-1.3B"
+ source_lang: null
+ target_lang: null
+ max_length: 200
+ add_timestamp: true
diff --git a/demo/audio.wav b/demo/audio.wav
new file mode 100644
index 0000000000000000000000000000000000000000..34a90504fb551b558e93ca3a899221ceda3b2686
Binary files /dev/null and b/demo/audio.wav differ
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e0ccfbc3a817ee5ec2a9b747070f8a49f5aef055
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,29 @@
+services:
+ app:
+ build: .
+ image: whisper-webui:latest
+
+ volumes:
+      # Update the paths below to mount the models and outputs directories to your own host paths, e.g.:
+ # - C:/whisper-models/custom-path:/Whisper-WebUI/models
+ # - C:/whisper-webui-outputs/custom-path:/Whisper-WebUI/outputs
+ - /Whisper-WebUI/models
+ - /Whisper-WebUI/outputs
+
+ ports:
+ - "7860:7860"
+
+ stdin_open: true
+ tty: true
+
+ entrypoint: ["python", "app.py", "--server_port", "7860", "--server_name", "0.0.0.0",]
+
+    # If you're not using an Nvidia GPU, update the device settings below to match your hardware.
+ # See more info at : https://docs.docker.com/compose/compose-file/deploy/#driver
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ count: all
+ capabilities: [ gpu ]
diff --git a/models/models will be saved here.txt b/models/models will be saved here.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/modules/__init__.py b/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/modules/diarize/__init__.py b/modules/diarize/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/modules/diarize/audio_loader.py b/modules/diarize/audio_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..d90e52c3eea45c1e737e789cde9bcc637c46da90
--- /dev/null
+++ b/modules/diarize/audio_loader.py
@@ -0,0 +1,179 @@
+# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/audio.py
+
+import os
+import subprocess
+from functools import lru_cache
+from typing import Optional, Union
+from scipy.io.wavfile import write
+import tempfile
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+def exact_div(x, y):
+ assert x % y == 0
+ return x // y
+
+# hard-coded audio hyperparameters
+SAMPLE_RATE = 16000
+N_FFT = 400
+HOP_LENGTH = 160
+CHUNK_LENGTH = 30
+N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000 samples in a 30-second chunk
+N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH) # 3000 frames in a mel spectrogram input
+
+N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2  # the initial convolutions have stride 2
+FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH) # 10ms per audio frame
+TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN) # 20ms per audio token
+
+
+def load_audio(file: Union[str, np.ndarray], sr: int = SAMPLE_RATE) -> np.ndarray:
+ """
+ Open an audio file or process a numpy array containing audio data as mono waveform, resampling as necessary.
+
+ Parameters
+ ----------
+ file: Union[str, np.ndarray]
+ The audio file to open or a numpy array containing the audio data.
+
+ sr: int
+ The sample rate to resample the audio if necessary.
+
+ Returns
+ -------
+ A NumPy array containing the audio waveform, in float32 dtype.
+ """
+ if isinstance(file, np.ndarray):
+ if file.dtype != np.float32:
+ file = file.astype(np.float32)
+ if file.ndim > 1:
+ file = np.mean(file, axis=1)
+
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+ write(temp_file.name, SAMPLE_RATE, (file * 32768).astype(np.int16))
+ temp_file_path = temp_file.name
+ temp_file.close()
+ else:
+ temp_file_path = file
+
+ try:
+ cmd = [
+ "ffmpeg",
+ "-nostdin",
+ "-threads",
+ "0",
+ "-i",
+ temp_file_path,
+ "-f",
+ "s16le",
+ "-ac",
+ "1",
+ "-acodec",
+ "pcm_s16le",
+ "-ar",
+ str(sr),
+ "-",
+ ]
+ out = subprocess.run(cmd, capture_output=True, check=True).stdout
+ except subprocess.CalledProcessError as e:
+ raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+ finally:
+ if isinstance(file, np.ndarray):
+ os.remove(temp_file_path)
+
+ return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
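+
+# Illustrative usage (hypothetical file name): load_audio("sample.wav") returns a mono
+# float32 waveform resampled to 16 kHz, ready for log_mel_spectrogram() below.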
+
+
+def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
+ """
+ Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
+ """
+ if torch.is_tensor(array):
+ if array.shape[axis] > length:
+ array = array.index_select(
+ dim=axis, index=torch.arange(length, device=array.device)
+ )
+
+ if array.shape[axis] < length:
+ pad_widths = [(0, 0)] * array.ndim
+ pad_widths[axis] = (0, length - array.shape[axis])
+ array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
+ else:
+ if array.shape[axis] > length:
+ array = array.take(indices=range(length), axis=axis)
+
+ if array.shape[axis] < length:
+ pad_widths = [(0, 0)] * array.ndim
+ pad_widths[axis] = (0, length - array.shape[axis])
+ array = np.pad(array, pad_widths)
+
+ return array
+
+
+@lru_cache(maxsize=None)
+def mel_filters(device, n_mels: int) -> torch.Tensor:
+ """
+ load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
+ Allows decoupling librosa dependency; saved using:
+
+ np.savez_compressed(
+ "mel_filters.npz",
+ mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
+ )
+ """
+ assert n_mels in [80, 128], f"Unsupported n_mels: {n_mels}"
+ with np.load(
+ os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
+ ) as f:
+ return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
+
+
+def log_mel_spectrogram(
+ audio: Union[str, np.ndarray, torch.Tensor],
+ n_mels: int,
+ padding: int = 0,
+ device: Optional[Union[str, torch.device]] = None,
+):
+ """
+    Compute the log-Mel spectrogram of the given audio.
+
+ Parameters
+ ----------
+ audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
+ The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
+
+ n_mels: int
+        The number of Mel-frequency filters, only 80 and 128 are supported
+
+ padding: int
+ Number of zero samples to pad to the right
+
+ device: Optional[Union[str, torch.device]]
+ If given, the audio tensor is moved to this device before STFT
+
+ Returns
+ -------
+    torch.Tensor, shape = (n_mels, n_frames)
+ A Tensor that contains the Mel spectrogram
+ """
+ if not torch.is_tensor(audio):
+ if isinstance(audio, str):
+ audio = load_audio(audio)
+ audio = torch.from_numpy(audio)
+
+ if device is not None:
+ audio = audio.to(device)
+ if padding > 0:
+ audio = F.pad(audio, (0, padding))
+ window = torch.hann_window(N_FFT).to(audio.device)
+ stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
+ magnitudes = stft[..., :-1].abs() ** 2
+
+ filters = mel_filters(audio.device, n_mels)
+ mel_spec = filters @ magnitudes
+
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+ log_spec = (log_spec + 4.0) / 4.0
+ return log_spec
\ No newline at end of file
diff --git a/modules/diarize/diarize_pipeline.py b/modules/diarize/diarize_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4109e8474a37a35f423cd1f0daa0ddc22ab1bef
--- /dev/null
+++ b/modules/diarize/diarize_pipeline.py
@@ -0,0 +1,95 @@
+# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/diarize.py
+
+import numpy as np
+import pandas as pd
+import os
+from pyannote.audio import Pipeline
+from typing import Optional, Union
+import torch
+
+from modules.utils.paths import DIARIZATION_MODELS_DIR
+from modules.diarize.audio_loader import load_audio, SAMPLE_RATE
+
+
+class DiarizationPipeline:
+ def __init__(
+ self,
+ model_name="pyannote/speaker-diarization-3.1",
+ cache_dir: str = DIARIZATION_MODELS_DIR,
+ use_auth_token=None,
+ device: Optional[Union[str, torch.device]] = "cpu",
+ ):
+ if isinstance(device, str):
+ device = torch.device(device)
+ self.model = Pipeline.from_pretrained(
+ model_name,
+ use_auth_token=use_auth_token,
+ cache_dir=cache_dir
+ ).to(device)
+
+ def __call__(self, audio: Union[str, np.ndarray], min_speakers=None, max_speakers=None):
+ if isinstance(audio, str):
+ audio = load_audio(audio)
+ audio_data = {
+ 'waveform': torch.from_numpy(audio[None, :]),
+ 'sample_rate': SAMPLE_RATE
+ }
+ segments = self.model(audio_data, min_speakers=min_speakers, max_speakers=max_speakers)
+ diarize_df = pd.DataFrame(segments.itertracks(yield_label=True), columns=['segment', 'label', 'speaker'])
+ diarize_df['start'] = diarize_df['segment'].apply(lambda x: x.start)
+ diarize_df['end'] = diarize_df['segment'].apply(lambda x: x.end)
+ return diarize_df
+
+
+def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
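+    # For each transcript segment (and each word with timestamps), pick the diarization
+    # speaker whose turns overlap it the most; with fill_nearest=True, fall back to the
+    # nearest speaker segment when there is no overlap at all.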
+ transcript_segments = transcript_result["segments"]
+ for seg in transcript_segments:
+ # assign speaker to segment (if any)
+ diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'],
+ seg['start'])
+ diarize_df['union'] = np.maximum(diarize_df['end'], seg['end']) - np.minimum(diarize_df['start'], seg['start'])
+
+ intersected = diarize_df[diarize_df["intersection"] > 0]
+
+ speaker = None
+ if len(intersected) > 0:
+            # Choose the speaker with the largest total intersection
+ speaker = intersected.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
+ elif fill_nearest:
+            # Otherwise fall back to the closest speaker
+ speaker = diarize_df.sort_values(by=["intersection"], ascending=False)["speaker"].values[0]
+
+ if speaker is not None:
+ seg["speaker"] = speaker
+
+ # assign speaker to words
+ if 'words' in seg:
+ for word in seg['words']:
+ if 'start' in word:
+ diarize_df['intersection'] = np.minimum(diarize_df['end'], word['end']) - np.maximum(
+ diarize_df['start'], word['start'])
+ diarize_df['union'] = np.maximum(diarize_df['end'], word['end']) - np.minimum(diarize_df['start'],
+ word['start'])
+
+ intersected = diarize_df[diarize_df["intersection"] > 0]
+
+ word_speaker = None
+ if len(intersected) > 0:
+                        # Choose the speaker with the largest total intersection
+ word_speaker = \
+ intersected.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
+ elif fill_nearest:
+                        # Otherwise fall back to the closest speaker
+ word_speaker = diarize_df.sort_values(by=["intersection"], ascending=False)["speaker"].values[0]
+
+ if word_speaker is not None:
+ word["speaker"] = word_speaker
+
+ return transcript_result
+
+
+class Segment:
+ def __init__(self, start, end, speaker=None):
+ self.start = start
+ self.end = end
+ self.speaker = speaker
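+
+
+# Illustrative usage sketch (commented out, not executed on import). The token,
+# file path, and whisper_segments below are placeholders; a Hugging Face token
+# with access to pyannote/speaker-diarization-3.1 is required to download the model.
+#
+#     pipeline = DiarizationPipeline(use_auth_token="hf_xxx", device="cuda")
+#     diarize_df = pipeline("sample.wav", min_speakers=1, max_speakers=4)
+#     result = assign_word_speakers(diarize_df, {"segments": whisper_segments})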
diff --git a/modules/diarize/diarizer.py b/modules/diarize/diarizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e24adc75f2b65ae99976423424466af194f55552
--- /dev/null
+++ b/modules/diarize/diarizer.py
@@ -0,0 +1,133 @@
+import os
+import torch
+from typing import List, Union, BinaryIO, Optional
+import numpy as np
+import time
+import logging
+
+from modules.utils.paths import DIARIZATION_MODELS_DIR
+from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
+from modules.diarize.audio_loader import load_audio
+
+
+class Diarizer:
+ def __init__(self,
+ model_dir: str = DIARIZATION_MODELS_DIR
+ ):
+ self.device = self.get_device()
+ self.available_device = self.get_available_device()
+ self.compute_type = "float16"
+ self.model_dir = model_dir
+ os.makedirs(self.model_dir, exist_ok=True)
+ self.pipe = None
+
+ def run(self,
+ audio: Union[str, BinaryIO, np.ndarray],
+ transcribed_result: List[dict],
+ use_auth_token: str,
+ device: Optional[str] = None
+ ):
+ """
+ Diarize transcribed result as a post-processing
+
+ Parameters
+ ----------
+ audio: Union[str, BinaryIO, np.ndarray]
+            Audio input. This can be a file path, a binary stream, or a NumPy array.
+ transcribed_result: List[dict]
+            Transcription result produced by Whisper.
+ use_auth_token: str
+            Hugging Face token with READ permission. This is needed only the first time the model is downloaded.
+            You must visit https://huggingface.co/pyannote/speaker-diarization-3.1 and accept the terms of use before the model can be downloaded.
+ device: Optional[str]
+ Device for diarization.
+
+ Returns
+ ----------
+ segments_result: List[dict]
+            List of dicts containing start/end timestamps and the transcribed text, prefixed with the speaker label
+ elapsed_time: float
+            Elapsed time of the diarization run, in seconds
+ """
+ start_time = time.time()
+
+ if device is None:
+ device = self.device
+
+ if device != self.device or self.pipe is None:
+ self.update_pipe(
+ device=device,
+ use_auth_token=use_auth_token
+ )
+
+ audio = load_audio(audio)
+
+ diarization_segments = self.pipe(audio)
+ diarized_result = assign_word_speakers(
+ diarization_segments,
+ {"segments": transcribed_result}
+ )
+
+ for segment in diarized_result["segments"]:
+ speaker = "None"
+ if "speaker" in segment:
+ speaker = segment["speaker"]
+ segment["text"] = speaker + ": " + segment["text"].strip()
+
+ elapsed_time = time.time() - start_time
+ return diarized_result["segments"], elapsed_time
+
+ def update_pipe(self,
+ use_auth_token: str,
+ device: str
+ ):
+ """
+ Set pipeline for diarization
+
+ Parameters
+ ----------
+ use_auth_token: str
+            Hugging Face token with READ permission. This is needed only the first time the model is downloaded.
+            You must visit https://huggingface.co/pyannote/speaker-diarization-3.1 and accept the terms of use before the model can be downloaded.
+ device: str
+ Device for diarization.
+ """
+ self.device = device
+
+ os.makedirs(self.model_dir, exist_ok=True)
+
+ if (not os.listdir(self.model_dir) and
+ not use_auth_token):
+ print(
+ "\nFailed to diarize. You need huggingface token and agree to their requirements to download the diarization model.\n"
+ "Go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and follow their instructions to download the model.\n"
+ )
+ return
+
+ logger = logging.getLogger("speechbrain.utils.train_logger")
+ # Disable redundant torchvision warning message
+ logger.disabled = True
+ self.pipe = DiarizationPipeline(
+ use_auth_token=use_auth_token,
+ device=device,
+ cache_dir=self.model_dir
+ )
+ logger.disabled = False
+
+ @staticmethod
+ def get_device():
+ if torch.cuda.is_available():
+ return "cuda"
+ elif torch.backends.mps.is_available():
+ return "mps"
+ else:
+ return "cpu"
+
+ @staticmethod
+ def get_available_device():
+ devices = ["cpu"]
+ if torch.cuda.is_available():
+ devices.append("cuda")
+ elif torch.backends.mps.is_available():
+ devices.append("mps")
+ return devices
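+
+
+# Illustrative usage sketch (commented out). "audio.wav", "hf_xxx", and
+# whisper_segments are placeholders; run() builds the pyannote pipeline on first
+# use via update_pipe() and prefixes each segment's text with its speaker label.
+#
+#     diarizer = Diarizer()
+#     segments, elapsed = diarizer.run(
+#         audio="audio.wav",
+#         transcribed_result=whisper_segments,   # list of segment dicts from Whisper
+#         use_auth_token="hf_xxx",
+#     )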
\ No newline at end of file
diff --git a/modules/translation/__init__.py b/modules/translation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/modules/translation/deepl_api.py b/modules/translation/deepl_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..385b3a14bfa201021d45d6a0ecf1dc83c1c955f6
--- /dev/null
+++ b/modules/translation/deepl_api.py
@@ -0,0 +1,226 @@
+import requests
+import time
+import os
+from datetime import datetime
+import gradio as gr
+
+from modules.utils.paths import TRANSLATION_OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH
+from modules.utils.subtitle_manager import *
+from modules.utils.files_manager import load_yaml, save_yaml
+
+"""
+This is written with reference to the DeepL API documentation.
+For more information about the DeepL API, see: https://www.deepl.com/docs-api/documents
+"""
+
+DEEPL_AVAILABLE_TARGET_LANGS = {
+ 'Bulgarian': 'BG',
+ 'Czech': 'CS',
+ 'Danish': 'DA',
+ 'German': 'DE',
+ 'Greek': 'EL',
+ 'English': 'EN',
+ 'English (British)': 'EN-GB',
+ 'English (American)': 'EN-US',
+ 'Spanish': 'ES',
+ 'Estonian': 'ET',
+ 'Finnish': 'FI',
+ 'French': 'FR',
+ 'Hungarian': 'HU',
+ 'Indonesian': 'ID',
+ 'Italian': 'IT',
+ 'Japanese': 'JA',
+ 'Korean': 'KO',
+ 'Lithuanian': 'LT',
+ 'Latvian': 'LV',
+    'Norwegian (Bokmål)': 'NB',
+ 'Dutch': 'NL',
+ 'Polish': 'PL',
+ 'Portuguese': 'PT',
+ 'Portuguese (Brazilian)': 'PT-BR',
+ 'Portuguese (all Portuguese varieties excluding Brazilian Portuguese)': 'PT-PT',
+ 'Romanian': 'RO',
+ 'Russian': 'RU',
+ 'Slovak': 'SK',
+ 'Slovenian': 'SL',
+ 'Swedish': 'SV',
+ 'Turkish': 'TR',
+ 'Ukrainian': 'UK',
+ 'Chinese (simplified)': 'ZH'
+}
+
+DEEPL_AVAILABLE_SOURCE_LANGS = {
+ 'Automatic Detection': None,
+ 'Bulgarian': 'BG',
+ 'Czech': 'CS',
+ 'Danish': 'DA',
+ 'German': 'DE',
+ 'Greek': 'EL',
+ 'English': 'EN',
+ 'Spanish': 'ES',
+ 'Estonian': 'ET',
+ 'Finnish': 'FI',
+ 'French': 'FR',
+ 'Hungarian': 'HU',
+ 'Indonesian': 'ID',
+ 'Italian': 'IT',
+ 'Japanese': 'JA',
+ 'Korean': 'KO',
+ 'Lithuanian': 'LT',
+ 'Latvian': 'LV',
+    'Norwegian (Bokmål)': 'NB',
+ 'Dutch': 'NL',
+ 'Polish': 'PL',
+ 'Portuguese (all Portuguese varieties mixed)': 'PT',
+ 'Romanian': 'RO',
+ 'Russian': 'RU',
+ 'Slovak': 'SK',
+ 'Slovenian': 'SL',
+ 'Swedish': 'SV',
+ 'Turkish': 'TR',
+ 'Ukrainian': 'UK',
+ 'Chinese': 'ZH'
+}
+
+
+class DeepLAPI:
+ def __init__(self,
+ output_dir: str = TRANSLATION_OUTPUT_DIR
+ ):
+ self.api_interval = 1
+ self.max_text_batch_size = 50
+ self.available_target_langs = DEEPL_AVAILABLE_TARGET_LANGS
+ self.available_source_langs = DEEPL_AVAILABLE_SOURCE_LANGS
+ self.output_dir = output_dir
+
+ def translate_deepl(self,
+ auth_key: str,
+ fileobjs: list,
+ source_lang: str,
+ target_lang: str,
+ is_pro: bool = False,
+ add_timestamp: bool = True,
+ progress=gr.Progress()) -> list:
+ """
+        Translate subtitle files using the DeepL API
+
+        Parameters
+        ----------
+        auth_key: str
+            API Key for DeepL from gr.Textbox()
+        fileobjs: list
+            List of subtitle files to translate from gr.Files()
+        source_lang: str
+            Source language of the files to translate from gr.Dropdown()
+        target_lang: str
+            Target language of the files to translate from gr.Dropdown()
+        is_pro: bool
+            Whether the API key belongs to a DeepL Pro plan, from gr.Checkbox().
+ add_timestamp: bool
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
+ progress: gr.Progress
+ Indicator to show progress directly in gradio.
+
+ Returns
+ ----------
+ A List of
+ String to return to gr.Textbox()
+ Files to return to gr.Files()
+ """
+ if fileobjs and isinstance(fileobjs[0], gr.utils.NamedString):
+ fileobjs = [fileobj.name for fileobj in fileobjs]
+
+ self.cache_parameters(
+ api_key=auth_key,
+ is_pro=is_pro,
+ source_lang=source_lang,
+ target_lang=target_lang,
+ add_timestamp=add_timestamp
+ )
+
+ files_info = {}
+ for fileobj in fileobjs:
+ file_path = fileobj
+ file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
+
+ if file_ext == ".srt":
+ parsed_dicts = parse_srt(file_path=file_path)
+
+ elif file_ext == ".vtt":
+ parsed_dicts = parse_vtt(file_path=file_path)
+
+ batch_size = self.max_text_batch_size
+ for batch_start in range(0, len(parsed_dicts), batch_size):
+ batch_end = min(batch_start + batch_size, len(parsed_dicts))
+ sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
+ translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
+ target_lang, is_pro)
+ for i, translated_text in enumerate(translated_texts):
+ parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
+ progress(batch_end / len(parsed_dicts), desc="Translating..")
+
+ if file_ext == ".srt":
+ subtitle = get_serialized_srt(parsed_dicts)
+ elif file_ext == ".vtt":
+ subtitle = get_serialized_vtt(parsed_dicts)
+
+ if add_timestamp:
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
+ file_name += f"-{timestamp}"
+
+ output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
+ write_file(subtitle, output_path)
+
+ files_info[file_name] = {"subtitle": subtitle, "path": output_path}
+
+ total_result = ''
+ for file_name, info in files_info.items():
+ total_result += '------------------------------------\n'
+ total_result += f'{file_name}\n\n'
+ total_result += f'{info["subtitle"]}'
+ gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"
+
+        output_file_paths = [item["path"] for item in files_info.values()]
+ return [gr_str, output_file_paths]
+
+ def request_deepl_translate(self,
+ auth_key: str,
+ text: list,
+ source_lang: str,
+ target_lang: str,
+ is_pro: bool = False):
+ """Request API response to DeepL server"""
+ if source_lang not in list(DEEPL_AVAILABLE_SOURCE_LANGS.keys()):
+ raise ValueError(f"Source language {source_lang} is not supported."
+ f"Use one of {list(DEEPL_AVAILABLE_SOURCE_LANGS.keys())}")
+ if target_lang not in list(DEEPL_AVAILABLE_TARGET_LANGS.keys()):
+ raise ValueError(f"Target language {target_lang} is not supported."
+ f"Use one of {list(DEEPL_AVAILABLE_TARGET_LANGS.keys())}")
+
+ url = 'https://api.deepl.com/v2/translate' if is_pro else 'https://api-free.deepl.com/v2/translate'
+ headers = {
+ 'Authorization': f'DeepL-Auth-Key {auth_key}'
+ }
+ data = {
+ 'text': text,
+ 'source_lang': DEEPL_AVAILABLE_SOURCE_LANGS[source_lang],
+ 'target_lang': DEEPL_AVAILABLE_TARGET_LANGS[target_lang]
+ }
+ response = requests.post(url, headers=headers, data=data).json()
+ time.sleep(self.api_interval)
+ return response["translations"]
+
+ @staticmethod
+ def cache_parameters(api_key: str,
+ is_pro: bool,
+ source_lang: str,
+ target_lang: str,
+ add_timestamp: bool):
+ cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
+ cached_params["translation"]["deepl"] = {
+ "api_key": api_key,
+ "is_pro": is_pro,
+ "source_lang": source_lang,
+ "target_lang": target_lang
+ }
+ cached_params["translation"]["add_timestamp"] = add_timestamp
+ save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
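+
+
+# Illustrative usage sketch (commented out). The API key and file path are
+# placeholders; language names must be keys of DEEPL_AVAILABLE_SOURCE_LANGS /
+# DEEPL_AVAILABLE_TARGET_LANGS, and requests are sent in batches of
+# max_text_batch_size (50) with api_interval seconds between them.
+#
+#     deepl = DeepLAPI()
+#     message, output_paths = deepl.translate_deepl(
+#         auth_key="your-deepl-api-key",
+#         fileobjs=["outputs/sample.srt"],
+#         source_lang="Automatic Detection",
+#         target_lang="Korean",
+#         is_pro=False,
+#     )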
diff --git a/modules/translation/nllb_inference.py b/modules/translation/nllb_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..7987bb61e5702e3e66ec9f9c7095d6980e2fbcd3
--- /dev/null
+++ b/modules/translation/nllb_inference.py
@@ -0,0 +1,287 @@
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+import gradio as gr
+import os
+
+from modules.utils.paths import TRANSLATION_OUTPUT_DIR, NLLB_MODELS_DIR
+from modules.translation.translation_base import TranslationBase
+
+
+class NLLBInference(TranslationBase):
+ def __init__(self,
+ model_dir: str = NLLB_MODELS_DIR,
+ output_dir: str = TRANSLATION_OUTPUT_DIR
+ ):
+ super().__init__(
+ model_dir=model_dir,
+ output_dir=output_dir
+ )
+ self.tokenizer = None
+ self.available_models = ["facebook/nllb-200-3.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-distilled-600M"]
+ self.available_source_langs = list(NLLB_AVAILABLE_LANGS.keys())
+ self.available_target_langs = list(NLLB_AVAILABLE_LANGS.keys())
+ self.pipeline = None
+
+ def translate(self,
+ text: str,
+ max_length: int
+ ):
+ result = self.pipeline(
+ text,
+ max_length=max_length
+ )
+ return result[0]['translation_text']
+
+ def update_model(self,
+ model_size: str,
+ src_lang: str,
+ tgt_lang: str,
+ progress: gr.Progress = gr.Progress()
+ ):
+ def validate_language(lang: str) -> str:
+ if lang in NLLB_AVAILABLE_LANGS:
+ return NLLB_AVAILABLE_LANGS[lang]
+ elif lang not in NLLB_AVAILABLE_LANGS.values():
+ raise ValueError(
+ f"Language '{lang}' is not supported. Use one of: {list(NLLB_AVAILABLE_LANGS.keys())}")
+ return lang
+
+ src_lang = validate_language(src_lang)
+ tgt_lang = validate_language(tgt_lang)
+
+ if model_size != self.current_model_size or self.model is None:
+ print("\nInitializing NLLB Model..\n")
+ progress(0, desc="Initializing NLLB Model..")
+ self.current_model_size = model_size
+ local_files_only = self.is_model_exists(self.current_model_size)
+ self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_size,
+ cache_dir=self.model_dir,
+ local_files_only=local_files_only)
+ self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_size,
+ cache_dir=os.path.join(self.model_dir, "tokenizers"),
+ local_files_only=local_files_only)
+
+ self.pipeline = pipeline("translation",
+ model=self.model,
+ tokenizer=self.tokenizer,
+ src_lang=src_lang,
+ tgt_lang=tgt_lang,
+ device=self.device)
+
+ def is_model_exists(self,
+ model_size: str):
+ """Check if model exists or not (Only facebook model)"""
+ prefix = "models--facebook--"
+ _id, model_size_name = model_size.split("/")
+ model_dir_name = prefix + model_size_name
+ model_dir_path = os.path.join(self.model_dir, model_dir_name)
+ if os.path.exists(model_dir_path) and os.listdir(model_dir_path):
+ return True
+ return False
+
+
+NLLB_AVAILABLE_LANGS = {
+ "Acehnese (Arabic script)": "ace_Arab",
+ "Acehnese (Latin script)": "ace_Latn",
+ "Mesopotamian Arabic": "acm_Arab",
+ "Taβizzi-Adeni Arabic": "acq_Arab",
+ "Tunisian Arabic": "aeb_Arab",
+ "Afrikaans": "afr_Latn",
+ "South Levantine Arabic": "ajp_Arab",
+ "Akan": "aka_Latn",
+ "Amharic": "amh_Ethi",
+ "North Levantine Arabic": "apc_Arab",
+ "Modern Standard Arabic": "arb_Arab",
+ "Modern Standard Arabic (Romanized)": "arb_Latn",
+ "Najdi Arabic": "ars_Arab",
+ "Moroccan Arabic": "ary_Arab",
+ "Egyptian Arabic": "arz_Arab",
+ "Assamese": "asm_Beng",
+ "Asturian": "ast_Latn",
+ "Awadhi": "awa_Deva",
+ "Central Aymara": "ayr_Latn",
+ "South Azerbaijani": "azb_Arab",
+ "North Azerbaijani": "azj_Latn",
+ "Bashkir": "bak_Cyrl",
+ "Bambara": "bam_Latn",
+ "Balinese": "ban_Latn",
+ "Belarusian": "bel_Cyrl",
+ "Bemba": "bem_Latn",
+ "Bengali": "ben_Beng",
+ "Bhojpuri": "bho_Deva",
+ "Banjar (Arabic script)": "bjn_Arab",
+ "Banjar (Latin script)": "bjn_Latn",
+ "Standard Tibetan": "bod_Tibt",
+ "Bosnian": "bos_Latn",
+ "Buginese": "bug_Latn",
+ "Bulgarian": "bul_Cyrl",
+ "Catalan": "cat_Latn",
+ "Cebuano": "ceb_Latn",
+ "Czech": "ces_Latn",
+ "Chokwe": "cjk_Latn",
+ "Central Kurdish": "ckb_Arab",
+ "Crimean Tatar": "crh_Latn",
+ "Welsh": "cym_Latn",
+ "Danish": "dan_Latn",
+ "German": "deu_Latn",
+ "Southwestern Dinka": "dik_Latn",
+ "Dyula": "dyu_Latn",
+ "Dzongkha": "dzo_Tibt",
+ "Greek": "ell_Grek",
+ "English": "eng_Latn",
+ "Esperanto": "epo_Latn",
+ "Estonian": "est_Latn",
+ "Basque": "eus_Latn",
+ "Ewe": "ewe_Latn",
+ "Faroese": "fao_Latn",
+ "Fijian": "fij_Latn",
+ "Finnish": "fin_Latn",
+ "Fon": "fon_Latn",
+ "French": "fra_Latn",
+ "Friulian": "fur_Latn",
+ "Nigerian Fulfulde": "fuv_Latn",
+ "Scottish Gaelic": "gla_Latn",
+ "Irish": "gle_Latn",
+ "Galician": "glg_Latn",
+ "Guarani": "grn_Latn",
+ "Gujarati": "guj_Gujr",
+ "Haitian Creole": "hat_Latn",
+ "Hausa": "hau_Latn",
+ "Hebrew": "heb_Hebr",
+ "Hindi": "hin_Deva",
+ "Chhattisgarhi": "hne_Deva",
+ "Croatian": "hrv_Latn",
+ "Hungarian": "hun_Latn",
+ "Armenian": "hye_Armn",
+ "Igbo": "ibo_Latn",
+ "Ilocano": "ilo_Latn",
+ "Indonesian": "ind_Latn",
+ "Icelandic": "isl_Latn",
+ "Italian": "ita_Latn",
+ "Javanese": "jav_Latn",
+ "Japanese": "jpn_Jpan",
+ "Kabyle": "kab_Latn",
+ "Jingpho": "kac_Latn",
+ "Kamba": "kam_Latn",
+ "Kannada": "kan_Knda",
+ "Kashmiri (Arabic script)": "kas_Arab",
+ "Kashmiri (Devanagari script)": "kas_Deva",
+ "Georgian": "kat_Geor",
+ "Central Kanuri (Arabic script)": "knc_Arab",
+ "Central Kanuri (Latin script)": "knc_Latn",
+ "Kazakh": "kaz_Cyrl",
+ "Kabiyè": "kbp_Latn",
+ "Kabuverdianu": "kea_Latn",
+ "Khmer": "khm_Khmr",
+ "Kikuyu": "kik_Latn",
+ "Kinyarwanda": "kin_Latn",
+ "Kyrgyz": "kir_Cyrl",
+ "Kimbundu": "kmb_Latn",
+ "Northern Kurdish": "kmr_Latn",
+ "Kikongo": "kon_Latn",
+ "Korean": "kor_Hang",
+ "Lao": "lao_Laoo",
+ "Ligurian": "lij_Latn",
+ "Limburgish": "lim_Latn",
+ "Lingala": "lin_Latn",
+ "Lithuanian": "lit_Latn",
+ "Lombard": "lmo_Latn",
+ "Latgalian": "ltg_Latn",
+ "Luxembourgish": "ltz_Latn",
+ "Luba-Kasai": "lua_Latn",
+ "Ganda": "lug_Latn",
+ "Luo": "luo_Latn",
+ "Mizo": "lus_Latn",
+ "Standard Latvian": "lvs_Latn",
+ "Magahi": "mag_Deva",
+ "Maithili": "mai_Deva",
+ "Malayalam": "mal_Mlym",
+ "Marathi": "mar_Deva",
+ "Minangkabau (Arabic script)": "min_Arab",
+ "Minangkabau (Latin script)": "min_Latn",
+ "Macedonian": "mkd_Cyrl",
+ "Plateau Malagasy": "plt_Latn",
+ "Maltese": "mlt_Latn",
+ "Meitei (Bengali script)": "mni_Beng",
+ "Halh Mongolian": "khk_Cyrl",
+ "Mossi": "mos_Latn",
+ "Maori": "mri_Latn",
+ "Burmese": "mya_Mymr",
+ "Dutch": "nld_Latn",
+ "Norwegian Nynorsk": "nno_Latn",
+ "Norwegian BokmΓ₯l": "nob_Latn",
+ "Nepali": "npi_Deva",
+ "Northern Sotho": "nso_Latn",
+ "Nuer": "nus_Latn",
+ "Nyanja": "nya_Latn",
+ "Occitan": "oci_Latn",
+ "West Central Oromo": "gaz_Latn",
+ "Odia": "ory_Orya",
+ "Pangasinan": "pag_Latn",
+ "Eastern Panjabi": "pan_Guru",
+ "Papiamento": "pap_Latn",
+ "Western Persian": "pes_Arab",
+ "Polish": "pol_Latn",
+ "Portuguese": "por_Latn",
+ "Dari": "prs_Arab",
+ "Southern Pashto": "pbt_Arab",
+ "Ayacucho Quechua": "quy_Latn",
+ "Romanian": "ron_Latn",
+ "Rundi": "run_Latn",
+ "Russian": "rus_Cyrl",
+ "Sango": "sag_Latn",
+ "Sanskrit": "san_Deva",
+ "Santali": "sat_Olck",
+ "Sicilian": "scn_Latn",
+ "Shan": "shn_Mymr",
+ "Sinhala": "sin_Sinh",
+ "Slovak": "slk_Latn",
+ "Slovenian": "slv_Latn",
+ "Samoan": "smo_Latn",
+ "Shona": "sna_Latn",
+ "Sindhi": "snd_Arab",
+ "Somali": "som_Latn",
+ "Southern Sotho": "sot_Latn",
+ "Spanish": "spa_Latn",
+ "Tosk Albanian": "als_Latn",
+ "Sardinian": "srd_Latn",
+ "Serbian": "srp_Cyrl",
+ "Swati": "ssw_Latn",
+ "Sundanese": "sun_Latn",
+ "Swedish": "swe_Latn",
+ "Swahili": "swh_Latn",
+ "Silesian": "szl_Latn",
+ "Tamil": "tam_Taml",
+ "Tatar": "tat_Cyrl",
+ "Telugu": "tel_Telu",
+ "Tajik": "tgk_Cyrl",
+ "Tagalog": "tgl_Latn",
+ "Thai": "tha_Thai",
+ "Tigrinya": "tir_Ethi",
+ "Tamasheq (Latin script)": "taq_Latn",
+ "Tamasheq (Tifinagh script)": "taq_Tfng",
+ "Tok Pisin": "tpi_Latn",
+ "Tswana": "tsn_Latn",
+ "Tsonga": "tso_Latn",
+ "Turkmen": "tuk_Latn",
+ "Tumbuka": "tum_Latn",
+ "Turkish": "tur_Latn",
+ "Twi": "twi_Latn",
+ "Central Atlas Tamazight": "tzm_Tfng",
+ "Uyghur": "uig_Arab",
+ "Ukrainian": "ukr_Cyrl",
+ "Umbundu": "umb_Latn",
+ "Urdu": "urd_Arab",
+ "Northern Uzbek": "uzn_Latn",
+ "Venetian": "vec_Latn",
+ "Vietnamese": "vie_Latn",
+ "Waray": "war_Latn",
+ "Wolof": "wol_Latn",
+ "Xhosa": "xho_Latn",
+ "Eastern Yiddish": "ydd_Hebr",
+ "Yoruba": "yor_Latn",
+ "Yue Chinese": "yue_Hant",
+ "Chinese (Simplified)": "zho_Hans",
+ "Chinese (Traditional)": "zho_Hant",
+ "Standard Malay": "zsm_Latn",
+ "Zulu": "zul_Latn",
+}
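+
+
+# Illustrative usage sketch (commented out). The model and language names are
+# examples; source/target must be keys (or codes) from NLLB_AVAILABLE_LANGS,
+# and update_model() must be called before translate().
+#
+#     nllb = NLLBInference()
+#     nllb.update_model(
+#         model_size="facebook/nllb-200-distilled-600M",
+#         src_lang="English",
+#         tgt_lang="Korean",
+#     )
+#     print(nllb.translate("Hello, world!", max_length=200))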
diff --git a/modules/translation/translation_base.py b/modules/translation/translation_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..2551f0e9e1c82d4f71fd5d3a848b56c63eea4baf
--- /dev/null
+++ b/modules/translation/translation_base.py
@@ -0,0 +1,177 @@
+import os
+import torch
+import gradio as gr
+from abc import ABC, abstractmethod
+from typing import List
+from datetime import datetime
+
+from modules.whisper.whisper_parameter import *
+from modules.utils.subtitle_manager import *
+from modules.utils.files_manager import load_yaml, save_yaml
+from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH, NLLB_MODELS_DIR, TRANSLATION_OUTPUT_DIR
+
+
+class TranslationBase(ABC):
+ def __init__(self,
+ model_dir: str = NLLB_MODELS_DIR,
+ output_dir: str = TRANSLATION_OUTPUT_DIR
+ ):
+ super().__init__()
+ self.model = None
+ self.model_dir = model_dir
+ self.output_dir = output_dir
+ os.makedirs(self.model_dir, exist_ok=True)
+ os.makedirs(self.output_dir, exist_ok=True)
+ self.current_model_size = None
+ self.device = self.get_device()
+
+ @abstractmethod
+ def translate(self,
+ text: str,
+ max_length: int
+ ):
+ pass
+
+ @abstractmethod
+ def update_model(self,
+ model_size: str,
+ src_lang: str,
+ tgt_lang: str,
+ progress: gr.Progress = gr.Progress()
+ ):
+ pass
+
+ def translate_file(self,
+ fileobjs: list,
+ model_size: str,
+ src_lang: str,
+ tgt_lang: str,
+ max_length: int = 200,
+ add_timestamp: bool = True,
+ progress=gr.Progress()) -> list:
+ """
+ Translate subtitle file from source language to target language
+
+ Parameters
+ ----------
+ fileobjs: list
+ List of files to transcribe from gr.Files()
+ model_size: str
+            Translation model size from gr.Dropdown()
+ src_lang: str
+ Source language of the file to translate from gr.Dropdown()
+ tgt_lang: str
+ Target language of the file to translate from gr.Dropdown()
+ max_length: int
+ Max length per line to translate
+ add_timestamp: bool
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
+ progress: gr.Progress
+ Indicator to show progress directly in gradio.
+
+ Returns
+ ----------
+ A List of
+ String to return to gr.Textbox()
+ Files to return to gr.Files()
+ """
+ try:
+ if fileobjs and isinstance(fileobjs[0], gr.utils.NamedString):
+ fileobjs = [file.name for file in fileobjs]
+
+ self.cache_parameters(model_size=model_size,
+ src_lang=src_lang,
+ tgt_lang=tgt_lang,
+ max_length=max_length,
+ add_timestamp=add_timestamp)
+
+ self.update_model(model_size=model_size,
+ src_lang=src_lang,
+ tgt_lang=tgt_lang,
+ progress=progress)
+
+ files_info = {}
+ for fileobj in fileobjs:
+ file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
+ if file_ext == ".srt":
+ parsed_dicts = parse_srt(file_path=fileobj)
+ total_progress = len(parsed_dicts)
+ for index, dic in enumerate(parsed_dicts):
+ progress(index / total_progress, desc="Translating..")
+ translated_text = self.translate(dic["sentence"], max_length=max_length)
+ dic["sentence"] = translated_text
+ subtitle = get_serialized_srt(parsed_dicts)
+
+ elif file_ext == ".vtt":
+ parsed_dicts = parse_vtt(file_path=fileobj)
+ total_progress = len(parsed_dicts)
+ for index, dic in enumerate(parsed_dicts):
+ progress(index / total_progress, desc="Translating..")
+ translated_text = self.translate(dic["sentence"], max_length=max_length)
+ dic["sentence"] = translated_text
+                    subtitle = get_serialized_vtt(parsed_dicts)
+                else:
+                    # Skip unsupported formats so subtitle is never referenced while undefined
+                    continue
+
+ if add_timestamp:
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
+ file_name += f"-{timestamp}"
+
+ output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
+ write_file(subtitle, output_path)
+
+ files_info[file_name] = {"subtitle": subtitle, "path": output_path}
+
+ total_result = ''
+ for file_name, info in files_info.items():
+ total_result += '------------------------------------\n'
+ total_result += f'{file_name}\n\n'
+ total_result += f'{info["subtitle"]}'
+ gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"
+
+            output_file_paths = [item["path"] for item in files_info.values()]
+ return [gr_str, output_file_paths]
+
+ except Exception as e:
+ print(f"Error: {str(e)}")
+ finally:
+ self.release_cuda_memory()
+
+ @staticmethod
+ def get_device():
+ if torch.cuda.is_available():
+ return "cuda"
+ elif torch.backends.mps.is_available():
+ return "mps"
+ else:
+ return "cpu"
+
+ @staticmethod
+ def release_cuda_memory():
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ torch.cuda.reset_max_memory_allocated()
+
+ @staticmethod
+ def remove_input_files(file_paths: List[str]):
+ if not file_paths:
+ return
+
+ for file_path in file_paths:
+ if file_path and os.path.exists(file_path):
+ os.remove(file_path)
+
+ @staticmethod
+ def cache_parameters(model_size: str,
+ src_lang: str,
+ tgt_lang: str,
+ max_length: int,
+ add_timestamp: bool):
+ cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
+ cached_params["translation"]["nllb"] = {
+ "model_size": model_size,
+ "source_lang": src_lang,
+ "target_lang": tgt_lang,
+ "max_length": max_length,
+ }
+ cached_params["translation"]["add_timestamp"] = add_timestamp
+ save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
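+
+
+# Illustrative sketch of how a concrete backend plugs in (commented out).
+# NLLBInference in modules/translation/nllb_inference.py is the real example;
+# a subclass only needs to provide translate() and update_model(). The class,
+# model, and language names below are placeholders.
+#
+#     class MyTranslator(TranslationBase):
+#         def update_model(self, model_size, src_lang, tgt_lang, progress=gr.Progress()):
+#             ...  # load the model/tokenizer for the given language pair
+#
+#         def translate(self, text, max_length):
+#             ...  # return the translated string
+#
+#     MyTranslator().translate_file(["outputs/sample.srt"], "my-model", "English", "Korean")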
diff --git a/modules/ui/__init__.py b/modules/ui/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/modules/ui/htmls.py b/modules/ui/htmls.py
new file mode 100644
index 0000000000000000000000000000000000000000..241705344a869259e3873e7ecbaef9a1ab883442
--- /dev/null
+++ b/modules/ui/htmls.py
@@ -0,0 +1,97 @@
+CSS = """
+.bmc-button {
+ padding: 2px 5px;
+ border-radius: 5px;
+ background-color: #FF813F;
+ color: white;
+ box-shadow: 0px 1px 2px rgba(0, 0, 0, 0.3);
+ text-decoration: none;
+ display: inline-block;
+ font-size: 20px;
+ margin: 2px;
+ cursor: pointer;
+ -webkit-transition: background-color 0.3s ease;
+ -ms-transition: background-color 0.3s ease;
+ transition: background-color 0.3s ease;
+}
+.bmc-button:hover,
+.bmc-button:active,
+.bmc-button:focus {
+ background-color: #FF5633;
+}
+.markdown {
+ margin-bottom: 0;
+ padding-bottom: 0;
+}
+.tabs {
+ margin-top: 0;
+ padding-top: 0;
+}
+
+#md_project a {
+ color: black;
+ text-decoration: none;
+}
+#md_project a:hover {
+ text-decoration: underline;
+}
+"""
+
+MARKDOWN = """
+# Automatic speech recognition
+"""
+
+
+NLLB_VRAM_TABLE = """
+<table>
+  <thead>
+    <tr>
+      <th>Model name</th>
+      <th>Required VRAM</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>nllb-200-3.3B</td>
+      <td>~16GB</td>
+    </tr>
+    <tr>
+      <td>nllb-200-1.3B</td>
+      <td>~8GB</td>
+    </tr>
+    <tr>
+      <td>nllb-200-distilled-600M</td>
+      <td>~4GB</td>
+    </tr>
+  </tbody>
+</table>
+<p>Note: Be mindful of your VRAM! The table above provides an approximate VRAM usage for each model.</p>