{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "5a3ddcc8", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\catto\\anaconda3\\envs\\test_env\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "[nltk_data] Downloading package punkt_tab to\n", "[nltk_data] C:\\Users\\catto\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt_tab is already up-to-date!\n" ] } ], "source": [ "from inference import StyleTTS2\n", "\n", "import librosa\n", "import IPython.display as ipd\n", "import torch.cuda\n", "\n", "device = 'cuda' if torch.cuda.is_available() else 'cpu'" ] }, { "cell_type": "markdown", "id": "7b9cecbe", "metadata": {}, "source": [ "### Load models" ] }, { "cell_type": "code", "execution_count": null, "id": "e7b9c01d", "metadata": {}, "outputs": [], "source": [ "config_path = \"Models/config.yaml\"\n", "models_path = \"Models/model.pth\"" ] }, { "cell_type": "markdown", "id": "b803110e", "metadata": {}, "source": [ "### Synthesize speech\n", "\n", "Little Note:\n", "\n", "- You don't need to add language tokens everywhere, espeak can detect and handle them automatically most of the time.\n", "\n", "- Reference audio has a huge impact on the result. It is best to select audio around 10s long and consistent in both tone and speed." 
] }, { "cell_type": "code", "execution_count": null, "id": "78396f70", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "./reference_audio/vn_3.wav\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "./reference_audio/vn_4.wav\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "speakers = {\n", " \"id_1\": {\n", " \"path\": \"./reference_audio/vn_3.wav\", #Ref audio path\n", " \"lang\": \"vi\", #Default language\n", " \"speed\": 1.0, #Speaking speed\n", " },\n", " \"id_2\": {\n", " \"path\": \"./reference_audio/vn_4.wav\",\n", " \"lang\": \"vi\",\n", " \"speed\": 1.0,\n", " },\n", "}\n", "# Preview each speaker's reference audio: load at 24 kHz, trim silence, cap at 20 s.\n", "for speaker_id in speakers:  # avoid shadowing the builtin `id`\n", "    max_samples = 24000 * 20  # max 20 seconds of reference audio at 24 kHz\n", "    print(speakers[speaker_id]['path'])\n", "    # librosa.load(..., sr=24000) already resamples, so sr is 24000 here; the\n", "    # resample below is a defensive no-op kept for clarity.\n", "    wave, sr = librosa.load(speakers[speaker_id]['path'], sr=24000)\n", "    audio, _ = librosa.effects.trim(wave, top_db=30)  # trim leading/trailing silence; interval unused\n", "    # librosa >= 0.10 requires keyword args here; positional (audio, sr, 24000) raises TypeError\n", "    if sr != 24000: audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)\n", "    if len(audio) > max_samples: audio = audio[:max_samples]\n", "    display(ipd.Audio(audio, rate=24000, normalize=True))" ] }, { "cell_type": "code", "execution_count": 4, "id": "395959f1", "metadata": {}, "outputs": [], "source": [ "text = '''\n", "[id_1][en-us]{What's up hommie}, dạo này đang học tí [en-us]{English}. Thấy bảo [en-us]{Building a strong vocabulary} khá là quan trọng. [en-us]{Bro} thấy sao?\n", "\n", "[id_2][en-us]{That's right}, tôi thấy [en-us]{bro} nên bắt đầu với việc đọc sách và báo tiếng Anh để quen với cách sử dụng từ, cũng như tập trung vào [en-us]{listening exercises} để cải thiện khả năng nghe.\n", "\n", "[id_1]Nghe nói rằng [en-us]{speaking practice} là bước quan trọng để giao tiếp tự tin. 
[en-us]{Bro} có muốn luyện tập với tôi không?\n", "\n", "[id_2][en-us]{For sure my hommie} à, cứ cho mình cái hẹn nhé.\n", "'''" ] }, { "cell_type": "code", "execution_count": null, "id": "16194211", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\catto\\anaconda3\\envs\\test_env\\Lib\\site-packages\\torch\\nn\\utils\\weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n", " WeightNorm.apply(module, name, dim)\n", "c:\\Users\\catto\\anaconda3\\envs\\test_env\\Lib\\site-packages\\torch\\nn\\modules\\rnn.py:123: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "decoder : 54289492\n", "predictor : 16194612\n", "text_encoder : 5612032\n", "style_encoder : 13845440\n", "\n", "Total : 89941576\n" ] } ], "source": [ "model = StyleTTS2(config_path, models_path).eval().to(device)\n", "default_speaker = \"[id_1]\" #STR Default speaker used when no speaker_id is provided in the input\n", "avg_style = True #BOOL Split the ref audio and calculate the avg styles.\n", "stabilize = True #BOOL Stabilize speaking speed.\n", "denoise = 0.6 #FLOAT Adjust the strength of the denoiser. 
Value range is [0, 1]\n", "n_merge = 18 #INT Avoid short sentences by merging when a sentence has fewer than n words" ] }, { "cell_type": "code", "execution_count": 6, "id": "d98bdb71", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Computing the style for: ./reference_audio/vn_3.wav\n", "Computing the style for: ./reference_audio/vn_4.wav\n", "Generating Audio...\n", "Synthesized:\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "with torch.no_grad():\n", " styles = model.get_styles(speakers, denoise, avg_style)\n", " r = model.generate(text, styles, stabilize, n_merge, default_speaker)\n", "\n", "print('Synthesized:')\n", "display(ipd.Audio(r, rate=24000, normalize=True))" ] } ], "metadata": { "kernelspec": { "display_name": "test_env", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.11" } }, "nbformat": 4, "nbformat_minor": 5 }