{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "5a3ddcc8",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\catto\\anaconda3\\envs\\test_env\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"[nltk_data] Downloading package punkt_tab to\n",
"[nltk_data] C:\\Users\\catto\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt_tab is already up-to-date!\n"
]
}
],
"source": [
"from inference import StyleTTS2\n",
"\n",
"import librosa\n",
"import IPython.display as ipd\n",
"import torch\n",
"\n",
"# Run inference on GPU when available; the model is moved to this device later.\n",
"device = 'cuda' if torch.cuda.is_available() else 'cpu'"
]
},
{
"cell_type": "markdown",
"id": "7b9cecbe",
"metadata": {},
"source": [
"### Load models"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7b9c01d",
"metadata": {},
"outputs": [],
"source": [
"# Model configuration and checkpoint locations.\n",
"config_path, models_path = \"Models/config.yaml\", \"Models/model.pth\""
]
},
{
"cell_type": "markdown",
"id": "b803110e",
"metadata": {},
"source": [
"### Synthesize speech\n",
"\n",
"Notes:\n",
"\n",
"- You don't need to add language tokens everywhere, espeak can detect and handle them automatically most of the time.\n",
"\n",
"- Reference audio has a huge impact on the result. It is best to select audio around 10s long and consistent in both tone and speed."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "78396f70",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"./reference_audio/vn_3.wav\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"./reference_audio/vn_4.wav\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"speakers = {\n",
"    \"id_1\": {\n",
"        \"path\": \"./reference_audio/vn_3.wav\",  # Reference audio path\n",
"        \"lang\": \"vi\",   # Default language\n",
"        \"speed\": 1.0,    # Speaking speed\n",
"    },\n",
"    \"id_2\": {\n",
"        \"path\": \"./reference_audio/vn_4.wav\",\n",
"        \"lang\": \"vi\",\n",
"        \"speed\": 1.0,\n",
"    },\n",
"}\n",
"\n",
"TARGET_SR = 24000             # Sample rate expected by the model\n",
"MAX_SAMPLES = TARGET_SR * 20  # Cap reference audio at 20 seconds\n",
"\n",
"# Preview each speaker's reference audio (trimmed and length-capped).\n",
"for speaker_id, speaker in speakers.items():\n",
"    print(speaker['path'])\n",
"    # librosa.load with sr=TARGET_SR already resamples, so no separate\n",
"    # resample step is needed afterwards.\n",
"    wave, sr = librosa.load(speaker['path'], sr=TARGET_SR)\n",
"    audio, _ = librosa.effects.trim(wave, top_db=30)\n",
"    if len(audio) > MAX_SAMPLES:\n",
"        audio = audio[:MAX_SAMPLES]\n",
"    display(ipd.Audio(audio, rate=TARGET_SR, normalize=True))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "395959f1",
"metadata": {},
"outputs": [],
"source": [
"text = '''\n",
"[id_1][en-us]{What's up hommie}, dạo này đang học tí [en-us]{English}. Thấy bảo [en-us]{Building a strong vocabulary} khá là quan trọng. [en-us]{Bro} thấy sao?\n",
"\n",
"[id_2][en-us]{That's right}, tôi thấy [en-us]{bro} nên bắt đầu với việc đọc sách và báo tiếng Anh để quen với cách sử dụng từ, cũng như tập trung vào [en-us]{listening exercises} để cải thiện khả năng nghe.\n",
"\n",
"[id_1]Nghe nói rằng [en-us]{speaking practice} là bước quan trọng để giao tiếp tự tin. [en-us]{Bro} có muốn luyện tập với tôi không?\n",
"\n",
"[id_2][en-us]{For sure my hommie} à, cứ cho mình cái hẹn nhé.\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16194211",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\catto\\anaconda3\\envs\\test_env\\Lib\\site-packages\\torch\\nn\\utils\\weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n",
" WeightNorm.apply(module, name, dim)\n",
"c:\\Users\\catto\\anaconda3\\envs\\test_env\\Lib\\site-packages\\torch\\nn\\modules\\rnn.py:123: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"decoder : 54289492\n",
"predictor : 16194612\n",
"text_encoder : 5612032\n",
"style_encoder : 13845440\n",
"\n",
"Total : 89941576\n"
]
}
],
"source": [
"model = StyleTTS2(config_path, models_path).eval().to(device)\n",
"default_speaker = \"[id_1]\" #STR Default speaker used when no speaker_id is provided in the input\n",
"avg_style = True #BOOL Split the ref audio and calculate the avg styles.\n",
"stabilize = True #BOOL Stabilize speaking speed.\n",
"denoise = 0.6 #FLOAT Adjust the strength of the denoiser. Value range is [0, 1]\n",
"n_merge = 18 #INT Avoid short sentences by merging when a sentence has fewer than n words"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d98bdb71",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Computing the style for: ./reference_audio/vn_3.wav\n",
"Computing the style for: ./reference_audio/vn_4.wav\n",
"Generating Audio...\n",
"Synthesized:\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Compute per-speaker style vectors from the reference audio, then synthesize\n",
"# the dialogue. Gradients are disabled since this is pure inference.\n",
"with torch.no_grad():\n",
"    speaker_styles = model.get_styles(speakers, denoise, avg_style)\n",
"    synthesized = model.generate(text, speaker_styles, stabilize, n_merge, default_speaker)\n",
"\n",
"print('Synthesized:')\n",
"display(ipd.Audio(synthesized, rate=24000, normalize=True))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "test_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}