dangtr0408 committed (verified)
Commit c88d036 · Parent: ce64491

init hugging face

.gitattributes CHANGED
@@ -1,35 +1,36 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ __pycache__/inference.cpython-311.pyc
2
+ __pycache__/models.cpython-311.pyc
3
+ Modules/__pycache__/__init__.cpython-311.pyc
4
+ Modules/__pycache__/hifigan.cpython-311.pyc
5
+ Modules/__pycache__/utils.cpython-311.pyc
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Aaron (Yinghao) Li
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
Models/config.yml ADDED
@@ -0,0 +1,71 @@
1
+ log_dir: "Models/Finetune_Extend"
2
+ save_freq: 1
3
+ log_interval: 5
4
+ device: "cuda"
5
+ epochs: 50
6
+ batch_size: 3
7
+ max_len: 210 # maximum number of frames
8
+ pretrained_model: "Models/Finetune_Extend/current_model.pth"
9
+ load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
10
+
11
+ data_params:
12
+ train_data: "../../Data_Speech/viVoice/train.txt"
13
+ val_data: "../../Data_Speech/combine/combine_val.txt"
14
+ root_path: "../../Data_Speech/"
15
+ min_length: 50 # keep sampling OOD texts until one of at least this length is obtained
16
+
17
+ preprocess_params:
18
+ sr: 24000
19
+ spect_params:
20
+ n_fft: 2048
21
+ win_length: 1200
22
+ hop_length: 300
23
+
24
+ model_params:
25
+ dim_in: 64
26
+ hidden_dim: 512
27
+ max_conv_dim: 512
28
+ n_layer: 3
29
+ n_mels: 80
30
+
31
+ n_token: 189 # number of phoneme tokens
32
+ max_dur: 50 # maximum duration of a single phoneme
33
+ style_dim: 128 # style vector size
34
+
35
+ dropout: 0.2
36
+
37
+ ASR_params:
38
+ input_dim: 80
39
+ hidden_dim: 256
40
+ n_token: 189 # number of phoneme tokens
41
+ n_layers: 6
42
+ token_embedding_dim: 512
43
+
44
+ JDC_params:
45
+ num_class: 1
46
+ seq_len: 192
47
+
48
+ # config for decoder
49
+ decoder:
50
+ type: 'hifigan' # either hifigan or istftnet
51
+ resblock_kernel_sizes: [3,7,11]
52
+ upsample_rates : [10,5,3,2]
53
+ upsample_initial_channel: 512
54
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
55
+ upsample_kernel_sizes: [20,10,6,4]
56
+
57
+ loss_params:
58
+ lambda_mel: 5. # mel reconstruction loss
59
+ lambda_gen: 1. # generator loss
60
+
61
+ lambda_mono: 1. # monotonic alignment loss (TMA)
62
+ lambda_s2s: 1. # sequence-to-sequence loss (TMA)
63
+
64
+ lambda_F0: 1. # F0 reconstruction loss
65
+ lambda_norm: 1. # norm reconstruction loss
66
+ lambda_dur: 1. # duration loss
67
+ lambda_ce: 20. # duration predictor probability output CE loss
68
+
69
+ optimizer_params:
70
+ lr: 0.0001 # general learning rate
71
+ ft_lr: 0.00001 # learning rate for acoustic modules
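A note on `max_len`: it is measured in mel frames, so its wall-clock duration follows from the `preprocess_params` above. A quick check using only the values already in this config:

```python
# Convert max_len (mel frames) into seconds using sr and hop_length from preprocess_params.
sr = 24000        # sampling rate
hop_length = 300  # samples advanced per mel frame
max_len = 210     # maximum training length in frames

seconds_per_frame = hop_length / sr    # 0.0125 s per frame
print(max_len * seconds_per_frame)     # 2.625 s of audio per training sample
```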
Models/del_training.ipynb ADDED
@@ -0,0 +1,62 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "2b6bb4be",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import torch"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": null,
17
+ "id": "dc802b47",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "models_path = \"./current_model_120k_vi.pth\"\n",
22
+ "name = \"./model.pth\"\n",
23
+ "params_whole = torch.load(models_path, map_location='cpu')\n",
24
+ "\n",
25
+ "for key in list(params_whole.keys()):\n",
26
+ " if key != 'net':\n",
27
+ " params_whole.pop(key)\n",
28
+ "\n",
29
+ "keep = ['decoder', 'predictor', 'text_encoder', 'style_encoder']\n",
30
+ "for module_name in list(params_whole['net'].keys()):\n",
31
+ " if module_name not in keep:\n",
32
+ " params_whole['net'].pop(module_name)\n",
33
+ "\n",
34
+ "torch.save(params_whole, name)\n",
35
+ "\n",
36
+ "\n",
37
+ "os.remove(models_path)"
38
+ ]
39
+ }
40
+ ],
41
+ "metadata": {
42
+ "kernelspec": {
43
+ "display_name": "base",
44
+ "language": "python",
45
+ "name": "python3"
46
+ },
47
+ "language_info": {
48
+ "codemirror_mode": {
49
+ "name": "ipython",
50
+ "version": 3
51
+ },
52
+ "file_extension": ".py",
53
+ "mimetype": "text/x-python",
54
+ "name": "python",
55
+ "nbconvert_exporter": "python",
56
+ "pygments_lexer": "ipython3",
57
+ "version": "3.11.7"
58
+ }
59
+ },
60
+ "nbformat": 4,
61
+ "nbformat_minor": 5
62
+ }
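The notebook above trims a full training checkpoint down to the `net` weights of the four modules needed for inference (`decoder`, `predictor`, `text_encoder`, `style_encoder`). A minimal sketch for inspecting such a trimmed checkpoint, e.g. the `Models/model.pth` added below (illustrative only):

```python
import torch

# A trimmed checkpoint should contain only the 'net' entry with the four inference modules.
ckpt = torch.load("Models/model.pth", map_location="cpu")
print(list(ckpt.keys()))         # expected: ['net']
print(list(ckpt["net"].keys()))  # expected: ['decoder', 'predictor', 'text_encoder', 'style_encoder']
```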
Models/model.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:997e420474c1be8d1f09a70689c444105d47574a7be65ec221d61c5c2caaf8c0
3
+ size 360061639
Modules/__init__.py ADDED
@@ -0,0 +1 @@
1
+
Modules/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (186 Bytes).
 
Modules/__pycache__/hifigan.cpython-311.pyc ADDED
Binary file (30.1 kB).
 
Modules/__pycache__/utils.cpython-311.pyc ADDED
Binary file (1.19 kB).
 
Modules/hifigan.py ADDED
@@ -0,0 +1,477 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.nn as nn
4
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
5
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
6
+ from .utils import init_weights, get_padding
7
+
8
+ import math
9
+ import random
10
+ import numpy as np
11
+
12
+ LRELU_SLOPE = 0.1
13
+
14
+ class AdaIN1d(nn.Module):
15
+ def __init__(self, style_dim, num_features):
16
+ super().__init__()
17
+ self.norm = nn.InstanceNorm1d(num_features, affine=False)
18
+ self.fc = nn.Linear(style_dim, num_features*2)
19
+
20
+ def forward(self, x, s):
21
+ h = self.fc(s)
22
+ h = h.view(h.size(0), h.size(1), 1)
23
+ gamma, beta = torch.chunk(h, chunks=2, dim=1)
24
+ return (1 + gamma) * self.norm(x) + beta
25
+
26
+ class AdaINResBlock1(torch.nn.Module):
27
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):
28
+ super(AdaINResBlock1, self).__init__()
29
+ self.convs1 = nn.ModuleList([
30
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
31
+ padding=get_padding(kernel_size, dilation[0]))),
32
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
33
+ padding=get_padding(kernel_size, dilation[1]))),
34
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
35
+ padding=get_padding(kernel_size, dilation[2])))
36
+ ])
37
+ self.convs1.apply(init_weights)
38
+
39
+ self.convs2 = nn.ModuleList([
40
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
41
+ padding=get_padding(kernel_size, 1))),
42
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
43
+ padding=get_padding(kernel_size, 1))),
44
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
45
+ padding=get_padding(kernel_size, 1)))
46
+ ])
47
+ self.convs2.apply(init_weights)
48
+
49
+ self.adain1 = nn.ModuleList([
50
+ AdaIN1d(style_dim, channels),
51
+ AdaIN1d(style_dim, channels),
52
+ AdaIN1d(style_dim, channels),
53
+ ])
54
+
55
+ self.adain2 = nn.ModuleList([
56
+ AdaIN1d(style_dim, channels),
57
+ AdaIN1d(style_dim, channels),
58
+ AdaIN1d(style_dim, channels),
59
+ ])
60
+
61
+ self.alpha1 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs1))])
62
+ self.alpha2 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs2))])
63
+
64
+
65
+ def forward(self, x, s):
66
+ for c1, c2, n1, n2, a1, a2 in zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2):
67
+ xt = n1(x, s)
68
+ xt = xt + (1 / a1) * (torch.sin(a1 * xt) ** 2) # Snake1D
69
+ xt = c1(xt)
70
+ xt = n2(xt, s)
71
+ xt = xt + (1 / a2) * (torch.sin(a2 * xt) ** 2) # Snake1D
72
+ xt = c2(xt)
73
+ x = xt + x
74
+ return x
75
+
76
+ def remove_weight_norm(self):
77
+ for l in self.convs1:
78
+ remove_weight_norm(l)
79
+ for l in self.convs2:
80
+ remove_weight_norm(l)
81
+
82
+ class SineGen(torch.nn.Module):
83
+ """ Definition of sine generator
84
+ SineGen(samp_rate, harmonic_num = 0,
85
+ sine_amp = 0.1, noise_std = 0.003,
86
+ voiced_threshold = 0,
87
+ flag_for_pulse=False)
88
+ samp_rate: sampling rate in Hz
89
+ harmonic_num: number of harmonic overtones (default 0)
90
+ sine_amp: amplitude of sine waveform (default 0.1)
91
+ noise_std: std of Gaussian noise (default 0.003)
92
+ voiced_threshold: F0 threshold for U/V classification (default 0)
93
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
94
+ Note: when flag_for_pulse is True, the first time step of a voiced
95
+ segment is always sin(np.pi) or cos(0)
96
+ """
97
+
98
+ def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
99
+ sine_amp=0.1, noise_std=0.003,
100
+ voiced_threshold=0,
101
+ flag_for_pulse=False):
102
+ super(SineGen, self).__init__()
103
+ self.sine_amp = sine_amp
104
+ self.noise_std = noise_std
105
+ self.harmonic_num = harmonic_num
106
+ self.dim = self.harmonic_num + 1
107
+ self.sampling_rate = samp_rate
108
+ self.voiced_threshold = voiced_threshold
109
+ self.flag_for_pulse = flag_for_pulse
110
+ self.upsample_scale = upsample_scale
111
+
112
+ def _f02uv(self, f0):
113
+ # generate uv signal
114
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
115
+ return uv
116
+
117
+ def _f02sine(self, f0_values):
118
+ """ f0_values: (batchsize, length, dim)
119
+ where dim indicates fundamental tone and overtones
120
+ """
121
+ # convert to F0 in rad. The integer part n can be ignored
122
+ # because 2 * np.pi * n doesn't affect phase
123
+ rad_values = (f0_values / self.sampling_rate) % 1
124
+
125
+ # initial phase noise (no noise for fundamental component)
126
+ rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \
127
+ device=f0_values.device)
128
+ rand_ini[:, 0] = 0
129
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
130
+
131
+ # instantaneous phase: sine[t] = sin(2*pi * sum_{i=1}^{t} rad_i)
132
+ if not self.flag_for_pulse:
133
+ # # for normal case
134
+
135
+ # # To prevent torch.cumsum numerical overflow,
136
+ # # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
137
+ # # Buffer tmp_over_one_idx indicates the time step to add -1.
138
+ # # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
139
+ # tmp_over_one = torch.cumsum(rad_values, 1) % 1
140
+ # tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
141
+ # cumsum_shift = torch.zeros_like(rad_values)
142
+ # cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
143
+
144
+ # phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
145
+ rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
146
+ scale_factor=1/self.upsample_scale,
147
+ mode="linear").transpose(1, 2)
148
+
149
+ # tmp_over_one = torch.cumsum(rad_values, 1) % 1
150
+ # tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
151
+ # cumsum_shift = torch.zeros_like(rad_values)
152
+ # cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
153
+
154
+ phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
155
+ phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
156
+ scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
157
+ sines = torch.sin(phase)
158
+
159
+ else:
160
+ # If necessary, make sure that the first time step of every
161
+ # voiced segments is sin(pi) or cos(0)
162
+ # This is used for pulse-train generation
163
+
164
+ # identify the last time step in unvoiced segments
165
+ uv = self._f02uv(f0_values)
166
+ uv_1 = torch.roll(uv, shifts=-1, dims=1)
167
+ uv_1[:, -1, :] = 1
168
+ u_loc = (uv < 1) * (uv_1 > 0)
169
+
170
+ # get the instantaneous phase
171
+ tmp_cumsum = torch.cumsum(rad_values, dim=1)
172
+ # different batch needs to be processed differently
173
+ for idx in range(f0_values.shape[0]):
174
+ temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
175
+ temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
176
+ # stores the accumulation of i.phase within
177
+ # each voiced segments
178
+ tmp_cumsum[idx, :, :] = 0
179
+ tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
180
+
181
+ # rad_values - tmp_cumsum: remove the accumulation of i.phase
182
+ # within the previous voiced segment.
183
+ i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
184
+
185
+ # get the sines
186
+ sines = torch.cos(i_phase * 2 * np.pi)
187
+ return sines
188
+
189
+ def forward(self, f0):
190
+ """ sine_tensor, uv = forward(f0)
191
+ input F0: tensor(batchsize=1, length, dim=1)
192
+ f0 for unvoiced steps should be 0
193
+ output sine_tensor: tensor(batchsize=1, length, dim)
194
+ output uv: tensor(batchsize=1, length, 1)
195
+ """
196
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
197
+ device=f0.device)
198
+ # fundamental component
199
+ fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
200
+
201
+ # generate sine waveforms
202
+ sine_waves = self._f02sine(fn) * self.sine_amp
203
+
204
+ # generate uv signal
205
+ # uv = torch.ones(f0.shape)
206
+ # uv = uv * (f0 > self.voiced_threshold)
207
+ uv = self._f02uv(f0)
208
+
209
+ # noise: for unvoiced should be similar to sine_amp
210
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
211
+ # . for voiced regions is self.noise_std
212
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
213
+ noise = noise_amp * torch.randn_like(sine_waves)
214
+
215
+ # first: set the unvoiced part to 0 by uv
216
+ # then: additive noise
217
+ sine_waves = sine_waves * uv + noise
218
+ return sine_waves, uv, noise
219
+
220
+
221
+ class SourceModuleHnNSF(torch.nn.Module):
222
+ """ SourceModule for hn-nsf
223
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
224
+ add_noise_std=0.003, voiced_threshod=0)
225
+ sampling_rate: sampling_rate in Hz
226
+ harmonic_num: number of harmonic above F0 (default: 0)
227
+ sine_amp: amplitude of sine source signal (default: 0.1)
228
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
229
+ note that amplitude of noise in unvoiced is decided
230
+ by sine_amp
231
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
232
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
233
+ F0_sampled (batchsize, length, 1)
234
+ Sine_source (batchsize, length, 1)
235
+ noise_source (batchsize, length 1)
236
+ uv (batchsize, length, 1)
237
+ """
238
+
239
+ def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
240
+ add_noise_std=0.003, voiced_threshod=0):
241
+ super(SourceModuleHnNSF, self).__init__()
242
+
243
+ self.sine_amp = sine_amp
244
+ self.noise_std = add_noise_std
245
+
246
+ # to produce sine waveforms
247
+ self.l_sin_gen = SineGen(sampling_rate, upsample_scale, harmonic_num,
248
+ sine_amp, add_noise_std, voiced_threshod)
249
+
250
+ # to merge source harmonics into a single excitation
251
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
252
+ self.l_tanh = torch.nn.Tanh()
253
+
254
+ def forward(self, x):
255
+ """
256
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
257
+ F0_sampled (batchsize, length, 1)
258
+ Sine_source (batchsize, length, 1)
259
+ noise_source (batchsize, length 1)
260
+ """
261
+ # source for harmonic branch
262
+ with torch.no_grad():
263
+ sine_wavs, uv, _ = self.l_sin_gen(x)
264
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
265
+
266
+ # source for noise branch, in the same shape as uv
267
+ noise = torch.randn_like(uv) * self.sine_amp / 3
268
+ return sine_merge, noise, uv
269
+ def padDiff(x):
270
+ return F.pad(F.pad(x, (0,0,-1,1), 'constant', 0) - x, (0,0,0,-1), 'constant', 0)
271
+
272
+ class Generator(torch.nn.Module):
273
+ def __init__(self, style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes):
274
+ super(Generator, self).__init__()
275
+ self.num_kernels = len(resblock_kernel_sizes)
276
+ self.num_upsamples = len(upsample_rates)
277
+ resblock = AdaINResBlock1
278
+
279
+ self.m_source = SourceModuleHnNSF(
280
+ sampling_rate=24000,
281
+ upsample_scale=np.prod(upsample_rates),
282
+ harmonic_num=8, voiced_threshod=10)
283
+
284
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
285
+ self.noise_convs = nn.ModuleList()
286
+ self.ups = nn.ModuleList()
287
+ self.noise_res = nn.ModuleList()
288
+
289
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
290
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
291
+
292
+ self.ups.append(weight_norm(ConvTranspose1d(upsample_initial_channel//(2**i),
293
+ upsample_initial_channel//(2**(i+1)),
294
+ k, u, padding=(u//2 + u%2), output_padding=u%2)))
295
+
296
+ if i + 1 < len(upsample_rates): #
297
+ stride_f0 = np.prod(upsample_rates[i + 1:])
298
+ self.noise_convs.append(Conv1d(
299
+ 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2))
300
+ self.noise_res.append(resblock(c_cur, 7, [1,3,5], style_dim))
301
+ else:
302
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
303
+ self.noise_res.append(resblock(c_cur, 11, [1,3,5], style_dim))
304
+
305
+ self.resblocks = nn.ModuleList()
306
+
307
+ self.alphas = nn.ParameterList()
308
+ self.alphas.append(nn.Parameter(torch.ones(1, upsample_initial_channel, 1)))
309
+
310
+ for i in range(len(self.ups)):
311
+ ch = upsample_initial_channel//(2**(i+1))
312
+ self.alphas.append(nn.Parameter(torch.ones(1, ch, 1)))
313
+
314
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
315
+ self.resblocks.append(resblock(ch, k, d, style_dim))
316
+
317
+ self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
318
+ self.ups.apply(init_weights)
319
+ self.conv_post.apply(init_weights)
320
+
321
+ def forward(self, x, s, f0):
322
+
323
+ f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
324
+
325
+ har_source, noi_source, uv = self.m_source(f0)
326
+ har_source = har_source.transpose(1, 2)
327
+
328
+ for i in range(self.num_upsamples):
329
+ x = x + (1 / self.alphas[i]) * (torch.sin(self.alphas[i] * x) ** 2)
330
+ x_source = self.noise_convs[i](har_source)
331
+ x_source = self.noise_res[i](x_source, s)
332
+
333
+ x = self.ups[i](x)
334
+ x = x + x_source
335
+
336
+ xs = None
337
+ for j in range(self.num_kernels):
338
+ if xs is None:
339
+ xs = self.resblocks[i*self.num_kernels+j](x, s)
340
+ else:
341
+ xs += self.resblocks[i*self.num_kernels+j](x, s)
342
+ x = xs / self.num_kernels
343
+ x = x + (1 / self.alphas[i+1]) * (torch.sin(self.alphas[i+1] * x) ** 2)
344
+ x = self.conv_post(x)
345
+ x = torch.tanh(x)
346
+
347
+ return x
348
+
349
+ def remove_weight_norm(self):
350
+ print('Removing weight norm...')
351
+ for l in self.ups:
352
+ remove_weight_norm(l)
353
+ for l in self.resblocks:
354
+ l.remove_weight_norm()
355
+ remove_weight_norm(self.conv_pre)
356
+ remove_weight_norm(self.conv_post)
357
+
358
+
359
+ class AdainResBlk1d(nn.Module):
360
+ def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
361
+ upsample='none', dropout_p=0.0):
362
+ super().__init__()
363
+ self.actv = actv
364
+ self.upsample_type = upsample
365
+ self.upsample = UpSample1d(upsample)
366
+ self.learned_sc = dim_in != dim_out
367
+ self._build_weights(dim_in, dim_out, style_dim)
368
+ self.dropout = nn.Dropout(dropout_p)
369
+
370
+ if upsample == 'none':
371
+ self.pool = nn.Identity()
372
+ else:
373
+ self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
374
+
375
+
376
+ def _build_weights(self, dim_in, dim_out, style_dim):
377
+ self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
378
+ self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
379
+ self.norm1 = AdaIN1d(style_dim, dim_in)
380
+ self.norm2 = AdaIN1d(style_dim, dim_out)
381
+ if self.learned_sc:
382
+ self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
383
+
384
+ def _shortcut(self, x):
385
+ x = self.upsample(x)
386
+ if self.learned_sc:
387
+ x = self.conv1x1(x)
388
+ return x
389
+
390
+ def _residual(self, x, s):
391
+ x = self.norm1(x, s)
392
+ x = self.actv(x)
393
+ x = self.pool(x)
394
+ x = self.conv1(self.dropout(x))
395
+ x = self.norm2(x, s)
396
+ x = self.actv(x)
397
+ x = self.conv2(self.dropout(x))
398
+ return x
399
+
400
+ def forward(self, x, s):
401
+ out = self._residual(x, s)
402
+ out = (out + self._shortcut(x)) / math.sqrt(2)
403
+ return out
404
+
405
+ class UpSample1d(nn.Module):
406
+ def __init__(self, layer_type):
407
+ super().__init__()
408
+ self.layer_type = layer_type
409
+
410
+ def forward(self, x):
411
+ if self.layer_type == 'none':
412
+ return x
413
+ else:
414
+ return F.interpolate(x, scale_factor=2, mode='nearest')
415
+
416
+ class Decoder(nn.Module):
417
+ def __init__(self, dim_in=512, F0_channel=512, style_dim=64, dim_out=80,
418
+ resblock_kernel_sizes = [3,7,11],
419
+ upsample_rates = [10,5,3,2],
420
+ upsample_initial_channel=512,
421
+ resblock_dilation_sizes=[[1,3,5], [1,3,5], [1,3,5]],
422
+ upsample_kernel_sizes=[20,10,6,4]):
423
+ super().__init__()
424
+
425
+ self.decode = nn.ModuleList()
426
+
427
+ self.encode = AdainResBlk1d(dim_in + 2, 1024, style_dim)
428
+
429
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
430
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
431
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
432
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 512, style_dim, upsample=True))
433
+
434
+ self.F0_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
435
+
436
+ self.N_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
437
+
438
+ self.asr_res = nn.Sequential(
439
+ weight_norm(nn.Conv1d(512, 64, kernel_size=1)),
440
+ )
441
+
442
+
443
+ self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes)
444
+
445
+
446
+ def forward(self, asr, F0_curve, N, s):
447
+ if self.training:
448
+ downlist = [0, 3, 7]
449
+ F0_down = downlist[random.randint(0, 2)]
450
+ downlist = [0, 3, 7, 15]
451
+ N_down = downlist[random.randint(0, 3)]
452
+ if F0_down:
453
+ F0_curve = nn.functional.conv1d(F0_curve.unsqueeze(1), torch.ones(1, 1, F0_down).to(asr.device), padding=F0_down//2).squeeze(1) / F0_down
454
+ if N_down:
455
+ N = nn.functional.conv1d(N.unsqueeze(1), torch.ones(1, 1, N_down).to(asr.device), padding=N_down//2).squeeze(1) / N_down
456
+
457
+
458
+ F0 = self.F0_conv(F0_curve.unsqueeze(1))
459
+ N = self.N_conv(N.unsqueeze(1))
460
+
461
+ x = torch.cat([asr, F0, N], axis=1)
462
+ x = self.encode(x, s)
463
+
464
+ asr_res = self.asr_res(asr)
465
+
466
+ res = True
467
+ for block in self.decode:
468
+ if res:
469
+ x = torch.cat([x, asr_res, F0, N], axis=1)
470
+ x = block(x, s)
471
+ if block.upsample_type != "none":
472
+ res = False
473
+
474
+ x = self.generator(x, s, F0_curve)
475
+ return x
476
+
477
+
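Both `AdaINResBlock1` and `Generator` above apply the Snake activation inline as `x + (1/alpha) * sin(alpha * x)**2` with a learnable per-channel `alpha` (the `alpha1`/`alpha2`/`alphas` parameters). A minimal standalone sketch of that activation for reference; the module name `Snake1d` is illustrative and does not appear in this file:

```python
import torch
import torch.nn as nn

class Snake1d(nn.Module):
    """Snake activation: x + (1/alpha) * sin(alpha * x)**2, with a learnable per-channel alpha."""
    def __init__(self, channels):
        super().__init__()
        # Same shape as the alpha1/alpha2/alphas parameters above: (1, channels, 1).
        self.alpha = nn.Parameter(torch.ones(1, channels, 1))

    def forward(self, x):  # x: (batch, channels, time)
        return x + (1.0 / self.alpha) * torch.sin(self.alpha * x) ** 2

print(Snake1d(64)(torch.randn(2, 64, 100)).shape)  # torch.Size([2, 64, 100])
```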
Modules/utils.py ADDED
@@ -0,0 +1,14 @@
1
+ def init_weights(m, mean=0.0, std=0.01):
2
+ classname = m.__class__.__name__
3
+ if classname.find("Conv") != -1:
4
+ m.weight.data.normal_(mean, std)
5
+
6
+
7
+ def apply_weight_norm(m):
8
+ classname = m.__class__.__name__
9
+ if classname.find("Conv") != -1:
10
+ weight_norm(m)
11
+
12
+
13
+ def get_padding(kernel_size, dilation=1):
14
+ return int((kernel_size*dilation - dilation)/2)
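`get_padding` above returns the "same" padding for a stride-1 convolution with an odd kernel, which is how the resblocks in `hifigan.py` keep the time dimension unchanged. A quick sanity check with illustrative values:

```python
import torch
import torch.nn as nn

def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)

# With this padding, a stride-1 dilated Conv1d preserves the input length.
conv = nn.Conv1d(8, 8, kernel_size=7, dilation=3, padding=get_padding(7, 3))
print(conv(torch.randn(1, 8, 100)).shape)  # torch.Size([1, 8, 100])
```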
README.md CHANGED
@@ -1,3 +1,131 @@
 
1
  ---
2
  license: cc-by-nc-sa-4.0
3
 ---
1
+ <<<<<<< HEAD
2
  ---
3
  license: cc-by-nc-sa-4.0
4
  ---
5
+ =======
6
+ # StyleTTS 2: Towards Human-Level Text-to-Speech through Style Diffusion and Adversarial Training with Large Speech Language Models
7
+
8
+ ### Yinghao Aaron Li, Cong Han, Vinay S. Raghavan, Gavin Mischler, Nima Mesgarani
9
+
10
+ > In this paper, we present StyleTTS 2, a text-to-speech (TTS) model that leverages style diffusion and adversarial training with large speech language models (SLMs) to achieve human-level TTS synthesis. StyleTTS 2 differs from its predecessor by modeling styles as a latent random variable through diffusion models to generate the most suitable style for the text without requiring reference speech, achieving efficient latent diffusion while benefiting from the diverse speech synthesis offered by diffusion models. Furthermore, we employ large pre-trained SLMs, such as WavLM, as discriminators with our novel differentiable duration modeling for end-to-end training, resulting in improved speech naturalness. StyleTTS 2 surpasses human recordings on the single-speaker LJSpeech dataset and matches it on the multispeaker VCTK dataset as judged by native English speakers. Moreover, when trained on the LibriTTS dataset, our model outperforms previous publicly available models for zero-shot speaker adaptation. This work achieves the first human-level TTS synthesis on both single and multispeaker datasets, showcasing the potential of style diffusion and adversarial training with large SLMs.
11
+
12
+ Paper: [https://arxiv.org/abs/2306.07691](https://arxiv.org/abs/2306.07691)
13
+
14
+ Audio samples: [https://styletts2.github.io/](https://styletts2.github.io/)
15
+
16
+ Online demo: [Hugging Face](https://huggingface.co/spaces/styletts2/styletts2) (thank [@fakerybakery](https://github.com/fakerybakery) for the wonderful online demo)
17
+
18
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yl4579/StyleTTS2/blob/main/) [![Discord](https://img.shields.io/discord/1197679063150637117?logo=discord&logoColor=white&label=Join%20our%20Community)](https://discord.gg/ha8sxdG2K4)
19
+
20
+ ## TODO
21
+ - [x] Training and inference demo code for single-speaker models (LJSpeech)
22
+ - [x] Test training code for multi-speaker models (VCTK and LibriTTS)
23
+ - [x] Finish demo code for multispeaker model and upload pre-trained models
24
+ - [x] Add a finetuning script for new speakers with base pre-trained multispeaker models
25
+ - [ ] Fix DDP (accelerator) for `train_second.py` **(I have tried everything I could to fix this but had no success, so if you are willing to help, please see [#7](https://github.com/yl4579/StyleTTS2/issues/7))**
26
+
27
+ ## Pre-requisites
28
+ 1. Python >= 3.7
29
+ 2. Clone this repository:
30
+ ```bash
31
+ git clone https://github.com/yl4579/StyleTTS2.git
32
+ cd StyleTTS2
33
+ ```
34
+ 3. Install python requirements:
35
+ ```bash
36
+ pip install -r requirements.txt
37
+ ```
38
+ On Windows add:
39
+ ```bash
40
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 -U
41
+ ```
42
+ Also install phonemizer and espeak if you want to run the demo:
43
+ ```bash
44
+ pip install phonemizer
45
+ sudo apt-get install espeak-ng
46
+ ```
47
+ 4. Download and extract the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/), unzip to the data folder and upsample the data to 24 kHz. The text aligner and pitch extractor are pre-trained on 24 kHz data, but you can easily change the preprocessing and re-train them using your own preprocessing.
48
+ For LibriTTS, you will need to combine train-clean-360 with train-clean-100 and rename the folder train-clean-460 (see [val_list_libritts.txt](https://github.com/yl4579/StyleTTS/blob/main/Data/val_list_libritts.txt) as an example).
49
+
50
+ ## Training
51
+ First stage training:
52
+ ```bash
53
+ accelerate launch train_first.py --config_path ./Configs/config.yml
54
+ ```
55
+ Second stage training **(DDP version not working, so the current version uses DP, again see [#7](https://github.com/yl4579/StyleTTS2/issues/7) if you want to help)**:
56
+ ```bash
57
+ python train_second.py --config_path ./Configs/config.yml
58
+ ```
59
+ You can run both consecutively and it will train both the first and second stages. The model will be saved in the format "epoch_1st_%05d.pth" and "epoch_2nd_%05d.pth". Checkpoints and Tensorboard logs will be saved at `log_dir`.
60
+
61
+ The data list format needs to be `filename.wav|transcription|speaker`, see [val_list.txt](https://github.com/yl4579/StyleTTS2/blob/main/Data/val_list.txt) as an example. The speaker labels are needed for multi-speaker models because we need to sample reference audio for style diffusion model training.
62
+
63
+ ### Important Configurations
64
+ In [config.yml](https://github.com/yl4579/StyleTTS2/blob/main/Configs/config.yml), there are a few important configurations to take care of:
65
+ - `OOD_data`: The path for out-of-distribution texts for SLM adversarial training. The format should be `text|anything`.
66
+ - `min_length`: Minimum length of OOD texts for training. This is to make sure the synthesized speech has a minimum length.
67
+ - `max_len`: Maximum length of audio for training, in frames. Since the default hop size is 300, one frame is approximately `300 / 24000` (0.0125) seconds. Lower this if you encounter out-of-memory issues.
68
+ - `multispeaker`: Set to true if you want to train a multispeaker model. This is needed because the architecture of the denoiser is different for single and multispeaker models.
69
+ - `batch_percentage`: This is to make sure there are no out-of-memory (OOM) issues during SLM adversarial training. If you encounter OOM problems, set this to a lower number.
70
+
71
+ ### Pre-trained modules
72
+ In [Utils](https://github.com/yl4579/StyleTTS2/tree/main/Utils) folder, there are three pre-trained models:
73
+ - **[ASR](https://github.com/yl4579/StyleTTS2/tree/main/Utils/ASR) folder**: It contains the pre-trained text aligner, which was pre-trained on English (LibriTTS), Japanese (JVS), and Chinese (AiShell) corpus. It works well for most other languages without fine-tuning, but you can always train your own text aligner with the code here: [yl4579/AuxiliaryASR](https://github.com/yl4579/AuxiliaryASR).
74
+ - **[JDC](https://github.com/yl4579/StyleTTS2/tree/main/Utils/JDC) folder**: It contains the pre-trained pitch extractor, which was pre-trained on English (LibriTTS) corpus only. However, it works well for other languages too because F0 is independent of language. If you want to train on singing corpus, it is recommended to train a new pitch extractor with the code here: [yl4579/PitchExtractor](https://github.com/yl4579/PitchExtractor).
75
+ - **[PLBERT](https://github.com/yl4579/StyleTTS2/tree/main/Utils/PLBERT) folder**: It contains the pre-trained [PL-BERT](https://arxiv.org/abs/2301.08810) model, which was pre-trained on English (Wikipedia) corpus only. It probably does not work very well on other languages, so you will need to train a different PL-BERT for different languages using the repo here: [yl4579/PL-BERT](https://github.com/yl4579/PL-BERT). You can also use the [multilingual PL-BERT](https://huggingface.co/papercup-ai/multilingual-pl-bert) which supports 14 languages.
76
+
77
+ ### Common Issues
78
+ - **Loss becomes NaN**: In the first stage, make sure you do not use mixed precision, as it can cause the loss to become NaN for some datasets when the batch size is not set properly (it needs to be more than 16 to work well). For the second stage, also experiment with different batch sizes; higher batch sizes are more likely to cause NaN loss values. We recommend a batch size of 16. You can refer to issues [#10](https://github.com/yl4579/StyleTTS2/issues/10) and [#11](https://github.com/yl4579/StyleTTS2/issues/11) for more details.
79
+ - **Out of memory**: Please either use lower `batch_size` or `max_len`. You may refer to issue [#10](https://github.com/yl4579/StyleTTS2/issues/10) for more information.
80
+ - **Non-English dataset**: You can train on any language you want, but you will need to use a pre-trained PL-BERT model for that language. We have a pre-trained [multilingual PL-BERT](https://huggingface.co/papercup-ai/multilingual-pl-bert) that supports 14 languages. You may refer to [yl4579/StyleTTS#10](https://github.com/yl4579/StyleTTS/issues/10) and [#70](https://github.com/yl4579/StyleTTS2/issues/70) for some examples to train on Chinese datasets.
81
+
82
+ ## Finetuning
83
+ The script is modified from `train_second.py` which uses DP, as DDP does not work for `train_second.py`. Please see the bold section above if you are willing to help with this problem.
84
+ ```bash
85
+ python train_finetune.py --config_path ./Configs/config_ft.yml
86
+ ```
87
+ Please make sure you have the LibriTTS checkpoint downloaded and unzipped under the folder. The default configuration `config_ft.yml` finetunes on LJSpeech with 1 hour of speech data (around 1k samples) for 50 epochs. This took about 4 hours to finish on four NVIDIA A100s. The quality is slightly worse (similar to NaturalSpeech on LJSpeech) than the LJSpeech model trained from scratch with 24 hours of speech data, which took around 2.5 days to finish on four A100s. The samples can be found at [#65 (comment)](https://github.com/yl4579/StyleTTS2/discussions/65#discussioncomment-7668393).
88
+
89
+ If you are using a **single GPU** (because the script doesn't work with DDP) and want to save training time and VRAM, you can run (thanks to [@korakoe](https://github.com/korakoe) for making the script at [#100](https://github.com/yl4579/StyleTTS2/pull/100)):
90
+ ```bash
91
+ accelerate launch --mixed_precision=fp16 --num_processes=1 train_finetune_accelerate.py --config_path ./Configs/config_ft.yml
92
+ ```
93
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yl4579/StyleTTS2/blob/main/Colab/StyleTTS2_Finetune_Demo.ipynb)
94
+
95
+ ### Common Issues
96
+ [@Kreevoz](https://github.com/Kreevoz) has made detailed notes on common issues in finetuning, with suggestions for maximizing audio quality: [#81](https://github.com/yl4579/StyleTTS2/discussions/81). Some of these also apply to training from scratch. [@IIEleven11](https://github.com/IIEleven11) has also made a guideline for fine-tuning: [#128](https://github.com/yl4579/StyleTTS2/discussions/128).
97
+
98
+ - **Out of memory after `joint_epoch`**: This is likely because your GPU RAM is not big enough for the SLM adversarial training run. You may skip it, but the quality could be worse. Setting `joint_epoch` to a number larger than `epochs` skips SLM adversarial training.
99
+
100
+ ## Inference
101
+ Please refer to [Inference_LJSpeech.ipynb](https://github.com/yl4579/StyleTTS2/blob/main/Demo/Inference_LJSpeech.ipynb) (single-speaker) and [Inference_LibriTTS.ipynb](https://github.com/yl4579/StyleTTS2/blob/main/Demo/Inference_LibriTTS.ipynb) (multi-speaker) for details. For LibriTTS, you will also need to download [reference_audio.zip](https://huggingface.co/yl4579/StyleTTS2-LibriTTS/resolve/main/reference_audio.zip) and unzip it under the `demo` folder before running the demo.
102
+
103
+ - The pretrained StyleTTS 2 on LJSpeech corpus in 24 kHz can be downloaded at [https://huggingface.co/yl4579/StyleTTS2-LJSpeech/tree/main](https://huggingface.co/yl4579/StyleTTS2-LJSpeech/tree/main).
104
+
105
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yl4579/StyleTTS2/blob/main/Colab/StyleTTS2_Demo_LJSpeech.ipynb)
106
+
107
+ - The pretrained StyleTTS 2 model on LibriTTS can be downloaded at [https://huggingface.co/yl4579/StyleTTS2-LibriTTS/tree/main](https://huggingface.co/yl4579/StyleTTS2-LibriTTS/tree/main).
108
+
109
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yl4579/StyleTTS2/blob/main/Colab/StyleTTS2_Demo_LibriTTS.ipynb)
110
+
111
+
112
+ You can import StyleTTS 2 and run it in your own code. However, the inference depends on a GPL-licensed package, so it is not included directly in this repository. A [GPL-licensed fork](https://github.com/NeuralVox/StyleTTS2) has an importable script, as well as an experimental streaming API, etc. A [fully MIT-licensed package](https://pypi.org/project/styletts2/) that uses gruut (albeit lower quality due to mismatch between phonemizer and gruut) is also available.
113
+
114
+ ***Before using these pre-trained models, you agree to inform the listeners that the speech samples are synthesized by the pre-trained models, unless you have the permission to use the voice you synthesize. That is, you agree to only use voices whose speakers grant the permission to have their voice cloned, either directly or by license before making synthesized voices public, or you have to publicly announce that these voices are synthesized if you do not have the permission to use these voices.***
115
+
116
+ ### Common Issues
117
+ - **High-pitched background noise**: This is caused by numerical float differences in older GPUs. For more details, please refer to issue [#13](https://github.com/yl4579/StyleTTS2/issues/13). Basically, you will need to use more modern GPUs or do inference on CPUs.
118
+ - **Pre-trained model license**: You only need to abide by the above rules if you use **the pre-trained models** and the voices are **NOT** in the training set, i.e., your reference speakers are not from any open access dataset. For more details of rules to use the pre-trained models, please see [#37](https://github.com/yl4579/StyleTTS2/issues/37).
119
+
120
+ ## References
121
+ - [archinetai/audio-diffusion-pytorch](https://github.com/archinetai/audio-diffusion-pytorch)
122
+ - [jik876/hifi-gan](https://github.com/jik876/hifi-gan)
123
+ - [rishikksh20/iSTFTNet-pytorch](https://github.com/rishikksh20/iSTFTNet-pytorch)
124
+ - [nii-yamagishilab/project-NN-Pytorch-scripts/project/01-nsf](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts/tree/master/project/01-nsf)
125
+
126
+ ## License
127
+
128
+ Code: MIT License
129
+
130
+ Pre-Trained Models: Before using these pre-trained models, you agree to inform the listeners that the speech samples are synthesized by the pre-trained models, unless you have the permission to use the voice you synthesize. That is, you agree to only use voices whose speakers grant the permission to have their voice cloned, either directly or by license before making synthesized voices public, or you have to publicly announce that these voices are synthesized if you do not have the permission to use these voices.
131
+ >>>>>>> 062910b (first commit)
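For reference, the train/validation lists mentioned in the README above are plain-text files with one `filename.wav|transcription|speaker` entry per line. A hedged sketch of a loader for that format (the path in the comment is the one from `Models/config.yml`; the helper itself is not part of the repository):

```python
# Parse a StyleTTS2-style data list: one "filename.wav|transcription|speaker" entry per line.
def load_data_list(path):
    entries = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            filename, transcription, speaker = line.split("|", maxsplit=2)
            entries.append({"wav": filename, "text": transcription, "speaker": speaker})
    return entries

# e.g. entries = load_data_list("../../Data_Speech/viVoice/train.txt")  # path from Models/config.yml
```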
inference.py ADDED
@@ -0,0 +1,344 @@
1
+ import numpy as np
2
+ import torch
3
+
4
+ import yaml
5
+ from munch import Munch
6
+ import unicodedata
7
+ import re
8
+ import torchaudio
9
+
10
+ from nltk.tokenize import word_tokenize
11
+ import nltk
12
+ nltk.download('punkt_tab')
13
+
14
+ import librosa
15
+ import noisereduce as nr
16
+
17
+ from models import ProsodyPredictor, TextEncoder, StyleEncoder
18
+ from Modules.hifigan import Decoder
19
+
20
+
21
+ import phonemizer
22
+
23
+ # Point phonemizer at the bundled espeak-ng library (needed on Windows)
24
+ from phonemizer.backend.espeak.wrapper import EspeakWrapper
25
+ import espeakng_loader
26
+ EspeakWrapper.set_library(espeakng_loader.get_library_path())
27
+
28
+ def espeak_phn(text, lang):
29
+ try:
30
+ my_phonemizer = phonemizer.backend.EspeakBackend(language=lang, preserve_punctuation=True, with_stress=True, language_switch='remove-flags')
31
+ return my_phonemizer.phonemize([text])[0]
32
+ except Exception as e:
33
+ print(e)
34
+
35
+ # IPA Phonemizer: https://github.com/bootphon/phonemizer
36
+ # Total including extend chars 187
37
+
38
+ _pad = "$"
39
+ _punctuation = ';:,.!?¡¿—…"«»“” '
40
+ _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
41
+ _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
42
+ _extend = "∫̆ăη͡123456"
43
+
44
+ # Export all symbols:
45
+ symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) + list(_extend)
46
+
47
+ dicts = {}
48
+ for i in range(len((symbols))):
49
+ dicts[symbols[i]] = i
50
+
51
+ class TextCleaner:
52
+ def __init__(self, dummy=None):
53
+ self.word_index_dictionary = dicts
54
+ #print(len(dicts))
55
+ def __call__(self, text):
56
+ indexes = []
57
+ for char in text:
58
+ try:
59
+ indexes.append(self.word_index_dictionary[char])
60
+ except KeyError as e:
61
+ #print(char)
62
+ continue
63
+ return indexes
64
+
65
+ class Preprocess:
66
+ def __text_normalize(self, text):
67
+ punctuation = [",", "、", "،", ";", "(", ".", "。", "…", "!", "–", ":"]
68
+ map_to = "."
69
+ punctuation_pattern = re.compile(f"[{''.join(re.escape(p) for p in punctuation)}]")
70
+ #ensure consistency.
71
+ text = unicodedata.normalize('NFKC', text)
72
+ #replace punctuation that acts like a comma or period
73
+ #text = re.sub(r'\.{2,}', '.', text)
74
+ text = punctuation_pattern.sub(map_to, text)
75
+ #remove or replace special chars except . , { } ? ' - \ % $ & /
76
+ text = re.sub(r'[^\w\s.,{}?\'\-\[\]\%\$\&\/]', ' ', text)
77
+ #replace consecutive whitespace chars with a single space and strip leading/trailing spaces
78
+ text = re.sub(r'\s+', ' ', text).strip()
79
+ return text
80
+ def __merge_fragments(self, texts, n):
81
+ merged = []
82
+ i = 0
83
+ while i < len(texts):
84
+ fragment = texts[i]
85
+ j = i + 1
86
+ while len(fragment.split()) < n and j < len(texts):
87
+ fragment += ", " + texts[j]
88
+ j += 1
89
+ merged.append(fragment)
90
+ i = j
91
+ if len(merged[-1].split()) < n and len(merged) > 1: #handle last sentence
92
+ merged[-2] = merged[-2] + ", " + merged[-1]
93
+ del merged[-1]
94
+ else:
95
+ merged[-1] = merged[-1]
96
+ return merged
97
+ def wave_preprocess(self, wave):
98
+ to_mel = torchaudio.transforms.MelSpectrogram(n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
99
+ mean, std = -4, 4
100
+ wave_tensor = torch.from_numpy(wave).float()
101
+ mel_tensor = to_mel(wave_tensor)
102
+ mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
103
+ return mel_tensor
104
+ def text_preprocess(self, text, n_merge=12):
105
+ text_norm = self.__text_normalize(text).replace(",", ".").split(".")#split.
106
+ text_norm = [s.strip() for s in text_norm]
107
+ text_norm = list(filter(lambda x: x != '', text_norm)) #filter empty index
108
+ text_norm = self.__merge_fragments(text_norm, n=n_merge) #merge if a sentence has fewer than n words
109
+ return text_norm
110
+ def length_to_mask(self, lengths):
111
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
112
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
113
+ return mask
114
+
115
+ #For inference only
116
+ class StyleTTS2(torch.nn.Module):
117
+ def __init__(self, config_path, models_path):
118
+ super().__init__()
119
+ self.register_buffer("get_device", torch.empty(0))
120
+ self.preprocess = Preprocess()
121
+
122
+ config = yaml.safe_load(open(config_path))
123
+ args = self.__recursive_munch(config['model_params'])
124
+
125
+ assert args.decoder.type in ['hifigan'], 'Decoder type unknown'
126
+
127
+ self.decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels,
128
+ resblock_kernel_sizes = args.decoder.resblock_kernel_sizes,
129
+ upsample_rates = args.decoder.upsample_rates,
130
+ upsample_initial_channel=args.decoder.upsample_initial_channel,
131
+ resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
132
+ upsample_kernel_sizes=args.decoder.upsample_kernel_sizes)
133
+ self.predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
134
+ self.text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
135
+ self.style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)# acoustic style encoder
136
+
137
+ self.__load_models(models_path)
138
+
139
+ self.ref_s_speakers = None
140
+ self.speakers = None
141
+
142
+ def __recursive_munch(self, d):
143
+ if isinstance(d, dict):
144
+ return Munch((k, self.__recursive_munch(v)) for k, v in d.items())
145
+ elif isinstance(d, list):
146
+ return [self.__recursive_munch(v) for v in d]
147
+ else:
148
+ return d
149
+
150
+ def __init_replacement_func(self, replacements):
151
+ replacement_iter = iter(replacements)
152
+ def replacement(match):
153
+ return next(replacement_iter)
154
+ return replacement
155
+
156
+ def __load_models(self, models_path):
157
+ module_params = []
158
+ model = {'decoder':self.decoder, 'predictor':self.predictor, 'text_encoder':self.text_encoder, 'style_encoder':self.style_encoder}
159
+
160
+ params_whole = torch.load(models_path, map_location='cpu')
161
+ params = params_whole['net']
162
+ params = {key: value for key, value in params.items() if key in model.keys()}
163
+
164
+ for key in model:
165
+ try:
166
+ model[key].load_state_dict(params[key])
167
+ except:
168
+ from collections import OrderedDict
169
+ state_dict = params[key]
170
+ new_state_dict = OrderedDict()
171
+ for k, v in state_dict.items():
172
+ name = k[7:] # remove `module.`
173
+ new_state_dict[name] = v
174
+ model[key].load_state_dict(new_state_dict, strict=False)
175
+
176
+ total_params = sum(p.numel() for p in model[key].parameters())
177
+ print(key,":",total_params)
178
+ module_params.append(total_params)
179
+
180
+ print('\nTotal',":",sum(module_params))
181
+
182
+ def __compute_style(self, path, denoise, split_dur):
183
+ device = self.get_device.device
184
+ denoise = min(denoise, 1)
185
+ if split_dur != 0: split_dur = max(int(split_dur), 1)
186
+ max_samples = 24000*30 #max 30 seconds ref audio
187
+ print("Computing the style for:", path)
188
+
189
+ wave, sr = librosa.load(path, sr=24000)
190
+ audio, index = librosa.effects.trim(wave, top_db=30)
191
+ if sr != 24000:
192
+ audio = librosa.resample(audio, sr, 24000)
193
+ if len(audio) > max_samples:
194
+ audio = audio[:max_samples]
195
+
196
+ if denoise > 0.0:
197
+ audio_denoise = nr.reduce_noise(y=audio, sr=sr, n_fft=2048, win_length=1200, hop_length=300)
198
+ audio = audio*(1-denoise) + audio_denoise*denoise
199
+
200
+ with torch.no_grad():
201
+ if split_dur>0 and len(audio)/sr>split_dur:
202
+ #This option will split the ref audio to multiple parts, calculate styles and average them
203
+ count = 0
204
+ ref_s = None
205
+ jump = sr*split_dur
206
+ total_len = len(audio)
207
+
208
+ #Need to init before the loop
209
+ mel_tensor = self.preprocess.wave_preprocess(audio[0:jump]).to(device)
210
+ ref_s = self.style_encoder(mel_tensor.unsqueeze(1))
211
+ count += 1
212
+ for i in range(jump, total_len, jump):
213
+ if i+jump >= total_len:
214
+ left_dur = (total_len-i)/sr
215
+ if left_dur >= 0.5: #Still count if left over dur is >= 0.5s
216
+ mel_tensor = self.preprocess.wave_preprocess(audio[i:total_len]).to(device)
217
+ ref_s += self.style_encoder(mel_tensor.unsqueeze(1))
218
+ count += 1
219
+ continue
220
+ mel_tensor = self.preprocess.wave_preprocess(audio[i:i+jump]).to(device)
221
+ ref_s += self.style_encoder(mel_tensor.unsqueeze(1))
222
+ count += 1
223
+ ref_s /= count
224
+ else:
225
+ mel_tensor = self.preprocess.wave_preprocess(audio).to(device)
226
+ ref_s = self.style_encoder(mel_tensor.unsqueeze(1))
227
+
228
+ return ref_s
229
+
230
+ def __inference(self, phonem, ref_s, speed=1, prev_d_mean=0, t=0.1):
231
+ device = self.get_device.device
232
+ speed = min(max(speed, 0.0001), 2) #speed range [0, 2]
233
+
234
+ phonem = ' '.join(word_tokenize(phonem))
235
+ tokens = TextCleaner()(phonem)
236
+ tokens.insert(0, 0)
237
+ tokens.append(0)
238
+ tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
239
+
240
+ with torch.no_grad():
241
+ input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
242
+ text_mask = self.preprocess.length_to_mask(input_lengths).to(device)
243
+
244
+ # encode
245
+ t_en = self.text_encoder(tokens, input_lengths, text_mask)
246
+ s = ref_s.to(device)
247
+
248
+ # cal alignment
249
+ d = self.predictor.text_encoder(t_en, s, input_lengths, text_mask)
250
+ x, _ = self.predictor.lstm(d)
251
+ duration = self.predictor.duration_proj(x) / speed
252
+ duration = torch.sigmoid(duration).sum(axis=-1)
253
+
254
+ if prev_d_mean != 0:#Stabilize speaking speed
255
+ dur_stats = torch.empty(duration.shape).normal_(mean=prev_d_mean, std=duration.std()).to(device)
256
+ else:
257
+ dur_stats = torch.empty(duration.shape).normal_(mean=duration.mean(), std=duration.std()).to(device)
258
+ duration = duration*(1-t) + dur_stats*t
259
+
260
+ pred_dur = torch.round(duration.squeeze()).clamp(min=1)
261
+ pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
262
+ c_frame = 0
263
+ for i in range(pred_aln_trg.size(0)):
264
+ pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
265
+ c_frame += int(pred_dur[i].data)
266
+ alignment = pred_aln_trg.unsqueeze(0).to(device)
267
+
268
+ # encode prosody
269
+ en = (d.transpose(-1, -2) @ alignment)
270
+ F0_pred, N_pred = self.predictor.F0Ntrain(en, s)
271
+ asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
272
+
273
+ out = self.decoder(asr, F0_pred, N_pred, s)
274
+
275
+ return out.squeeze().cpu().numpy(), duration.mean()
276
+
277
+ def __get_styles(self, speakers, denoise, split_dur):
278
+ self.ref_s_speakers = {}
279
+ self.speakers = speakers
280
+ for id in speakers:
281
+ ref_s = self.__compute_style(speakers[id]['path'], denoise=denoise, split_dur=split_dur)
282
+ self.ref_s_speakers[id] = ref_s
283
+
284
+ def generate(self, text, speakers, avg_style=False, stabilize=False, denoise=0.3, n_merge=14, default_speaker= "[id_1]"):
285
+ if avg_style: split_dur = 3
286
+ else: split_dur = 0
287
+
288
+ if stabilize: smooth_dur=0.2
289
+ else: smooth_dur=0
290
+
291
+ self.__get_styles(speakers, denoise, split_dur)
292
+
293
+ list_wav = []
294
+ prev_d_mean = 0
295
+ lang_pattern = r'\[([^\]]+)\]\{([^}]+)\}'
296
+
297
+ text = re.sub(r'[\n\r\t\f\v]', '', text)
298
+ #fix lang tokens span to multiple sents
299
+ find_lang_tokens = re.findall(lang_pattern, text)
300
+ if find_lang_tokens:
301
+ cus_text = []
302
+ for lang, t in find_lang_tokens:
303
+ parts = self.preprocess.text_preprocess(t, n_merge=0)
304
+ parts = ".".join([f"[{lang}]" + f"{{{p}}}"for p in parts])
305
+ cus_text.append(parts)
306
+ replacement_func = self.__init_replacement_func(cus_text)
307
+ text = re.sub(lang_pattern, replacement_func, text)
308
+
309
+ texts = re.split(r'(\[id_\d+\])', text) #split the text by speaker ids while keeping the ids.
310
+ if len(texts) <= 1:
311
+ texts.insert(0, default_speaker)
312
+ texts = list(filter(lambda x: x != '', texts))
313
+
314
+ print("Generating Audio...")
315
+ for i in texts:
316
+ if bool(re.match(r'(\[id_\d+\])', i)):
317
+ # set up speaker-specific state (style, speed) for the matched id
318
+ speaker_id = i.strip('[]')
319
+ current_ref_s = self.ref_s_speakers[speaker_id]
320
+ speed = self.speakers[speaker_id]['speed']
321
+ continue
322
+ text_norm = self.preprocess.text_preprocess(i, n_merge=n_merge)
323
+ for sentence in text_norm:
324
+ cus_phonem = []
325
+ find_lang_tokens = re.findall(lang_pattern, sentence)
326
+ if find_lang_tokens:
327
+ for lang, t in find_lang_tokens:
328
+ try:
329
+ phonem = espeak_phn(t, lang)
330
+ cus_phonem.append(phonem)
331
+ except Exception as e:
332
+ print(e)
333
+
334
+ replacement_func = self.__init_replacement_func(cus_phonem)
335
+ phonem = espeak_phn(sentence, self.speakers[speaker_id]['lang'])
336
+ phonem = re.sub(lang_pattern, replacement_func, phonem)
337
+
338
+ wav, prev_d_mean = self.__inference(phonem, current_ref_s, speed=speed, prev_d_mean=prev_d_mean, t=smooth_dur)
339
+ wav = wav[4000:-4000]  # trim leading/trailing artifacts (pulse and silent tokens)
340
+ list_wav.append(wav)
341
+
342
+ final_wav = np.concatenate(list_wav)
343
+ final_wav = np.concatenate([np.zeros([12000]), final_wav, np.zeros([12000])], axis=0)  # pad 0.5 s of silence (12000 samples at 24 kHz) on each side
344
+ return final_wav
models.py ADDED
@@ -0,0 +1,532 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from torch.nn.utils import weight_norm
6
+
7
+ from munch import Munch
8
+
9
+ class LearnedDownSample(nn.Module):
10
+ def __init__(self, layer_type, dim_in):
11
+ super().__init__()
12
+ self.layer_type = layer_type
13
+
14
+ if self.layer_type == 'none':
15
+ self.conv = nn.Identity()
16
+ elif self.layer_type == 'timepreserve':
17
+ self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0))
18
+ elif self.layer_type == 'half':
19
+ self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1)
20
+ else:
21
+ raise RuntimeError('Got unexpected downsample type %s, expected one of [none, timepreserve, half]' % self.layer_type)
22
+
23
+ def forward(self, x):
24
+ return self.conv(x)
25
+
26
+ class LearnedUpSample(nn.Module):
27
+ def __init__(self, layer_type, dim_in):
28
+ super().__init__()
29
+ self.layer_type = layer_type
30
+
31
+ if self.layer_type == 'none':
32
+ self.conv = nn.Identity()
33
+ elif self.layer_type == 'timepreserve':
34
+ self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0))
35
+ elif self.layer_type == 'half':
36
+ self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1)
37
+ else:
38
+ raise RuntimeError('Got unexpected upsample type %s, expected one of [none, timepreserve, half]' % self.layer_type)
39
+
40
+
41
+ def forward(self, x):
42
+ return self.conv(x)
43
+
44
+ class DownSample(nn.Module):
45
+ def __init__(self, layer_type):
46
+ super().__init__()
47
+ self.layer_type = layer_type
48
+
49
+ def forward(self, x):
50
+ if self.layer_type == 'none':
51
+ return x
52
+ elif self.layer_type == 'timepreserve':
53
+ return F.avg_pool2d(x, (2, 1))
54
+ elif self.layer_type == 'half':
55
+ if x.shape[-1] % 2 != 0:
56
+ x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
57
+ return F.avg_pool2d(x, 2)
58
+ else:
59
+ raise RuntimeError('Got unexpected downsample type %s, expected one of [none, timepreserve, half]' % self.layer_type)
60
+
61
+
62
+ class UpSample(nn.Module):
63
+ def __init__(self, layer_type):
64
+ super().__init__()
65
+ self.layer_type = layer_type
66
+
67
+ def forward(self, x):
68
+ if self.layer_type == 'none':
69
+ return x
70
+ elif self.layer_type == 'timepreserve':
71
+ return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
72
+ elif self.layer_type == 'half':
73
+ return F.interpolate(x, scale_factor=2, mode='nearest')
74
+ else:
75
+ raise RuntimeError('Got unexpected upsample type %s, expected one of [none, timepreserve, half]' % self.layer_type)
76
+
77
+
78
+ class ResBlk(nn.Module):
79
+ def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
80
+ normalize=False, downsample='none'):
81
+ super().__init__()
82
+ self.actv = actv
83
+ self.normalize = normalize
84
+ self.downsample = DownSample(downsample)
85
+ self.downsample_res = LearnedDownSample(downsample, dim_in)
86
+ self.learned_sc = dim_in != dim_out
87
+ self._build_weights(dim_in, dim_out)
88
+
89
+ def _build_weights(self, dim_in, dim_out):
90
+ self.conv1 = nn.Conv2d(dim_in, dim_in, 3, 1, 1)
91
+ self.conv2 = nn.Conv2d(dim_in, dim_out, 3, 1, 1)
92
+ if self.normalize:
93
+ self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
94
+ self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
95
+ if self.learned_sc:
96
+ self.conv1x1 = nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False)
97
+
98
+ def _shortcut(self, x):
99
+ if self.learned_sc:
100
+ x = self.conv1x1(x)
101
+ if self.downsample:
102
+ x = self.downsample(x)
103
+ return x
104
+
105
+ def _residual(self, x):
106
+ if self.normalize:
107
+ x = self.norm1(x)
108
+ x = self.actv(x)
109
+ x = self.conv1(x)
110
+ x = self.downsample_res(x)
111
+ if self.normalize:
112
+ x = self.norm2(x)
113
+ x = self.actv(x)
114
+ x = self.conv2(x)
115
+ return x
116
+
117
+ def forward(self, x):
118
+ x = self._shortcut(x) + self._residual(x)
119
+ return x / math.sqrt(2) # unit variance
120
+
121
+ class StyleEncoder(nn.Module):
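+ # maps a 1-channel mel-spectrogram to a fixed-size style embedding of dimension style_dim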
122
+ def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384):
123
+ super().__init__()
124
+ blocks = []
125
+ blocks += [nn.Conv2d(1, dim_in, 3, 1, 1)]
126
+
127
+ repeat_num = 4
128
+ for _ in range(repeat_num):
129
+ dim_out = min(dim_in*2, max_conv_dim)
130
+ blocks += [ResBlk(dim_in, dim_out, downsample='half')]
131
+ dim_in = dim_out
132
+
133
+ blocks += [nn.LeakyReLU(0.2)]
134
+ blocks += [nn.Conv2d(dim_out, dim_out, 5, 1, 0)]
135
+ blocks += [nn.AdaptiveAvgPool2d(1)]
136
+ blocks += [nn.LeakyReLU(0.2)]
137
+ self.shared = nn.Sequential(*blocks)
138
+
139
+ self.unshared = nn.Linear(dim_out, style_dim)
140
+
141
+ def forward(self, x):
142
+ h = self.shared(x)
143
+ h = h.view(h.size(0), -1)
144
+ s = self.unshared(h)
145
+
146
+ return s
147
+
148
+ class LinearNorm(torch.nn.Module):
149
+ def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
150
+ super(LinearNorm, self).__init__()
151
+ self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
152
+
153
+ torch.nn.init.xavier_uniform_(
154
+ self.linear_layer.weight,
155
+ gain=torch.nn.init.calculate_gain(w_init_gain))
156
+
157
+ def forward(self, x):
158
+ return self.linear_layer(x)
159
+
160
+ class ResBlk1d(nn.Module):
161
+ def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
162
+ normalize=False, downsample='none', dropout_p=0.2):
163
+ super().__init__()
164
+ self.actv = actv
165
+ self.normalize = normalize
166
+ self.downsample_type = downsample
167
+ self.learned_sc = dim_in != dim_out
168
+ self._build_weights(dim_in, dim_out)
169
+ self.dropout_p = dropout_p
170
+
171
+ if self.downsample_type == 'none':
172
+ self.pool = nn.Identity()
173
+ else:
174
+ self.pool = weight_norm(nn.Conv1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1))
175
+
176
+ def _build_weights(self, dim_in, dim_out):
177
+ self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_in, 3, 1, 1))
178
+ self.conv2 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
179
+ if self.normalize:
180
+ self.norm1 = nn.InstanceNorm1d(dim_in, affine=True)
181
+ self.norm2 = nn.InstanceNorm1d(dim_in, affine=True)
182
+ if self.learned_sc:
183
+ self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
184
+
185
+ def downsample(self, x):
186
+ if self.downsample_type == 'none':
187
+ return x
188
+ else:
189
+ if x.shape[-1] % 2 != 0:
190
+ x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
191
+ return F.avg_pool1d(x, 2)
192
+
193
+ def _shortcut(self, x):
194
+ if self.learned_sc:
195
+ x = self.conv1x1(x)
196
+ x = self.downsample(x)
197
+ return x
198
+
199
+ def _residual(self, x):
200
+ if self.normalize:
201
+ x = self.norm1(x)
202
+ x = self.actv(x)
203
+ x = F.dropout(x, p=self.dropout_p, training=self.training)
204
+
205
+ x = self.conv1(x)
206
+ x = self.pool(x)
207
+ if self.normalize:
208
+ x = self.norm2(x)
209
+
210
+ x = self.actv(x)
211
+ x = F.dropout(x, p=self.dropout_p, training=self.training)
212
+
213
+ x = self.conv2(x)
214
+ return x
215
+
216
+ def forward(self, x):
217
+ x = self._shortcut(x) + self._residual(x)
218
+ return x / math.sqrt(2) # unit variance
219
+
220
+ class LayerNorm(nn.Module):
221
+ def __init__(self, channels, eps=1e-5):
222
+ super().__init__()
223
+ self.channels = channels
224
+ self.eps = eps
225
+
226
+ self.gamma = nn.Parameter(torch.ones(channels))
227
+ self.beta = nn.Parameter(torch.zeros(channels))
228
+
229
+ def forward(self, x):
230
+ x = x.transpose(1, -1)
231
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
232
+ return x.transpose(1, -1)
233
+
234
+ class TextEncoder(nn.Module):
235
+ def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
236
+ super().__init__()
237
+ self.embedding = nn.Embedding(n_symbols, channels)
238
+
239
+ padding = (kernel_size - 1) // 2
240
+ self.cnn = nn.ModuleList()
241
+ for _ in range(depth):
242
+ self.cnn.append(nn.Sequential(
243
+ weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)),
244
+ LayerNorm(channels),
245
+ actv,
246
+ nn.Dropout(0.2),
247
+ ))
248
+ # self.cnn = nn.Sequential(*self.cnn)
249
+
250
+ self.lstm = nn.LSTM(channels, channels//2, 1, batch_first=True, bidirectional=True)
251
+
252
+ def forward(self, x, input_lengths, m):
253
+ x = self.embedding(x) # [B, T, emb]
254
+ x = x.transpose(1, 2) # [B, emb, T]
255
+ m = m.to(input_lengths.device).unsqueeze(1)
256
+ x.masked_fill_(m, 0.0)
257
+
258
+ for c in self.cnn:
259
+ x = c(x)
260
+ x.masked_fill_(m, 0.0)
261
+
262
+ x = x.transpose(1, 2) # [B, T, chn]
263
+
264
+ input_lengths = input_lengths.cpu().numpy()
265
+ x = nn.utils.rnn.pack_padded_sequence(
266
+ x, input_lengths, batch_first=True, enforce_sorted=False)
267
+
268
+ self.lstm.flatten_parameters()
269
+ x, _ = self.lstm(x)
270
+ x, _ = nn.utils.rnn.pad_packed_sequence(
271
+ x, batch_first=True)
272
+
273
+ x = x.transpose(-1, -2)
274
+ x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
275
+
276
+ x_pad[:, :, :x.shape[-1]] = x
277
+ x = x_pad.to(x.device)
278
+
279
+ x.masked_fill_(m, 0.0)
280
+
281
+ return x
282
+
283
+ def inference(self, x):
284
+ x = self.embedding(x)
285
+ x = x.transpose(1, 2)
286
+ for c in self.cnn:  # self.cnn is an nn.ModuleList, so apply each block in turn
+ x = c(x)
287
+ x = x.transpose(1, 2)
288
+ self.lstm.flatten_parameters()
289
+ x, _ = self.lstm(x)
290
+ return x
291
+
292
+ def length_to_mask(self, lengths):
293
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
294
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
295
+ return mask
296
+
297
+
298
+
299
+ class AdaIN1d(nn.Module):
300
+ def __init__(self, style_dim, num_features):
301
+ super().__init__()
302
+ self.norm = nn.InstanceNorm1d(num_features, affine=False)
303
+ self.fc = nn.Linear(style_dim, num_features*2)
304
+
305
+ def forward(self, x, s):
306
+ h = self.fc(s)
307
+ h = h.view(h.size(0), h.size(1), 1)
308
+ gamma, beta = torch.chunk(h, chunks=2, dim=1)
309
+ return (1 + gamma) * self.norm(x) + beta
310
+
311
+ class UpSample1d(nn.Module):
312
+ def __init__(self, layer_type):
313
+ super().__init__()
314
+ self.layer_type = layer_type
315
+
316
+ def forward(self, x):
317
+ if self.layer_type == 'none':
318
+ return x
319
+ else:
320
+ return F.interpolate(x, scale_factor=2, mode='nearest')
321
+
322
+ class AdainResBlk1d(nn.Module):
323
+ def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
324
+ upsample='none', dropout_p=0.0):
325
+ super().__init__()
326
+ self.actv = actv
327
+ self.upsample_type = upsample
328
+ self.upsample = UpSample1d(upsample)
329
+ self.learned_sc = dim_in != dim_out
330
+ self._build_weights(dim_in, dim_out, style_dim)
331
+ self.dropout = nn.Dropout(dropout_p)
332
+
333
+ if upsample == 'none':
334
+ self.pool = nn.Identity()
335
+ else:
336
+ self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
337
+
338
+
339
+ def _build_weights(self, dim_in, dim_out, style_dim):
340
+ self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
341
+ self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
342
+ self.norm1 = AdaIN1d(style_dim, dim_in)
343
+ self.norm2 = AdaIN1d(style_dim, dim_out)
344
+ if self.learned_sc:
345
+ self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
346
+
347
+ def _shortcut(self, x):
348
+ x = self.upsample(x)
349
+ if self.learned_sc:
350
+ x = self.conv1x1(x)
351
+ return x
352
+
353
+ def _residual(self, x, s):
354
+ x = self.norm1(x, s)
355
+ x = self.actv(x)
356
+ x = self.pool(x)
357
+ x = self.conv1(self.dropout(x))
358
+ x = self.norm2(x, s)
359
+ x = self.actv(x)
360
+ x = self.conv2(self.dropout(x))
361
+ return x
362
+
363
+ def forward(self, x, s):
364
+ out = self._residual(x, s)
365
+ out = (out + self._shortcut(x)) / math.sqrt(2)
366
+ return out
367
+
368
+ class AdaLayerNorm(nn.Module):
369
+ def __init__(self, style_dim, channels, eps=1e-5):
370
+ super().__init__()
371
+ self.channels = channels
372
+ self.eps = eps
373
+
374
+ self.fc = nn.Linear(style_dim, channels*2)
375
+
376
+ def forward(self, x, s):
377
+ x = x.transpose(-1, -2)
378
+ x = x.transpose(1, -1)
379
+
380
+ h = self.fc(s)
381
+ h = h.view(h.size(0), h.size(1), 1)
382
+ gamma, beta = torch.chunk(h, chunks=2, dim=1)
383
+ gamma, beta = gamma.transpose(1, -1), beta.transpose(1, -1)
384
+
385
+
386
+ x = F.layer_norm(x, (self.channels,), eps=self.eps)
387
+ x = (1 + gamma) * x + beta
388
+ return x.transpose(1, -1).transpose(-1, -2)
389
+
390
+ class ProsodyPredictor(nn.Module):
391
+
392
+ def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
393
+ super().__init__()
394
+
395
+ self.text_encoder = DurationEncoder(sty_dim=style_dim,
396
+ d_model=d_hid,
397
+ nlayers=nlayers,
398
+ dropout=dropout)
399
+
400
+ self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
401
+ self.duration_proj = LinearNorm(d_hid, max_dur)
402
+
403
+ self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
404
+ self.F0 = nn.ModuleList()
405
+ self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
406
+ self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
407
+ self.F0.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
408
+
409
+ self.N = nn.ModuleList()
410
+ self.N.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
411
+ self.N.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
412
+ self.N.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
413
+
414
+ self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
415
+ self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
416
+
417
+
418
+ def forward(self, texts, style, text_lengths, alignment, m):
419
+ d = self.text_encoder(texts, style, text_lengths, m)
420
+
421
+ batch_size = d.shape[0]
422
+ text_size = d.shape[1]
423
+
424
+ # predict duration
425
+ input_lengths = text_lengths.cpu().numpy()
426
+ x = nn.utils.rnn.pack_padded_sequence(
427
+ d, input_lengths, batch_first=True, enforce_sorted=False)
428
+
429
+ m = m.to(text_lengths.device).unsqueeze(1)
430
+
431
+ self.lstm.flatten_parameters()
432
+ x, _ = self.lstm(x)
433
+ x, _ = nn.utils.rnn.pad_packed_sequence(
434
+ x, batch_first=True)
435
+
436
+ x_pad = torch.zeros([x.shape[0], m.shape[-1], x.shape[-1]])
437
+
438
+ x_pad[:, :x.shape[1], :] = x
439
+ x = x_pad.to(x.device)
440
+
441
+ duration = self.duration_proj(nn.functional.dropout(x, 0.5, training=self.training))
442
+
443
+ en = (d.transpose(-1, -2) @ alignment)
444
+
445
+ return duration.squeeze(-1), en
446
+
447
+ def F0Ntrain(self, x, s):
448
+ x, _ = self.shared(x.transpose(-1, -2))
449
+
450
+ F0 = x.transpose(-1, -2)
451
+ for block in self.F0:
452
+ F0 = block(F0, s)
453
+ F0 = self.F0_proj(F0)
454
+
455
+ N = x.transpose(-1, -2)
456
+ for block in self.N:
457
+ N = block(N, s)
458
+ N = self.N_proj(N)
459
+
460
+ return F0.squeeze(1), N.squeeze(1)
461
+
462
+ def length_to_mask(self, lengths):
463
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
464
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
465
+ return mask
466
+
467
+ class DurationEncoder(nn.Module):
468
+
469
+ def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
470
+ super().__init__()
471
+ self.lstms = nn.ModuleList()
472
+ for _ in range(nlayers):
473
+ self.lstms.append(nn.LSTM(d_model + sty_dim,
474
+ d_model // 2,
475
+ num_layers=1,
476
+ batch_first=True,
477
+ bidirectional=True,
478
+ dropout=dropout))
479
+ self.lstms.append(AdaLayerNorm(sty_dim, d_model))
480
+
481
+
482
+ self.dropout = dropout
483
+ self.d_model = d_model
484
+ self.sty_dim = sty_dim
485
+
486
+ def forward(self, x, style, text_lengths, m):
487
+ masks = m.to(text_lengths.device)
488
+
489
+ x = x.permute(2, 0, 1)
490
+ s = style.expand(x.shape[0], x.shape[1], -1)
491
+ x = torch.cat([x, s], axis=-1)
492
+ x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
493
+
494
+ x = x.transpose(0, 1)
495
+ input_lengths = text_lengths.cpu().numpy()
496
+ x = x.transpose(-1, -2)
497
+
498
+ for block in self.lstms:
499
+ if isinstance(block, AdaLayerNorm):
500
+ x = block(x.transpose(-1, -2), style).transpose(-1, -2)
501
+ x = torch.cat([x, s.permute(1, -1, 0)], axis=1)
502
+ x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
503
+ else:
504
+ x = x.transpose(-1, -2)
505
+ x = nn.utils.rnn.pack_padded_sequence(
506
+ x, input_lengths, batch_first=True, enforce_sorted=False)
507
+ block.flatten_parameters()
508
+ x, _ = block(x)
509
+ x, _ = nn.utils.rnn.pad_packed_sequence(
510
+ x, batch_first=True)
511
+ x = F.dropout(x, p=self.dropout, training=self.training)
512
+ x = x.transpose(-1, -2)
513
+
514
+ x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
515
+
516
+ x_pad[:, :, :x.shape[-1]] = x
517
+ x = x_pad.to(x.device)
518
+
519
+ return x.transpose(-1, -2)
520
+
521
+ def inference(self, x, style):
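+ # note: this path references embedding/pos_encoder/transformer_encoder, which are not defined in
+ # this class; it appears to be unused leftover code.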
522
+ x = self.embedding(x.transpose(-1, -2)) * math.sqrt(self.d_model)
523
+ style = style.expand(x.shape[0], x.shape[1], -1)
524
+ x = torch.cat([x, style], axis=-1)
525
+ src = self.pos_encoder(x)
526
+ output = self.transformer_encoder(src).transpose(0, 1)
527
+ return output
528
+
529
+ def length_to_mask(self, lengths):
530
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
531
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
532
+ return mask
reference_audio/1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2660fbd4b0f4119dcf69894c10f1e4d4ce9221d155524d5f8b1720d78ecf492e
3
+ size 96044
reference_audio/2.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ff7ff9e9daca66ea9f325a1fbaacf256109621d475459f53844a429c0c2465d
3
+ size 96044
reference_audio/3.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a7d39beddd2c24d864163ce38e799b261ab0bc23cbea492f0ece046feb131f1
3
+ size 145484
reference_audio/vn_1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f6a0e132a37aa6d28a610eed3daa151309c4a98453d1da1d94d9e88c8438f8c
3
+ size 793166
reference_audio/vn_2.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40e15cabd0f772604a3e6a1d76d1994787974ed359f6c898fdea7685a402773b
3
+ size 1015730
reference_audio/vn_3.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2084ed5953b362d2001aaad812ed519311e967f2927a50eba19a80c727671634
3
+ size 876466
reference_audio/vn_4.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c36531e92a32cf4a64d73aba6b5d3ba7272d5f8d19b284f2b440ed6edbbcee08
3
+ size 353638
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ torch
2
+ torchaudio
3
+ numpy
4
+ PyYAML
5
+ munch
6
+ nltk
7
+ librosa
8
+ noisereduce
9
+ phonemizer
10
+ espeakng-loader
run.ipynb ADDED
The diff for this file is too large to render. See raw diff