Commit
·
d57f9a7
1
Parent(s):
e059497
More duration normalization
Browse files- .gitignore +5 -2
- inference.py +25 -8
- run.ipynb +0 -0
.gitignore
CHANGED
@@ -1,5 +1,8 @@
|
|
1 |
-
__pycache__/inference.cpython-311.pyc
|
2 |
-
__pycache__/models.cpython-311.pyc
|
3 |
Modules/__pycache__/__init__.cpython-311.pyc
|
4 |
Modules/__pycache__/hifigan.cpython-311.pyc
|
5 |
Modules/__pycache__/utils.cpython-311.pyc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
Modules/__pycache__/__init__.cpython-311.pyc
|
2 |
Modules/__pycache__/hifigan.cpython-311.pyc
|
3 |
Modules/__pycache__/utils.cpython-311.pyc
|
4 |
+
Modules/__pycache__/__init__.cpython-311.pyc
|
5 |
+
Modules/__pycache__/hifigan.cpython-311.pyc
|
6 |
+
Modules/__pycache__/utils.cpython-311.pyc
|
7 |
+
__pycache__/inference.cpython-311.pyc
|
8 |
+
__pycache__/models.cpython-311.pyc
|
inference.py
CHANGED
@@ -149,6 +149,22 @@ class StyleTTS2(torch.nn.Module):
|
|
149 |
def replacement(match):
|
150 |
return next(replacement_iter)
|
151 |
return replacement
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
|
153 |
def __load_models(self, models_path):
|
154 |
module_params = []
|
@@ -180,7 +196,7 @@ class StyleTTS2(torch.nn.Module):
|
|
180 |
device = self.get_device.device
|
181 |
denoise = min(denoise, 1)
|
182 |
if split_dur != 0: split_dur = max(int(split_dur), 1)
|
183 |
-
max_samples = 24000*
|
184 |
print("Computing the style for:", path)
|
185 |
|
186 |
wave, sr = librosa.load(path, sr=24000)
|
@@ -248,11 +264,12 @@ class StyleTTS2(torch.nn.Module):
|
|
248 |
duration = self.predictor.duration_proj(x) / speed
|
249 |
duration = torch.sigmoid(duration).sum(axis=-1)
|
250 |
|
251 |
-
if prev_d_mean != 0:#Stabilize speaking speed
|
252 |
dur_stats = torch.empty(duration.shape).normal_(mean=prev_d_mean, std=duration.std()).to(device)
|
253 |
else:
|
254 |
dur_stats = torch.empty(duration.shape).normal_(mean=duration.mean(), std=duration.std()).to(device)
|
255 |
duration = duration*(1-t) + dur_stats*t
|
|
|
256 |
|
257 |
pred_dur = torch.round(duration.squeeze()).clamp(min=1)
|
258 |
pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
|
@@ -272,7 +289,7 @@ class StyleTTS2(torch.nn.Module):
|
|
272 |
return out.squeeze().cpu().numpy(), duration.mean()
|
273 |
|
274 |
def get_styles(self, speakers, denoise=0.3, avg_style=True):
|
275 |
-
if avg_style: split_dur =
|
276 |
else: split_dur = 0
|
277 |
styles = {}
|
278 |
for id in speakers:
|
@@ -285,9 +302,9 @@ class StyleTTS2(torch.nn.Module):
|
|
285 |
}
|
286 |
return styles
|
287 |
|
288 |
-
def generate(self, text, styles, stabilize=
|
289 |
-
if stabilize:
|
290 |
-
else:
|
291 |
|
292 |
list_wav = []
|
293 |
prev_d_mean = 0
|
@@ -342,10 +359,10 @@ class StyleTTS2(torch.nn.Module):
|
|
342 |
phonem = espeak_phn(sentence, styles[speaker_id]['lang'])
|
343 |
phonem = re.sub(lang_pattern, replacement_func, phonem)
|
344 |
|
345 |
-
wav, prev_d_mean = self.__inference(phonem, current_ref_s, speed=speed, prev_d_mean=prev_d_mean, t=
|
346 |
wav = wav[4000:-4000] #Remove weird pulse and silent tokens
|
347 |
list_wav.append(wav)
|
348 |
|
349 |
final_wav = np.concatenate(list_wav)
|
350 |
-
final_wav = np.concatenate([np.zeros([
|
351 |
return final_wav
|
|
|
149 |
def replacement(match):
|
150 |
return next(replacement_iter)
|
151 |
return replacement
|
152 |
+
|
153 |
+
def __replace_outliers_zscore(self, tensor, threshold=3.0, factor=0.95):
    """Pull z-score outliers of ``tensor`` back toward its mean.

    The mean and standard deviation are computed over the whole tensor.
    Every element whose absolute z-score exceeds ``threshold`` is replaced
    by ``mean +/- threshold * std * factor``, keeping the element on the
    side of the mean it was already on. All other elements are returned
    as they were, and the input tensor itself is not modified.

    Args:
        tensor: Floating-point tensor to clean up.
        threshold: Absolute z-score above which an element is an outlier.
        factor: Scale applied to ``threshold * std`` for the replacement
            value.

    Returns:
        A new tensor of the same shape with outliers replaced.
    """
    center = tensor.mean()
    spread = tensor.std()
    deviation = tensor - center

    # Elements lying more than `threshold` standard deviations from the mean.
    is_outlier = torch.abs(deviation / spread) > threshold

    # Candidate value for each element, on the same side of the mean.
    clamped = center + torch.sign(deviation) * (threshold * spread * factor)

    # Take the candidate only where the element was flagged as an outlier.
    return torch.where(is_outlier, clamped, tensor)
|
168 |
|
169 |
def __load_models(self, models_path):
|
170 |
module_params = []
|
|
|
196 |
device = self.get_device.device
|
197 |
denoise = min(denoise, 1)
|
198 |
if split_dur != 0: split_dur = max(int(split_dur), 1)
|
199 |
+
max_samples = 24000*20 #max 20 seconds ref audio
|
200 |
print("Computing the style for:", path)
|
201 |
|
202 |
wave, sr = librosa.load(path, sr=24000)
|
|
|
264 |
duration = self.predictor.duration_proj(x) / speed
|
265 |
duration = torch.sigmoid(duration).sum(axis=-1)
|
266 |
|
267 |
+
if prev_d_mean != 0:#Stabilize speaking speed between splits
|
268 |
dur_stats = torch.empty(duration.shape).normal_(mean=prev_d_mean, std=duration.std()).to(device)
|
269 |
else:
|
270 |
dur_stats = torch.empty(duration.shape).normal_(mean=duration.mean(), std=duration.std()).to(device)
|
271 |
duration = duration*(1-t) + dur_stats*t
|
272 |
+
duration[:,1:-2] = self.__replace_outliers_zscore(duration[:,1:-2]) #Normalize outlier
|
273 |
|
274 |
pred_dur = torch.round(duration.squeeze()).clamp(min=1)
|
275 |
pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
|
|
|
289 |
return out.squeeze().cpu().numpy(), duration.mean()
|
290 |
|
291 |
def get_styles(self, speakers, denoise=0.3, avg_style=True):
|
292 |
+
if avg_style: split_dur = 2
|
293 |
else: split_dur = 0
|
294 |
styles = {}
|
295 |
for id in speakers:
|
|
|
302 |
}
|
303 |
return styles
|
304 |
|
305 |
+
def generate(self, text, styles, stabilize=True, n_merge=16, default_speaker= "[id_1]"):
|
306 |
+
if stabilize: smooth_value=0.2
|
307 |
+
else: smooth_value=0
|
308 |
|
309 |
list_wav = []
|
310 |
prev_d_mean = 0
|
|
|
359 |
phonem = espeak_phn(sentence, styles[speaker_id]['lang'])
|
360 |
phonem = re.sub(lang_pattern, replacement_func, phonem)
|
361 |
|
362 |
+
wav, prev_d_mean = self.__inference(phonem, current_ref_s, speed=speed, prev_d_mean=prev_d_mean, t=smooth_value)
|
363 |
wav = wav[4000:-4000] #Remove weird pulse and silent tokens
|
364 |
list_wav.append(wav)
|
365 |
|
366 |
final_wav = np.concatenate(list_wav)
|
367 |
+
final_wav = np.concatenate([np.zeros([4000]), final_wav, np.zeros([4000])], axis=0) # add padding
|
368 |
return final_wav
|
run.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|