dangtr0408 committed
Commit d57f9a7 · 1 Parent(s): e059497

More duration normalization

Files changed (3)
  1. .gitignore +5 -2
  2. inference.py +25 -8
  3. run.ipynb +0 -0
.gitignore CHANGED

```diff
@@ -1,5 +1,8 @@
-__pycache__/inference.cpython-311.pyc
-__pycache__/models.cpython-311.pyc
 Modules/__pycache__/__init__.cpython-311.pyc
 Modules/__pycache__/hifigan.cpython-311.pyc
 Modules/__pycache__/utils.cpython-311.pyc
+Modules/__pycache__/__init__.cpython-311.pyc
+Modules/__pycache__/hifigan.cpython-311.pyc
+Modules/__pycache__/utils.cpython-311.pyc
+__pycache__/inference.cpython-311.pyc
+__pycache__/models.cpython-311.pyc
```
inference.py CHANGED
```diff
@@ -149,6 +149,22 @@ class StyleTTS2(torch.nn.Module):
         def replacement(match):
             return next(replacement_iter)
         return replacement
+
+    def __replace_outliers_zscore(self, tensor, threshold=3.0, factor=0.95):
+        mean = tensor.mean()
+        std = tensor.std()
+        z = (tensor - mean) / std
+
+        # Identify outliers
+        outlier_mask = torch.abs(z) > threshold
+        # Compute replacement value, respecting sign
+        sign = torch.sign(tensor - mean)
+        replacement = mean + sign * (threshold * std * factor)
+
+        result = tensor.clone()
+        result[outlier_mask] = replacement[outlier_mask]
+
+        return result
 
     def __load_models(self, models_path):
         module_params = []
```
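The new `__replace_outliers_zscore` helper is standard z-score clipping: any value more than `threshold` standard deviations from the mean is pulled back to just inside the boundary, on the same side of the mean it started from (`factor=0.95` places replacements slightly inside the threshold rather than exactly on it). A minimal standalone sketch of the same technique; the function name and demo values below are illustrative, not from the repo:

```python
import torch

def replace_outliers_zscore(tensor, threshold=3.0, factor=0.95):
    # Standardize: how many standard deviations is each value from the mean?
    mean, std = tensor.mean(), tensor.std()
    z = (tensor - mean) / std
    # Pull outliers back to the boundary (threshold * std * factor),
    # keeping the sign of the original deviation.
    sign = torch.sign(tensor - mean)
    clamped = mean + sign * (threshold * std * factor)
    return torch.where(z.abs() > threshold, clamped, tensor)

# One extreme duration is pulled toward the rest; the others pass through.
d = torch.tensor([1.0, 1.2, 0.9, 1.1, 9.0])
print(replace_outliers_zscore(d, threshold=1.5))
```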
```diff
@@ -180,7 +196,7 @@ class StyleTTS2(torch.nn.Module):
         device = self.get_device.device
         denoise = min(denoise, 1)
         if split_dur != 0: split_dur = max(int(split_dur), 1)
-        max_samples = 24000*30 #max 30 seconds ref audio
+        max_samples = 24000*20 #max 20 seconds ref audio
         print("Computing the style for:", path)
 
         wave, sr = librosa.load(path, sr=24000)
```
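The reference-audio cap drops from 30 to 20 seconds; at the 24 kHz rate `librosa.load` resamples to above, that is 480,000 samples. A hedged sketch of how such a cap is typically enforced; the truncation below is an assumption, not necessarily this repo's exact handling:

```python
import librosa

SR = 24000
max_samples = SR * 20  # cap reference audio at 20 s (480,000 samples)

wave, sr = librosa.load("reference.wav", sr=SR)  # hypothetical path
if len(wave) > max_samples:
    wave = wave[:max_samples]  # keep only the first 20 seconds
```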
```diff
@@ -248,11 +264,12 @@ class StyleTTS2(torch.nn.Module):
         duration = self.predictor.duration_proj(x) / speed
         duration = torch.sigmoid(duration).sum(axis=-1)
 
-        if prev_d_mean != 0:#Stabilize speaking speed
+        if prev_d_mean != 0:#Stabilize speaking speed between splits
             dur_stats = torch.empty(duration.shape).normal_(mean=prev_d_mean, std=duration.std()).to(device)
         else:
             dur_stats = torch.empty(duration.shape).normal_(mean=duration.mean(), std=duration.std()).to(device)
         duration = duration*(1-t) + dur_stats*t
+        duration[:,1:-2] = self.__replace_outliers_zscore(duration[:,1:-2]) #Normalize outlier
 
         pred_dur = torch.round(duration.squeeze()).clamp(min=1)
         pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
```
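This hunk is the heart of the commit. Predicted per-token durations are blended with a tensor sampled from a normal distribution centered on the previous split's mean duration (or the current split's own mean on the first split), so speaking speed stays consistent across splits; `t` controls the blend. The added line then clips z-score outliers on the interior tokens only (`[:,1:-2]`), leaving boundary tokens untouched. A simplified runnable sketch of the blend, with illustrative names and values:

```python
import torch

def smooth_durations(duration, prev_d_mean=0.0, t=0.2):
    # Sample per-token stats around the previous split's mean duration
    # (or this split's own mean on the first split), then interpolate:
    # t=0 keeps the raw prediction, t=1 uses only the sampled stats.
    mean = prev_d_mean if prev_d_mean != 0 else float(duration.mean())
    dur_stats = torch.empty_like(duration).normal_(mean=mean, std=float(duration.std()))
    return duration * (1 - t) + dur_stats * t

pred = torch.tensor([[3.0, 4.0, 12.0, 3.5, 4.2]])      # one token far too long
print(smooth_durations(pred, prev_d_mean=4.0, t=0.2))  # 80% prediction, 20% stats
```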
```diff
@@ -272,7 +289,7 @@ class StyleTTS2(torch.nn.Module):
         return out.squeeze().cpu().numpy(), duration.mean()
 
     def get_styles(self, speakers, denoise=0.3, avg_style=True):
-        if avg_style: split_dur = 3
+        if avg_style: split_dur = 2
         else: split_dur = 0
         styles = {}
         for id in speakers:
```
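`get_styles` now averages styles over 2-second rather than 3-second windows of the reference audio when `avg_style` is set. The averaging itself is not visible in this diff; a rough sketch of the general pattern, with `compute_style` standing in for a hypothetical per-chunk style encoder:

```python
import numpy as np

def average_style(wave, compute_style, sr=24000, split_dur=2):
    # Encode each split_dur-second chunk separately, then average the
    # embeddings; a shorter final chunk is included as-is.
    chunk = sr * split_dur
    embeddings = [compute_style(wave[i:i + chunk])
                  for i in range(0, len(wave), chunk)]
    return np.mean(embeddings, axis=0)
```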
```diff
@@ -285,9 +302,9 @@ class StyleTTS2(torch.nn.Module):
             }
         return styles
 
-    def generate(self, text, styles, stabilize=False, n_merge=14, default_speaker= "[id_1]"):
-        if stabilize: smooth_dur=0.2
-        else: smooth_dur=0
+    def generate(self, text, styles, stabilize=True, n_merge=16, default_speaker= "[id_1]"):
+        if stabilize: smooth_value=0.2
+        else: smooth_value=0
 
         list_wav = []
         prev_d_mean = 0
```
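With the new defaults, callers get duration stabilization without opting in: `stabilize=True` feeds `smooth_value=0.2` into `__inference` as `t`, and `n_merge` rises from 14 to 16. A hypothetical call, assuming a constructed model and a `styles` dict from `get_styles` (both elided here):

```python
# Hypothetical usage; `model` and `styles` are assumed to already exist.
wav = model.generate(
    "[id_1] Duration smoothing is now on by default.",
    styles,           # as returned by model.get_styles(...)
    stabilize=True,   # t=0.2 blend in __inference; set False to disable
    n_merge=16,
)
```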
```diff
@@ -342,10 +359,10 @@ class StyleTTS2(torch.nn.Module):
             phonem = espeak_phn(sentence, styles[speaker_id]['lang'])
             phonem = re.sub(lang_pattern, replacement_func, phonem)
 
-            wav, prev_d_mean = self.__inference(phonem, current_ref_s, speed=speed, prev_d_mean=prev_d_mean, t=smooth_dur)
+            wav, prev_d_mean = self.__inference(phonem, current_ref_s, speed=speed, prev_d_mean=prev_d_mean, t=smooth_value)
             wav = wav[4000:-4000] #Remove weird pulse and silent tokens
             list_wav.append(wav)
 
         final_wav = np.concatenate(list_wav)
-        final_wav = np.concatenate([np.zeros([12000]), final_wav, np.zeros([12000])], axis=0) # 0.5 second padding
+        final_wav = np.concatenate([np.zeros([4000]), final_wav, np.zeros([4000])], axis=0) # add padding
         return final_wav
```
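The surrounding silence shrinks from 12,000 samples per side (0.5 s at 24 kHz) to 4,000 (~0.17 s), while each segment still loses 4,000 samples from both ends to cut the "weird pulse". A minimal sketch of the trim-and-pad concatenation with stand-in waveforms:

```python
import numpy as np

SR, PAD = 24000, 4000  # 4,000 samples ≈ 0.17 s of silence per side

segments = [np.random.randn(SR), np.random.randn(SR)]  # stand-in 1 s segments
trimmed = [w[4000:-4000] for w in segments]            # drop edge artifacts
final_wav = np.concatenate([np.zeros(PAD), *trimmed, np.zeros(PAD)])
```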
 
run.ipynb CHANGED
The diff for this file is too large to render.