|
import os |
|
import os.path as osp |
|
import sys |
|
import time |
|
from collections import defaultdict |
|
|
|
import matplotlib |
|
import numpy as np |
|
import soundfile as sf |
|
import torch |
|
from torch import nn |
|
import jiwer |
|
|
|
import matplotlib.pylab as plt |
|
import functools |
|
import os |
|
import random |
|
import traceback |
|
from pathlib import Path |
|
from typing import Any, Dict, List, Optional, Tuple |
|
|
|
import librosa |
|
import numpy as np |
|
import torch |
|
from einops import rearrange |
|
from scipy import ndimage |
|
from torch.special import gammaln |
|
|
|
|
|
def calc_wer(target, pred, ignore_indexes=(0,)):
    """Compute the word error rate between two index sequences.

    Each sequence is filtered, turned into string tokens, collapsed so that
    runs of identical consecutive tokens become a single token, joined with
    spaces and finally scored with ``jiwer.wer``.

    Args:
        target: Iterable of reference indices.
        pred: Iterable of predicted indices.
        ignore_indexes: Values removed from both sequences before scoring.
            An element is removed when either the raw value or its ``str()``
            form is contained in this collection. Defaults to ``(0,)``.

    Returns:
        Whatever ``jiwer.wer(target_str, pred_str)`` returns.
    """

    def _to_tokens(sequence):
        kept = []
        for index in sequence:
            token = str(index)
            # Compare the raw value as well as the string form: a stringified
            # index can never equal an integer entry of ``ignore_indexes``,
            # so filtering only the strings silently kept every element.
            if index in ignore_indexes or token in ignore_indexes:
                continue
            kept.append(token)
        # ``drop_duplicated`` reads element 0, so bypass it when nothing is left.
        return drop_duplicated(kept) if kept else kept

    target_str = ' '.join(_to_tokens(target))
    pred_str = ' '.join(_to_tokens(pred))
    return jiwer.wer(target_str, pred_str)
|
|
|
def drop_duplicated(chars):
    """Collapse runs of equal consecutive items into a single item.

    Only adjacent repeats are removed; an item that reappears later, after a
    different item, is kept.

    Args:
        chars: Sequence (must support ``len`` and slicing) of comparable items.

    Returns:
        A new list with consecutive duplicates removed. An empty input yields
        an empty list.
    """
    # Guard the empty case explicitly: seeding the result with ``chars[0]``
    # would otherwise raise IndexError.
    if len(chars) == 0:
        return []

    ret_chars = [chars[0]]
    ret_chars.extend(
        curr for prev, curr in zip(chars[:-1], chars[1:]) if prev != curr
    )
    return ret_chars
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_criterion(critic_params=None):
    """Build the loss modules used for training.

    Args:
        critic_params: Optional mapping of per-loss keyword arguments. Only
            the ``'ctc'`` entry is read; it is unpacked into
            ``torch.nn.CTCLoss``. ``None`` (the default) means no overrides.

    Returns:
        Dict with two entries: ``"ce"``, a cross-entropy loss that ignores
        label ``-1``, and ``"ctc"``, a CTC loss.
    """
    # Use a ``None`` sentinel instead of a shared mutable ``{}`` default, and
    # accept an explicit ``None`` from callers without crashing on ``.get``.
    params = {} if critic_params is None else critic_params
    return {
        "ce": nn.CrossEntropyLoss(ignore_index=-1),
        "ctc": torch.nn.CTCLoss(**params.get('ctc', {})),
    }
|
|
|
|
|
|
|
def get_data_path_list(train_path=None, val_path=None):
    """Read the training and validation list files.

    Args:
        train_path: Path of the training list; ``None`` selects
            ``Data/train_list.txt``.
        val_path: Path of the validation list; ``None`` selects
            ``Data/val_list.txt``.

    Returns:
        Tuple ``(train_list, val_list)``; each is the list of lines of the
        corresponding file, line terminators included.
    """
    fallbacks = ("Data/train_list.txt", "Data/val_list.txt")
    requested = (train_path, val_path)

    loaded = []
    # The training file is opened first, then the validation file.
    for given, fallback in zip(requested, fallbacks):
        list_path = fallback if given is None else given
        with open(list_path, 'r') as handle:
            loaded.append(handle.readlines())

    train_list, val_list = loaded
    return train_list, val_list
|
|
|
|
|
def plot_image(image):
    """Draw ``image`` on a new 10x2 figure and return that figure.

    The image is shown with ``imshow`` using automatic aspect ratio, the
    origin in the lower-left corner and no interpolation.

    Args:
        image: Array-like accepted by ``Axes.imshow``.

    Returns:
        The figure created by ``plt.subplots``. Its canvas has been drawn and
        the current pyplot figure has been closed before returning.
    """
    figure, axes = plt.subplots(figsize=(10, 2))
    axes.imshow(image, aspect="auto", origin="lower", interpolation='none')

    # Render the canvas first, then close the current pyplot figure.
    figure.canvas.draw()
    plt.close()

    return figure
|
|
|
|
|
|
|
class PartialConv1d(torch.nn.Conv1d):
    """1-D convolution with partial padding.

    Zero padding creates a unique identifier for where the edge of the data
    is, such that the model can almost always identify exactly where it is
    relative to either edge given a sufficient receptive field. Partial
    padding goes to some lengths to remove this effect: each output position
    is rescaled by ``window size / number of valid inputs under the window``.

    Constructor arguments are those of ``torch.nn.Conv1d``.
    """

    __constants__ = ['slide_winsize']
    slide_winsize: float

    def __init__(self, *args, **kwargs):
        super(PartialConv1d, self).__init__(*args, **kwargs)
        # All-ones kernel used to count valid (unmasked) positions under each
        # window. Non-persistent, so it is not written to the state dict.
        weight_maskUpdater = torch.ones(1, 1, self.kernel_size[0])
        self.register_buffer("weight_maskUpdater", weight_maskUpdater, persistent=False)
        # Number of elements in the counting kernel, i.e. the kernel size.
        self.slide_winsize = self.weight_maskUpdater.shape[1] * self.weight_maskUpdater.shape[2]

    def forward(self, input, mask_in):
        """Apply the convolution and rescale by the fraction of valid inputs.

        Args:
            input: Tensor fed to the convolution; its third axis is treated
                as the length axis.
            mask_in: Validity mask multiplied into ``input``, or ``None`` to
                treat every position as valid. It is convolved with a
                ``(1, 1, kernel_size)`` kernel, so it needs exactly one
                channel.

        Returns:
            The rescaled convolution output. Positions whose window contains
            no valid input are zero.
        """
        if mask_in is None:
            mask = torch.ones(1, 1, input.shape[2], dtype=input.dtype, device=input.device)
        else:
            mask = mask_in
            input = torch.mul(input, mask)

        with torch.no_grad():
            # Count valid positions per window. ``torch.nn.functional`` is
            # referenced through ``torch`` because no ``F`` alias is imported.
            update_mask = torch.nn.functional.conv1d(
                mask,
                self.weight_maskUpdater,
                bias=None,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                groups=1,
            )
            # Replace zero counts before dividing to avoid division by zero;
            # those positions are zeroed again by ``update_mask`` below.
            update_mask_filled = torch.masked_fill(update_mask, update_mask == 0, self.slide_winsize)
            mask_ratio = self.slide_winsize / update_mask_filled
            update_mask = torch.clamp(update_mask, 0, 1)
            mask_ratio = torch.mul(mask_ratio, update_mask)

        raw_out = self._conv_forward(input, self.weight, self.bias)

        if self.bias is not None:
            # Rescale only the weighted sum, not the bias term.
            bias_view = self.bias.view(1, self.out_channels, 1)
            output = torch.mul(raw_out - bias_view, mask_ratio) + bias_view
            output = torch.mul(output, update_mask)
        else:
            output = torch.mul(raw_out, mask_ratio)

        return output
|
|
|
|
|
class LinearNorm(torch.nn.Module):
    """Linear layer whose weight is re-initialised with Xavier-uniform values.

    Args:
        in_dim: Input feature size passed to ``torch.nn.Linear``.
        out_dim: Output feature size passed to ``torch.nn.Linear``.
        bias: Whether the linear layer has a bias term.
        w_init_gain: Nonlinearity name given to
            ``torch.nn.init.calculate_gain`` to obtain the initialisation gain.
    """

    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super().__init__()
        projection = torch.nn.Linear(in_dim, out_dim, bias=bias)
        init_gain = torch.nn.init.calculate_gain(w_init_gain)
        # Only the weight is re-initialised; the bias keeps its default init.
        torch.nn.init.xavier_uniform_(projection.weight, gain=init_gain)
        self.linear_layer = projection

    def forward(self, x):
        """Return ``x`` passed through the wrapped linear layer."""
        return self.linear_layer(x)
|
|
|
|
|
class ConvNorm(torch.nn.Module):
    """1-D convolution with Xavier-uniform weights and optional add-ons.

    Args:
        in_channels: Input channel count of the convolution.
        out_channels: Output channel count of the convolution.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Explicit padding. When ``None`` the kernel size must be odd
            and the padding is set to ``int(dilation * (kernel_size - 1) / 2)``.
        dilation: Convolution dilation.
        bias: Whether the convolution has a bias term.
        w_init_gain: Nonlinearity name given to
            ``torch.nn.init.calculate_gain`` for the weight initialisation.
        use_partial_padding: Use ``PartialConv1d`` instead of
            ``torch.nn.Conv1d``.
        use_weight_norm: Wrap the convolution with
            ``torch.nn.utils.weight_norm``.
        norm_fn: Optional factory called as ``norm_fn(out_channels,
            affine=True)`` to build a normalisation layer.
    """

    __constants__ = ['use_partial_padding']
    use_partial_padding: bool

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=1,
        stride=1,
        padding=None,
        dilation=1,
        bias=True,
        w_init_gain='linear',
        use_partial_padding=False,
        use_weight_norm=False,
        norm_fn=None,
    ):
        super().__init__()
        if padding is None:
            assert kernel_size % 2 == 1
            padding = int(dilation * (kernel_size - 1) / 2)
        self.use_partial_padding = use_partial_padding

        conv_cls = PartialConv1d if use_partial_padding else torch.nn.Conv1d
        conv = conv_cls(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )
        init_gain = torch.nn.init.calculate_gain(w_init_gain)
        torch.nn.init.xavier_uniform_(conv.weight, gain=init_gain)

        self.conv = torch.nn.utils.weight_norm(conv) if use_weight_norm else conv
        self.norm = None if norm_fn is None else norm_fn(out_channels, affine=True)

    def forward(self, signal, mask=None):
        """Convolve ``signal`` and apply the normalisation layer, if any.

        With partial padding the mask is forwarded to both the convolution
        and the normalisation layer. Otherwise a given mask is multiplied
        into ``signal`` first and neither layer receives it.
        """
        if self.use_partial_padding:
            ret = self.conv(signal, mask)
            norm_extra = (mask,)
        else:
            if mask is not None:
                signal = signal.mul(mask)
            ret = self.conv(signal)
            norm_extra = ()

        if self.norm is not None:
            ret = self.norm(ret, *norm_extra)
        return ret
|
|
|
|
|
|
|
class BetaBinomialInterpolator:
    """Alignment-prior lookup backed by a cache and image interpolation.

    Prior matrices (based on the beta-binomial distribution) are computed only
    for sizes rounded to configurable multiples, kept in an LRU cache, and
    resized to the requested size with linear interpolation.
    The implementation is taken from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py

    Args:
        round_mel_len_to: Multiple to which the first size argument is rounded.
        round_text_len_to: Multiple to which the second size argument is rounded.
        cache_size: Maximum number of cached prior matrices.
        scaling_factor: Forwarded to ``beta_binomial_prior_distribution``.
    """

    def __init__(self, round_mel_len_to=50, round_text_len_to=10, cache_size=500, scaling_factor: float = 1.0):
        self.round_mel_len_to = round_mel_len_to
        self.round_text_len_to = round_text_len_to

        @functools.lru_cache(maxsize=cache_size)
        def _cached_prior(x, y):
            return beta_binomial_prior_distribution(x, y, scaling_factor=scaling_factor)

        self.bank = _cached_prior

    @staticmethod
    def round(val, to):
        """Round ``val + 1`` to a multiple of ``to``, never below ``to`` itself."""
        multiples = int(np.round((val + 1) / to))
        return max(1, multiples) * to

    def __call__(self, w, h):
        """Return a prior matrix of shape ``(w, h)``.

        The cached matrix for the rounded sizes is transposed and zoomed by
        ``(w / bw, h / bh)`` with first-order interpolation; the result shape
        is asserted to match the request.
        """
        rounded_w = BetaBinomialInterpolator.round(w, to=self.round_mel_len_to)
        rounded_h = BetaBinomialInterpolator.round(h, to=self.round_text_len_to)
        cached = self.bank(rounded_w, rounded_h)
        scale = (w / rounded_w, h / rounded_h)
        ret = ndimage.zoom(cached.T, zoom=scale, order=1)
        assert ret.shape[0] == w, ret.shape
        assert ret.shape[1] == h, ret.shape
        return ret
|
|
|
|
|
def general_padding(item, item_len, max_len, pad_value=0):
    """Pad the end of the last axis of ``item`` when it is shorter than ``max_len``.

    Args:
        item: Tensor to pad.
        item_len: Current length as reported by the caller; it is not checked
            against ``item``'s actual shape.
        max_len: Desired length.
        pad_value: Fill value for the added entries.

    Returns:
        ``item`` itself when ``item_len`` is not smaller than ``max_len``,
        otherwise a tensor with ``max_len - item_len`` entries appended to the
        last axis.
    """
    missing = max_len - item_len
    if missing <= 0:
        return item
    return torch.nn.functional.pad(item, (0, missing), value=pad_value)
|
|
|
|
|
def stack_tensors(tensors: List[torch.Tensor], max_lens: List[int], pad_value: float = 0.0) -> torch.Tensor:
    """
    Pad every tensor to a common size and stack them into one batch tensor.

    Args:
        tensors: List of tensors to pad and stack
        max_lens: Target lengths, one per padded axis, starting with the last axis
        pad_value: Value written into the padded positions

    Returns:
        Tensor with a new leading axis holding the padded inputs.
    """

    def _pad_to_max(tensor):
        # ``pad`` expects (left, right) pairs starting from the last axis;
        # padding is only ever added on the right.
        pad_spec = []
        for axis_from_end, target_len in enumerate(max_lens, 1):
            pad_spec.extend((0, target_len - tensor.shape[-axis_from_end]))
        return torch.nn.functional.pad(tensor, pad=pad_spec, value=pad_value)

    return torch.stack([_pad_to_max(tensor) for tensor in tensors])
|
|
|
|
|
def logbeta(x, y):
    """Return the log of the beta function, ``lgamma(x) + lgamma(y) - lgamma(x + y)``."""
    log_numerator = gammaln(x) + gammaln(y)
    return log_numerator - gammaln(x + y)
|
|
|
|
|
def logcombinations(n, k):
    """Return the log of the binomial coefficient "n choose k" via log-gamma."""
    log_n_factorial = gammaln(n + 1)
    log_k_factorial = gammaln(k + 1)
    return log_n_factorial - log_k_factorial - gammaln(n - k + 1)
|
|
|
|
|
def logbetabinom(n, a, b, x):
    """Return the log-probability of ``x`` under a beta-binomial distribution.

    Args:
        n: Number of trials.
        a: First shape parameter of the beta prior.
        b: Second shape parameter of the beta prior.
        x: Outcome(s) at which the log-probability is evaluated.
    """
    log_choose = logcombinations(n, x)
    log_posterior_norm = logbeta(x + a, n - x + b)
    log_prior_norm = logbeta(a, b)
    return log_choose + log_posterior_norm - log_prior_norm
|
|
|
|
|
def beta_binomial_prior_distribution(phoneme_count: int, mel_count: int, scaling_factor: float = 1.0) -> np.ndarray:
    """Build a beta-binomial alignment prior.

    For every row index ``y`` in ``1..mel_count`` a beta-binomial distribution
    with ``n = phoneme_count - 1`` trials and shape parameters
    ``a = scaling_factor * y`` and ``b = scaling_factor * (mel_count + 1 - y)``
    is evaluated at the column positions ``0..phoneme_count - 1``.

    Args:
        phoneme_count: Number of columns of the result.
        mel_count: Number of rows of the result.
        scaling_factor: Multiplier applied to both shape parameters.

    Returns:
        NumPy array of shape ``(mel_count, phoneme_count)`` holding the
        probabilities (the exponentiated log-probabilities).
    """
    # Row vector of column positions and column vector of row indices, so the
    # element-wise log-probability broadcasts to (mel_count, phoneme_count).
    x = rearrange(torch.arange(0, phoneme_count), "b -> 1 b")
    y = rearrange(torch.arange(1, mel_count + 1), "b -> b 1")
    a = scaling_factor * y
    b = scaling_factor * (mel_count + 1 - y)
    n = torch.FloatTensor([phoneme_count - 1])

    return logbetabinom(n, a, b, x).exp().numpy()
|
|
|
|
|
|
|
|