import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from librosa.filters import mel as librosa_mel_fn
from torch.nn import Conv1d
from torch.nn.utils import weight_norm, remove_weight_norm
|
|
|
|
|
def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)
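# e.g. get_padding(3, dilation=5) == 5: with that padding a stride-1 dilated conv keeps
# the sequence length unchanged, which the ResBlockMRFV2 blocks below rely on.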
|
|
|
|
|
class Upsample(nn.Module):
    def __init__(self, mult, r):
        super(Upsample, self).__init__()
        self.r = r
        # Branch 1: nearest-neighbour upsampling by r followed by a stride-1 conv.
        self.upsample = nn.Sequential(nn.Upsample(mode="nearest", scale_factor=r),
                                      nn.LeakyReLU(0.2),
                                      nn.ReflectionPad1d(3),
                                      nn.utils.weight_norm(nn.Conv1d(mult, mult // 2, kernel_size=7, stride=1))
                                      )
        r_kernel = r if r >= 5 else 5
        # Branch 2: transposed convolution whose output length matches branch 1.
        self.trans_upsample = nn.Sequential(nn.LeakyReLU(0.2),
                                            nn.utils.weight_norm(nn.ConvTranspose1d(mult, mult // 2,
                                                                                    kernel_size=r_kernel * 2, stride=r,
                                                                                    padding=r_kernel - r // 2,
                                                                                    output_padding=r % 2)
                                                                 ))

    def forward(self, x):
        x = torch.sin(x) + x
        out1 = self.upsample(x)
        out2 = self.trans_upsample(x)
        return out1 + out2
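# Shape check (illustrative numbers, not a training configuration): with mult=256 and
# r=5 both branches of Upsample map (B, 256, T) to (B, 128, 5*T), so they can be summed:
#   nearest branch:    T -> 5*T (upsample) -> 5*T + 6 (pad 3) -> 5*T (k=7, stride-1 conv)
#   transposed branch: (T - 1)*5 - 2*3 + 10 + 1 = 5*T   (k=10, stride=5, pad=3, out_pad=1)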
|
|
|
|
|
class Downsample(nn.Module):
    def __init__(self, mult, r):
        super(Downsample, self).__init__()
        self.r = r
        r_kernel = r if r >= 5 else 5
        self.trans_downsample = nn.Sequential(nn.LeakyReLU(0.2),
                                              nn.utils.weight_norm(nn.Conv1d(mult, mult * 2,
                                                                             kernel_size=r_kernel * 2, stride=r,
                                                                             padding=r_kernel - r // 2)
                                                                   ))

    def forward(self, x):
        out = self.trans_downsample(x)
        return out
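# Shape check (illustrative): with mult=32 and r=5, Downsample maps (B, 32, 5*T) to
# (B, 64, T), since floor((5*T + 2*3 - 10) / 5) + 1 = T for the k=10, stride=5, pad=3 conv.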
|
|
|
|
|
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find("BatchNorm2d") != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)


def weights_zero_init(m):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.fill_(0.0)
        m.bias.data.fill_(0.0)
|
|
|
|
|
def WNConv1d(*args, **kwargs):
    return weight_norm(nn.Conv1d(*args, **kwargs))


def WNConvTranspose1d(*args, **kwargs):
    return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
|
|
|
|
|
class Audio2Mel(nn.Module):
    def __init__(
            self,
            hop_length=300,
            sampling_rate=24000,
            n_mel_channels=80,
            mel_fmin=0.,
            mel_fmax=None,
            frame_size=0.05,
            device='cpu'
    ):
        super().__init__()
        # FFT size: the smallest power of two covering one analysis frame.
        self.n_fft = int(np.power(2., np.ceil(np.log(sampling_rate * frame_size) / np.log(2))))
        window = torch.hann_window(int(sampling_rate * frame_size)).float()
        # Keyword arguments keep this call compatible with newer librosa releases.
        mel_basis = librosa_mel_fn(
            sr=sampling_rate, n_fft=self.n_fft, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax
        )
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer("mel_basis", mel_basis)
        self.register_buffer("window", window)

        self.hop_length = hop_length
        self.win_length = int(sampling_rate * frame_size)
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels

    def forward(self, audio):
        # audio: (batch, 1, samples)
        fft = torch.stft(
            audio.squeeze(1),
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=self.window,
            center=True,
            return_complex=True,  # required by recent PyTorch versions
        )
        real_part, imag_part = fft.real, fft.imag
        magnitude = torch.sqrt(torch.clamp(real_part ** 2 + imag_part ** 2, min=1e-5))
        mel_output = torch.matmul(self.mel_basis, magnitude)

        # Convert to dB, then normalise and clamp to [-4, 4].
        log_mel_spec = 20 * torch.log10(torch.clamp(mel_output, min=1e-5)) - 20
        norm_mel = (log_mel_spec + 115.) / 115.
        mel_comp = torch.clamp(norm_mel * 8. - 4., -4., 4.)

        return mel_comp
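# Minimal usage sketch for Audio2Mel (the batch size and one-second clip below are
# illustrative only; the defaults assume 24 kHz mono audio shaped (batch, 1, samples)):
#
#   mel_fn = Audio2Mel(hop_length=300, sampling_rate=24000, n_mel_channels=80)
#   audio = torch.randn(4, 1, 24000)
#   mel = mel_fn(audio)    # (4, 80, 81): 24000 // 300 + 1 frames, values clamped to [-4, 4]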
|
|
|
|
|
class ResnetBlock(nn.Module):
    def __init__(self, dim, dilation=1, dim_in=None):
        super().__init__()
        if dim_in is None:
            dim_in = dim

        self.block = nn.Sequential(
            nn.LeakyReLU(0.2),
            nn.ReflectionPad1d(dilation),
            WNConv1d(dim_in, dim, kernel_size=3, dilation=dilation),
            nn.LeakyReLU(0.2),
            WNConv1d(dim, dim, kernel_size=1),
        )
        self.shortcut = WNConv1d(dim_in, dim, kernel_size=1)

    def forward(self, x):
        return self.shortcut(x) + self.block(x)
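# Note: ReflectionPad1d(dilation) followed by a k=3 conv with that same dilation adds and
# then removes 2*dilation samples, so ResnetBlock preserves the sequence length.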
|
|
|
|
|
'''
Follows the HiFi-GAN v2 structure (https://arxiv.org/pdf/2010.05646.pdf).
The multi-scale design comes from differing kernel sizes: three parallel convolution
stacks, each using its own sequence of dilation sizes internally, interleaved with
plain (dilation-free) convolution layers.
'''
|
|
|
|
|
class ResBlockMRFV2(torch.nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlockMRFV2, self).__init__()
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
                               padding=get_padding(kernel_size, dilation[2])))
        ])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
        ])
        self.convs2.apply(init_weights)

    def forward(self, x):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, 0.2)
            xt = c1(xt)
            xt = F.leaky_relu(xt, 0.2)
            xt = c2(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)
|
|
|
|
|
class ResBlockMRFV2Inter(torch.nn.Module):
    def __init__(self, channels, kernel_size=3):
        super(ResBlockMRFV2Inter, self).__init__()
        # Three parallel stacks with kernel sizes 3, 7 and 11, averaged in forward().
        self.block1 = ResBlockMRFV2(channels)
        self.block2 = ResBlockMRFV2(channels, 7)
        self.block3 = ResBlockMRFV2(channels, 11)

    def forward(self, x):
        xs = self.block1(x)
        xs += self.block2(x)
        xs += self.block3(x)
        x = xs / 3
        return x
|
|
|
|
|
class Generator(nn.Module):
    def __init__(self, input_size_, ngf, n_residual_layers, num_band, args, ratios=[5, 5, 4, 3], onnx_export=False,
                 device='cpu'):
        super().__init__()
        self.hop_length = args.frame_shift
        self.args = args
        self.onnx_export = onnx_export

        mult = int(2 ** len(ratios))
        model_up = []
        input_size = input_size_
        model_up += [
            nn.ReflectionPad1d(3),
            WNConv1d(input_size, mult * ngf, kernel_size=7, padding=0),
        ]

        # Each stage upsamples by one ratio, halves the channel count and applies a
        # multi-receptive-field fusion block.
        for i, r in enumerate(ratios):
            model_up += [Upsample(mult * ngf, r)]
            model_up += [ResBlockMRFV2Inter(mult * ngf // 2)]
            mult //= 2

        model_up += [
            nn.LeakyReLU(0.2),
            nn.ReflectionPad1d(3),
            WNConv1d(ngf, num_band, kernel_size=7, padding=0),
            nn.Tanh(),
        ]
        # When tanh output is disabled, swap the final Tanh for a plain 1x1 conv and
        # zero-initialise the preceding output convolution.
        if not args.use_tanh:
            model_up[-1] = nn.Conv1d(num_band, num_band, 1)
            model_up[-2].apply(weights_zero_init)

        self.model_up = nn.Sequential(*model_up)

        self.apply(weights_init)

    def forward(self, mel, step=None):
        # mel: (batch, n_mel_channels, frames); for ONNX export it arrives transposed.
        if self.onnx_export:
            mel = mel.transpose(1, 2)

        x = mel

        cnt_after_upsample = 0

        for i, m in enumerate(self.model_up):
            x = m(x)
            if type(m) == Upsample:
                cnt_after_upsample += 1

        return x
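

if __name__ == "__main__":
    # Smoke-test sketch with illustrative values: the 80-bin mel input, ngf=32, a single
    # output band and ratios [5, 5, 4, 3] (a 300-sample hop) are assumptions, not the
    # training configuration, and SimpleNamespace stands in for the real args object.
    from types import SimpleNamespace

    hparams = SimpleNamespace(frame_shift=300, use_tanh=True)
    net = Generator(input_size_=80, ngf=32, n_residual_layers=0, num_band=1,
                    args=hparams, ratios=[5, 5, 4, 3])
    mel = torch.randn(2, 80, 50)   # (batch, n_mel_channels, frames)
    wav = net(mel)                 # (batch, num_band, frames * 300)
    print(wav.shape)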