update
- examples/nx_clean_unet/yaml/config.yaml +8 -0
- toolbox/torchaudio/models/nx_clean_unet/causal_convolution/__init__.py +6 -0
- toolbox/torchaudio/models/nx_clean_unet/causal_convolution/causal_conv2d.py +261 -0
- toolbox/torchaudio/models/nx_clean_unet/configuration_nx_clean_unet.py +18 -0
- toolbox/torchaudio/models/nx_clean_unet/enhanced_audio.wav +0 -0
- toolbox/torchaudio/models/nx_clean_unet/inference_nx_clean_unet.py +4 -3
- toolbox/torchaudio/models/nx_clean_unet/modeling_nx_clean_unet.py +11 -0
- toolbox/torchaudio/models/nx_clean_unet/transformer/attention.py +1 -1
- toolbox/torchaudio/models/nx_clean_unet/transformer/transformer.py +25 -30
- toolbox/torchaudio/models/nx_clean_unet/yaml/config.yaml +19 -10
examples/nx_clean_unet/yaml/config.yaml
CHANGED
@@ -12,6 +12,14 @@ down_sampling_hidden_channels: 64
 down_sampling_kernel_size: 4
 down_sampling_stride: 2
 
+causal_in_channels: 64
+causal_out_channels: 64
+causal_kernel_size: 3
+causal_bias: false
+causal_separable: true
+causal_f_stride: 1
+causal_num_layers: 3
+
 tsfm_hidden_size: 256
 tsfm_attention_heads: 8
 tsfm_num_blocks: 6
toolbox/torchaudio/models/nx_clean_unet/causal_convolution/__init__.py
ADDED
@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == '__main__':
+    pass
toolbox/torchaudio/models/nx_clean_unet/causal_convolution/causal_conv2d.py
ADDED
@@ -0,0 +1,261 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import math
+import os
+from typing import List, Optional, Union, Iterable
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+
+norm_layer_dict = {
+    "batch_norm_2d": torch.nn.BatchNorm2d
+}
+
+
+activation_layer_dict = {
+    "relu": torch.nn.ReLU,
+    "identity": torch.nn.Identity,
+    "sigmoid": torch.nn.Sigmoid,
+}
+
+
+class CausalConv2d(nn.Module):
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, Iterable[int]],
+                 f_stride: int = 1,
+                 dilation: int = 1,
+                 do_f_pad: bool = True,
+                 bias: bool = True,
+                 separable: bool = False,
+                 norm_layer: str = "batch_norm_2d",
+                 activation_layer: str = "relu",
+                 lookahead: int = 0
+                 ):
+        super(CausalConv2d, self).__init__()
+        kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else tuple(kernel_size)
+
+        if do_f_pad:
+            f_pad = kernel_size[1] // 2 + dilation - 1
+        else:
+            f_pad = 0
+
+        self.causal_left_pad = kernel_size[0] - 1 - lookahead
+        self.causal_right_pad = lookahead
+        self.constant_pad = nn.ConstantPad2d(
+            padding=(0, 0, self.causal_left_pad, self.causal_right_pad),
+            value=0.0
+        )
+
+        groups = math.gcd(in_channels, out_channels) if separable else 1
+        self.conv1 = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            padding=(0, f_pad),
+            stride=(1, f_stride),
+            dilation=(1, dilation),
+            groups=groups,
+            bias=bias,
+        )
+
+        self.conv2 = None
+        if not any([groups == 1, max(kernel_size) == 1]):
+            self.conv2 = nn.Conv2d(
+                out_channels,
+                out_channels,
+                kernel_size=1,
+                bias=False,
+            )
+
+        self.norm = None
+        if norm_layer is not None:
+            norm_layer = norm_layer_dict[norm_layer]
+            self.norm = norm_layer(out_channels)
+
+        self.activation = None
+        if activation_layer is not None:
+            activation_layer = activation_layer_dict[activation_layer]
+            self.activation = activation_layer()
+
+    def forward(self,
+                inputs: torch.Tensor,
+                causal_cache: torch.Tensor = None,
+                ):
+
+        if causal_cache is None:
+            # inputs shape: [batch_size, 1, time_steps, hidden_size]
+            x = self.constant_pad.forward(inputs)
+        else:
+            # inputs shape: [batch_size, 1, time_steps + self.causal_right_pad, hidden_size]
+            # causal_cache shape: [batch_size, 1, self.causal_left_pad, hidden_size]
+            x = torch.concat(tensors=[causal_cache, inputs], dim=2)
+            # x shape: [batch_size, 1, time_steps2, hidden_size]
+            # time_steps2 = time_steps + self.causal_left_pad + self.causal_right_pad
+
+        x = self.conv1.forward(x)
+        # x shape: [batch_size, out_channels, time_steps, hidden_size]
+
+        if self.conv2:
+            x = self.conv2.forward(x)
+
+        if self.norm:
+            x = self.norm(x)
+        if self.activation:
+            x = self.activation(x)
+
+        causal_cache = x[:, :, -self.causal_left_pad:, :]
+
+        # x shape: [batch_size, out_channels, time_steps, hidden_size]
+        return x, causal_cache
+
+
+class CausalConv2dEncoder(nn.Module):
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, Iterable[int]],
+                 f_stride: int = 1,
+                 dilation: int = 1,
+                 do_f_pad: bool = True,
+                 bias: bool = True,
+                 separable: bool = False,
+                 norm_layer: str = "batch_norm_2d",
+                 activation_layer: str = "relu",
+                 lookahead: int = 0,
+                 num_layers: int = 5,
+                 ):
+        super(CausalConv2dEncoder, self).__init__()
+        self.num_layers = num_layers
+
+        self.total_causal_left_pad = 0
+        self.total_causal_right_pad = 0
+
+        self.causal_conv_list: List[CausalConv2d] = nn.ModuleList(modules=[])
+        for i_layer in range(num_layers):
+            conv = CausalConv2d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                f_stride=f_stride,
+                dilation=dilation,
+                do_f_pad=do_f_pad,
+                bias=bias,
+                separable=separable,
+                norm_layer=norm_layer,
+                activation_layer=activation_layer,
+                lookahead=lookahead,
+            )
+            self.causal_conv_list.append(conv)
+
+            self.total_causal_left_pad += conv.causal_left_pad
+            self.total_causal_right_pad += conv.causal_right_pad
+
+            in_channels = out_channels
+
+    def forward(self, inputs: torch.Tensor):
+        # inputs shape: [batch_size, 1, time_steps, hidden_size]
+
+        x = inputs
+        for layer in self.causal_conv_list:
+            x, _ = layer.forward(x)
+        return x
+
+    def forward_chunk(self,
+                      chunk: torch.Tensor,
+                      causal_cache: torch.Tensor = None,
+                      ):
+        # causal_cache shape: [self.num_layers, 1, causal_left_pad, hidden_size]
+
+        new_causal_cache_list = list()
+        for idx, causal_conv in enumerate(self.causal_conv_list):
+            chunk, new_causal_cache = causal_conv.forward(
+                inputs=chunk, causal_cache=causal_cache[idx: idx+1] if causal_cache is not None else None
+            )
+            new_causal_cache_list.append(new_causal_cache)
+
+        new_causal_cache = torch.cat(new_causal_cache_list, dim=0)
+        return chunk, new_causal_cache
+
+    def forward_chunk_by_chunk(self, inputs: torch.Tensor):
+        # inputs shape: [batch_size, 1, time_steps, hidden_size]
+        # batch_size = 1
+
+        batch_size, channels, time_steps, hidden_size = inputs.shape
+
+        causal_cache = None
+
+        outputs = []
+        for idx in range(0, time_steps, 1):
+            begin = idx
+            end = begin + self.total_causal_right_pad + 1
+            chunk_xs = inputs[:, :, begin:end, :]
+
+            ys, causal_cache = self.forward_chunk(
+                chunk=chunk_xs,
+                causal_cache=causal_cache,
+            )
+            # ys shape: [batch_size, channels, self.total_causal_right_pad + 1, hidden_size]
+            ys = ys[:, :, :1, :]
+
+            # ys shape: [batch_size, channels, 1, hidden_size]
+            outputs.append(ys)
+
+        ys = torch.cat(outputs, 2)
+        return ys
+
+
+def main2():
+    conv = CausalConv2d(
+        in_channels=1,
+        out_channels=64,
+        kernel_size=3,
+        bias=False,
+        separable=True,
+        f_stride=1,
+        lookahead=0,
+    )
+
+    spec = torch.randn(size=(1, 1, 200, 64), dtype=torch.float32)
+    # spec shape: [batch_size, 1, time_steps, hidden_size]
+    cache = torch.randn(size=(1, 1, conv.causal_left_pad, 64), dtype=torch.float32)
+
+    output, _ = conv.forward(spec)
+    print(output.shape)
+
+    output, _ = conv.forward(spec, cache)
+    print(output.shape)
+
+    return
+
+
+def main():
+    causal = CausalConv2dEncoder(
+        in_channels=1,
+        out_channels=64,
+        kernel_size=3,
+        bias=False,
+        separable=True,
+        f_stride=1,
+        lookahead=0,
+        num_layers=3,
+    )
+
+    spec = torch.randn(size=(1, 1, 200, 64), dtype=torch.float32)
+    # spec shape: [batch_size, 1, time_steps, hidden_size]
+
+    output = causal.forward(spec)
+    print(output.shape)
+
+    output = causal.forward_chunk_by_chunk(spec)
+    print(output.shape)
+
+    return
+
+
+if __name__ == '__main__':
+    main()
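The padding scheme above is what makes the convolution causal along the time axis: with lookahead = 0, each layer pads kernel_size - 1 frames of zeros on the left of the time dimension, so an output frame never depends on future input frames. A minimal standalone sketch (not part of the commit, plain torch only) illustrating that property:

import torch
import torch.nn as nn

kernel_size = 3
lookahead = 0
causal_left_pad = kernel_size - 1 - lookahead   # 2 frames of history, as in CausalConv2d

# ConstantPad2d(padding=(left, right, top, bottom)) pads the last dim (freq) and the
# second-to-last dim (time); only the time axis gets the causal padding here.
pad = nn.ConstantPad2d(padding=(0, 0, causal_left_pad, lookahead), value=0.0)
conv = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(kernel_size, 1), bias=False)

x = torch.randn(1, 1, 10, 4)            # [batch, channel, time, freq]
y = conv(pad(x))                        # time length is preserved: 10 frames

# Perturbing a future frame must not change earlier outputs.
x2 = x.clone()
x2[:, :, 7, :] += 1.0
y2 = conv(pad(x2))
print(torch.allclose(y[:, :, :7, :], y2[:, :, :7, :]))   # True: frames 0..6 are unchanged

With three such layers (causal_num_layers: 3 in the config), the encoder accumulates total_causal_left_pad = 3 * 2 = 6 frames of history and zero lookahead, which is why forward_chunk_by_chunk can emit one frame per step.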
toolbox/torchaudio/models/nx_clean_unet/configuration_nx_clean_unet.py
CHANGED
@@ -20,6 +20,15 @@ class NXCleanUNetConfig(PretrainedConfig):
                  down_sampling_kernel_size: int = 4,
                  down_sampling_stride: int = 2,
 
+                 causal_in_channels: int = 64,
+                 causal_out_channels: int = 64,
+                 causal_kernel_size: int = 3,
+                 causal_bias: bool = False,
+                 causal_separable: bool = True,
+                 causal_f_stride: int = 1,
+                 # causal_lookahead: int = 0,
+                 causal_num_layers: int = 3,
+
                  tsfm_hidden_size: int = 256,
                  tsfm_attention_heads: int = 4,
                  tsfm_num_blocks: int = 6,
@@ -56,6 +65,15 @@ class NXCleanUNetConfig(PretrainedConfig):
         self.down_sampling_kernel_size = down_sampling_kernel_size
         self.down_sampling_stride = down_sampling_stride
 
+        self.causal_in_channels = causal_in_channels
+        self.causal_out_channels = causal_out_channels
+        self.causal_kernel_size = causal_kernel_size
+        self.causal_bias = causal_bias
+        self.causal_separable = causal_separable
+        self.causal_f_stride = causal_f_stride
+        # self.causal_lookahead = causal_lookahead
+        self.causal_num_layers = causal_num_layers
+
         self.tsfm_hidden_size = tsfm_hidden_size
         self.tsfm_attention_heads = tsfm_attention_heads
         self.tsfm_num_blocks = tsfm_num_blocks
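All of the new causal_* parameters have defaults that match the updated yaml files, so existing configs without these keys should still load. A quick sketch (assuming the toolbox package is importable; values mirror yaml/config.yaml) of how the new fields travel through the config object:

from toolbox.torchaudio.models.nx_clean_unet.configuration_nx_clean_unet import NXCleanUNetConfig

config = NXCleanUNetConfig(
    causal_in_channels=64,
    causal_out_channels=64,
    causal_kernel_size=3,
    causal_bias=False,
    causal_separable=True,
    causal_f_stride=1,
    causal_num_layers=3,
)
print(config.causal_kernel_size, config.causal_num_layers)   # 3 3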
toolbox/torchaudio/models/nx_clean_unet/enhanced_audio.wav
CHANGED
Binary files a/toolbox/torchaudio/models/nx_clean_unet/enhanced_audio.wav and b/toolbox/torchaudio/models/nx_clean_unet/enhanced_audio.wav differ
toolbox/torchaudio/models/nx_clean_unet/inference_nx_clean_unet.py
CHANGED
@@ -62,6 +62,7 @@ class InferenceNXCleanUNet(object):
 
         with torch.no_grad():
             enhanced_audios = self.model.forward_chunk_by_chunk(noisy_audios)
+            # enhanced_audios = self.model.forward(noisy_audios)
         # enhanced_audio shape: [batch_size, n_samples]
         # enhanced_audios = torch.squeeze(enhanced_audios, dim=1)
 
@@ -70,16 +71,16 @@ class InferenceNXCleanUNet(object):
         return enhanced_audio
 
 def main():
-    model_zip_file = project_path / "trained_models/nx-clean-unet-
+    model_zip_file = project_path / "trained_models/nx-clean-unet-14-epoch.zip"
     infer_nx_clean_unet = InferenceNXCleanUNet(model_zip_file)
 
     sample_rate = 8000
-    noisy_audio_file = project_path / "data/examples/ai_agent/dfaaf264-b5e3-4ca2-b5cb-
+    noisy_audio_file = project_path / "data/examples/ai_agent/dfaaf264-b5e3-4ca2-b5cb-5b6d637d962d_section_1.wav"
     noisy_audio, _ = librosa.load(
         noisy_audio_file.as_posix(),
         sr=sample_rate,
     )
-
+    noisy_audio = noisy_audio[int(7*sample_rate):int(9*sample_rate)]
     noisy_audio = torch.tensor(noisy_audio, dtype=torch.float32)
     noisy_audio = noisy_audio.unsqueeze(dim=0)
 
toolbox/torchaudio/models/nx_clean_unet/modeling_nx_clean_unet.py
CHANGED
@@ -11,6 +11,7 @@ from torch.nn import functional as F
 from toolbox.torchaudio.configuration_utils import CONFIG_FILE
 from toolbox.torchaudio.models.nx_clean_unet.configuration_nx_clean_unet import NXCleanUNetConfig
 from toolbox.torchaudio.models.nx_clean_unet.transformer.transformer import TransformerEncoder
+from toolbox.torchaudio.models.nx_clean_unet.causal_convolution.causal_conv2d import CausalConv2dEncoder
 
 
 class DownSamplingBlock(nn.Module):
@@ -166,6 +167,16 @@ class NXCleanUNet(nn.Module):
             kernel_size=config.down_sampling_kernel_size,
             stride=config.down_sampling_stride,
         )
+        self.causal_encoder = CausalConv2dEncoder(
+            in_channels=config.causal_in_channels,
+            out_channels=config.causal_out_channels,
+            kernel_size=config.causal_kernel_size,
+            bias=config.causal_bias,
+            separable=config.causal_separable,
+            f_stride=config.causal_f_stride,
+            lookahead=0,
+            num_layers=config.causal_num_layers,
+        )
         self.transformer = TransformerEncoder(
             input_size=config.down_sampling_hidden_channels,
             hidden_size=config.tsfm_hidden_size,
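The new CausalConv2dEncoder is constructed between the down-sampling stack and the TransformerEncoder, and its channel counts (64) match down_sampling_hidden_channels, which suggests it runs on the down-sampled feature map; the forward-pass wiring itself is not shown in this hunk. A standalone sketch of that configuration, with made-up time/frequency sizes (treat the 200 x 32 input as an assumption):

import torch
from toolbox.torchaudio.models.nx_clean_unet.causal_convolution.causal_conv2d import CausalConv2dEncoder

causal_encoder = CausalConv2dEncoder(
    in_channels=64,
    out_channels=64,
    kernel_size=3,
    bias=False,
    separable=True,
    f_stride=1,
    lookahead=0,
    num_layers=3,
)

features = torch.randn(1, 64, 200, 32)   # [batch, channels, time, freq] -- illustrative sizes only
out = causal_encoder.forward(features)
print(out.shape)                          # torch.Size([1, 64, 200, 32]); the shape is preserved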
toolbox/torchaudio/models/nx_clean_unet/transformer/attention.py
CHANGED
@@ -245,7 +245,7 @@ class RelativeMultiHeadSelfAttention(nn.Module):
 
 
 def main():
-    rel_attention =
+    rel_attention = RelativeMultiHeadSelfAttention(n_head=4, n_feat=256, dropout_rate=0.1)
 
     x = torch.ones(size=(1, 200, 256), dtype=torch.float32)
     xt, new_cache = rel_attention.forward(x, x, x)
toolbox/torchaudio/models/nx_clean_unet/transformer/transformer.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Dict, Optional, Tuple, List, Union
 
 import torch
 import torch.nn as nn
+from fontTools.subset import prune_post_subset
 
 from toolbox.torchaudio.models.nx_clean_unet.transformer.mask import subsequent_chunk_mask
 from toolbox.torchaudio.models.nx_clean_unet.transformer.attention import MultiHeadSelfAttention, RelativeMultiHeadSelfAttention
@@ -69,7 +70,7 @@ class TransformerEncoderLayer(nn.Module):
     def forward(
             self,
             x: torch.Tensor,
-            mask: torch.Tensor,
+            mask: torch.Tensor = None,
             attention_cache: torch.Tensor = None,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
@@ -175,40 +176,31 @@ class TransformerEncoder(nn.Module):
 
     def forward_chunk(self,
                       xs: torch.Tensor,
-
-                      attention_mask: torch.Tensor = None,
+                      max_att_cache_length: int,
                       attention_cache: torch.Tensor = None,
                       ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
        Forward just one chunk.
        :param xs: torch.Tensor. chunk input, with shape (b=1, time, mel-dim),
            where `time == (chunk_size - 1) * subsample_rate + subsample.right_context + 1`
-        :param
-        :param
-        :param attention_cache: torch.Tensor. cache tensor for KEY & VALUE in
-            transformer/conformer attention, with shape
-            (elayers, head, cache_t1, d_k * 2), where
-            `head * d_k == hidden-dim` and
-            `cache_t1 == chunk_size * num_decoding_left_chunks`.
+        :param max_att_cache_length:
+        :param attention_cache: torch.Tensor.
        :return:
        """
        # xs shape: [batch_size, time_steps, input_size]
        xs = self.input_linear.forward(xs)
        # xs shape: [batch_size, time_steps, hidden_size]
 
-        xs, position_embedding = self.positional_encoding.forward(xs, offset=offset)
-        # xs shape: [batch_size, time_steps, hidden_size]
-        # position_embedding shape: [1, time_steps, hidden_size]
-
        r_att_cache = []
        for idx, encoder_layer in enumerate(self.encoder_layer_list):
            xs, new_att_cache = encoder_layer.forward(
-                x=xs,
-                position_embedding=position_embedding,
-                attention_cache=attention_cache[idx: idx+1],
+                x=xs, attention_cache=attention_cache[idx: idx+1] if attention_cache is not None else None,
            )
-
-
+            if new_att_cache.size(2) > max_att_cache_length:
+                begin = (self.num_left_chunks + self.num_right_chunks) * self.chunk_size
+                end = self.num_right_chunks * self.chunk_size
+                new_att_cache = new_att_cache[:, :, -begin:-end, :]
+            r_att_cache.append(new_att_cache)
 
        r_att_cache = torch.cat(r_att_cache, dim=0)
 
@@ -221,25 +213,28 @@ class TransformerEncoder(nn.Module):
 
        batch_size, time_steps, _ = xs.shape
 
-        # [num_blocks, attention_heads, num_left_chunks,
-
-        attention_cache
-        attention_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool, device=xs.device)
+        # attention_cache shape: [num_blocks, attention_heads, self.num_left_chunks * self.chunk_size, n_heads * d_k * 2]
+        max_att_cache_length = (self.num_left_chunks + self.num_right_chunks) * self.chunk_size
+        attention_cache = None
 
        outputs = []
-        for idx in range(0, time_steps - self.chunk_size
-            begin = idx
-            end = begin + self.chunk_size
+        for idx in range(0, time_steps - self.chunk_size, self.chunk_size):
+            begin = idx
+            end = begin + self.chunk_size * (self.num_right_chunks + 1)
            chunk_xs = xs[:, begin:end, :]
+            # print(f"begin: {begin}, end: {end}, length: {chunk_xs.size(1)}")
 
            ys, attention_cache = self.forward_chunk(
-                xs=chunk_xs,
-
+                xs=chunk_xs,
+                max_att_cache_length=max_att_cache_length,
+                attention_cache=attention_cache,
            )
+            # ys shape: [batch_size, self.chunk_size * (self.num_right_chunks + 1), hidden_size]
+            ys = ys[:, :self.chunk_size, :]
 
-            #
+            # ys shape: [batch_size, chunk_size, hidden_size]
            ys = self.output_linear.forward(ys)
-            #
+            # ys shape: [batch_size, chunk_size, input_size]
 
            outputs.append(ys)
 
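The effect of the new max_att_cache_length logic is that the per-layer attention cache can never grow past (num_left_chunks + num_right_chunks) * chunk_size frames; when it does, only the most recent left-context window is kept and the trailing right-context frames are dropped as well. A small standalone sketch of just that slice, using the chunk settings from yaml/config.yaml (the cache tensor itself is fabricated for illustration):

import torch

chunk_size = 1
num_left_chunks = 128
num_right_chunks = 4
max_att_cache_length = (num_left_chunks + num_right_chunks) * chunk_size   # 132 frames

# pretend cache for one layer: [num_blocks, heads, cached_frames, d_k * 2]
new_att_cache = torch.randn(1, 8, 200, 64)

if new_att_cache.size(2) > max_att_cache_length:
    begin = (num_left_chunks + num_right_chunks) * chunk_size
    end = num_right_chunks * chunk_size
    new_att_cache = new_att_cache[:, :, -begin:-end, :]

print(new_att_cache.shape)   # torch.Size([1, 8, 128, 64]) -- only the left-context frames remain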
toolbox/torchaudio/models/nx_clean_unet/yaml/config.yaml
CHANGED
@@ -5,29 +5,38 @@ segment_size: 16000
 n_fft: 512
 win_size: 200
 hop_size: 80
+# Since hop_size is 80, the STFT time step is 10 ms, so the down-sampling is chosen to give a similar resolution.
 
 # 2**down_sampling_num_layers,
-# e.g. 2**
-# then one step is
-# then tsfm_chunk_size=
+# e.g. 2**6 = 64 means that 64 samples become one time step after down-sampling,
+# so one step is 64/sample_rate = 0.008 seconds.
+# Then tsfm_chunk_size=2 corresponds to 16 ms and tsfm_chunk_size=4 to 32 ms.
 # Assuming each step looks 1 second to the left and 30 ms to the right:
-# tsfm_chunk_size=1, tsfm_num_left_chunks=
-# tsfm_chunk_size=
-# tsfm_chunk_size=
-down_sampling_num_layers:
+# tsfm_chunk_size=1, tsfm_num_left_chunks=128, tsfm_num_right_chunks=4
+# tsfm_chunk_size=2, tsfm_num_left_chunks=64, tsfm_num_right_chunks=2
+# tsfm_chunk_size=4, tsfm_num_left_chunks=32, tsfm_num_right_chunks=1
+down_sampling_num_layers: 6
 down_sampling_in_channels: 1
 down_sampling_hidden_channels: 64
 down_sampling_kernel_size: 4
 down_sampling_stride: 2
 
+causal_in_channels: 64
+causal_out_channels: 64
+causal_kernel_size: 3
+causal_bias: false
+causal_separable: true
+causal_f_stride: 1
+causal_num_layers: 3
+
 tsfm_hidden_size: 256
 tsfm_attention_heads: 8
 tsfm_num_blocks: 6
 tsfm_dropout_rate: 0.1
 tsfm_max_length: 512
-tsfm_chunk_size:
-tsfm_num_left_chunks:
-tsfm_num_right_chunks:
+tsfm_chunk_size: 1
+tsfm_num_left_chunks: 128
+tsfm_num_right_chunks: 4
 
 discriminator_dim: 32
 discriminator_in_channel: 2
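The comments above can be sanity-checked with a few lines of arithmetic (a sketch, not part of the commit; sample_rate = 8000 is taken from the inference script):

sample_rate = 8000
down_sampling_num_layers = 6

samples_per_step = 2 ** down_sampling_num_layers         # 64 samples per down-sampled time step
seconds_per_step = samples_per_step / sample_rate        # 0.008 s = 8 ms

tsfm_chunk_size = 1
tsfm_num_left_chunks = 128
tsfm_num_right_chunks = 4

left_context = tsfm_num_left_chunks * tsfm_chunk_size * seconds_per_step    # 1.024 s, i.e. roughly "1 second to the left"
right_context = tsfm_num_right_chunks * tsfm_chunk_size * seconds_per_step  # 0.032 s, i.e. roughly "30 ms to the right"
print(left_context, right_context)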