Spaces:

qgyd2021
/

nx_denoise

Running

App Files Files Community

HoneyTian committed on Mar 6

Commit

ce96198

1 Parent(s): c6c50f4

update

Browse files

Files changed (5) hide show

examples/nx_mpnet/run.sh +5 -2
examples/nx_mpnet/yaml/config.yaml +3 -3
toolbox/torchaudio/models/nx_mpnet/inference_mpnet.py +103 -0
toolbox/torchaudio/models/nx_mpnet/modeling_nx_mpnet.py +92 -0
toolbox/torchaudio/models/nx_mpnet/transformers/transformers.py +12 -14

examples/nx_mpnet/run.sh CHANGED Viewed

@@ -3,10 +3,11 @@
 : <<'END'
-sh run.sh --stage 2 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name nx-mpnet-aishell-20250224 \
 --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
 --speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train" \
---max_epochs 100
 END
@@ -26,6 +27,7 @@ limit=10
 noise_dir=/data/tianxing/HuggingDatasets/nx_noise/data/noise
 speech_dir=/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train
 nohup_name=nohup.out
@@ -93,6 +95,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
   --speech_dir "${speech_dir}" \
   --train_dataset "${train_dataset}" \
   --valid_dataset "${valid_dataset}" \
 fi

 : <<'END'
+sh run.sh --stage 1 --stop_stage 2 --system_version centos --file_folder_name file_dir --final_model_name nx-mpnet-aishell-20250224 \
 --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise" \
 --speech_dir "/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train" \
+--max_epochs 100 \
+--duration 2 \
 END
 noise_dir=/data/tianxing/HuggingDatasets/nx_noise/data/noise
 speech_dir=/data/tianxing/HuggingDatasets/aishell/data_aishell/wav/train
+duration=2
 nohup_name=nohup.out
   --speech_dir "${speech_dir}" \
   --train_dataset "${train_dataset}" \
   --valid_dataset "${valid_dataset}" \
+  --duration "${duration}" \
 fi

examples/nx_mpnet/yaml/config.yaml CHANGED Viewed

@@ -15,9 +15,9 @@ mask_hidden_size: 64
 phase_num_blocks: 4
 phase_hidden_size: 64
-tsfm_hidden_size: 64
-tsfm_attention_heads: 4
-tsfm_num_blocks: 4
 tsfm_dropout_rate: 0.0
 tsfm_max_time_relative_position: 2048
 tsfm_max_freq_relative_position: 256

 phase_num_blocks: 4
 phase_hidden_size: 64
+tsfm_hidden_size: 128
+tsfm_attention_heads: 8
+tsfm_num_blocks: 6
 tsfm_dropout_rate: 0.0
 tsfm_max_time_relative_position: 2048
 tsfm_max_freq_relative_position: 256

toolbox/torchaudio/models/nx_mpnet/inference_mpnet.py ADDED Viewed

	@@ -0,0 +1,103 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import logging
+from pathlib import Path
+import shutil
+import tempfile
+import zipfile
+import librosa
+import numpy as np
+import torch
+import torchaudio
+from project_settings import project_path
+from toolbox.torchaudio.models.nx_mpnet.configuration_nx_mpnet import NXMPNetConfig
+from toolbox.torchaudio.models.nx_mpnet.modeling_nx_mpnet import NXMPNetPretrainedModel, MODEL_FILE
+from toolbox.torchaudio.models.nx_mpnet.utils import mag_pha_stft, mag_pha_istft
+logger = logging.getLogger("toolbox")
+class InferenceNXMPNet(object):
+    def __init__(self, pretrained_model_path_or_zip_file: str, device: str = "cpu"):
+        self.pretrained_model_path_or_zip_file = pretrained_model_path_or_zip_file
+        self.device = torch.device(device)
+        logger.info(f"loading model; model_file: {self.pretrained_model_path_or_zip_file}")
+        config, generator = self.load_models(self.pretrained_model_path_or_zip_file)
+        logger.info(f"model loading completed; model_file: {self.pretrained_model_path_or_zip_file}")
+        self.config = config
+        self.generator = generator
+        self.generator.to(device)
+        self.generator.eval()
+    def load_models(self, model_path: str):
+        model_path = Path(model_path)
+        if model_path.name.endswith(".zip"):
+            with zipfile.ZipFile(model_path.as_posix(), "r") as f_zip:
+                out_root = Path(tempfile.gettempdir()) / "nx_denoise"
+                out_root.mkdir(parents=True, exist_ok=True)
+                f_zip.extractall(path=out_root)
+            model_path = out_root / model_path.stem
+        config = NXMPNetConfig.from_pretrained(
+            pretrained_model_name_or_path=model_path.as_posix(),
+        )
+        generator = NXMPNetPretrainedModel.from_pretrained(
+            pretrained_model_name_or_path=model_path.as_posix(),
+        )
+        generator.to(self.device)
+        generator.eval()
+        shutil.rmtree(model_path)
+        return config, generator
+    def enhancement_by_tensor(self, noisy_audio: torch.Tensor) -> torch.Tensor:
+        if torch.max(noisy_audio) > 1 or torch.min(noisy_audio) < -1:
+            raise AssertionError(f"The value range of audio samples should be between -1 and 1.")
+        noisy_audio = noisy_audio.to(self.device)
+        with torch.no_grad():
+            noisy_mag, noisy_pha, noisy_com = mag_pha_stft(
+                noisy_audio,
+                self.config.n_fft, self.config.hop_size, self.config.win_size, self.config.compress_factor
+            )
+            # mag_g, pha_g, com_g = self.generator.forward(noisy_mag, noisy_pha)
+            mag_g, pha_g, com_g = self.generator.forward_chunk_by_chunk(noisy_mag, noisy_pha)
+            audio_g = mag_pha_istft(
+                mag_g, pha_g,
+                self.config.n_fft, self.config.hop_size, self.config.win_size, self.config.compress_factor
+            )
+            enhanced_audio = audio_g.detach()
+        enhanced_audio = enhanced_audio[0]
+        return enhanced_audio
+def main():
+    model_zip_file = project_path / "trained_models/mpnet-aishell-1-epoch.zip"
+    infer_mpnet = InferenceNXMPNet(model_zip_file)
+    sample_rate = 8000
+    noisy_audio_file = project_path / "data/examples/ai_agent/dfaaf264-b5e3-4ca2-b5cb-5b6d637d962d_section_1.wav"
+    noisy_audio, _ = librosa.load(
+        noisy_audio_file.as_posix(),
+        sr=sample_rate,
+    )
+    noisy_audio = noisy_audio[int(7*sample_rate):int(9*sample_rate)]
+    noisy_audio = torch.tensor(noisy_audio, dtype=torch.float32)
+    noisy_audio = noisy_audio.unsqueeze(dim=0)
+    enhanced_audio = infer_mpnet.enhancement_by_tensor(noisy_audio)
+    filename = "enhanced_audio.wav"
+    torchaudio.save(filename, enhanced_audio.detach().cpu(), sample_rate)
+    return
+if __name__ == '__main__':
+    main()

toolbox/torchaudio/models/nx_mpnet/modeling_nx_mpnet.py CHANGED Viewed

@@ -18,6 +18,8 @@ class NXMPNet(nn.Module):
                  config: NXMPNetConfig,
                  ):
         super(NXMPNet, self).__init__()
         self.dense_encoder = DenseEncoder(
             num_blocks=config.dense_num_blocks,
             in_channels=2,
@@ -73,6 +75,91 @@ class NXMPNet(nn.Module):
         return denoised_amp, denoised_pha, denoised_com
 MODEL_FILE = "generator.pt"
@@ -136,6 +223,11 @@ def main():
     print(denoised_amp.shape)
     print(denoised_pha.shape)
     print(denoised_com.shape)
     return

                  config: NXMPNetConfig,
                  ):
         super(NXMPNet, self).__init__()
+        self.config = config
         self.dense_encoder = DenseEncoder(
             num_blocks=config.dense_num_blocks,
             in_channels=2,
         return denoised_amp, denoised_pha, denoised_com
+    def forward_chunk(self,
+                      chunk_noisy_amp: torch.Tensor,
+                      chunk_noisy_pha: torch.Tensor,
+                      cache: dict,
+                      ):
+        dense_encoder_cache_pad_list = cache["dense_encoder_cache_pad_list"]
+        mask_decoder_cache_pad_list = cache["mask_decoder_cache_pad_list"]
+        phase_decoder_cache_pad_list = cache["phase_decoder_cache_pad_list"]
+        ts_transformer_cache_att_list = cache["ts_transformer_cache_att_list"]
+        max_att_cache_length = cache["max_att_cache_length"]
+        x = torch.stack((chunk_noisy_amp, chunk_noisy_pha), dim=-1).permute(0, 3, 2, 1)  # [B, 2, T, F]
+        # x shape: [b, 2, t, f]
+        x, new_dense_encoder_cache_pad_list = self.dense_encoder.forward_chunk(x, cache_pad_list=dense_encoder_cache_pad_list)
+        # x shape: [b, c, t, f//2]
+        x, new_ts_transformer_cache_att_list = self.ts_transformer.forward_chunk(
+            x,
+            max_att_cache_length=max_att_cache_length,
+            cache_att_list=ts_transformer_cache_att_list
+        )
+        # x shape: [b, c, t, f//2]
+        mask, new_mask_decoder_cache_pad_list = self.mask_decoder.forward_chunk(x, cache_pad_list=mask_decoder_cache_pad_list)
+        denoised_amp = chunk_noisy_amp * mask
+        denoised_pha, new_phase_decoder_cache_pad_list = self.phase_decoder.forward_chunk(x, cache_pad_list=phase_decoder_cache_pad_list)
+        denoised_com = torch.stack(
+            tensors=(
+                denoised_amp * torch.cos(denoised_pha),
+                denoised_amp * torch.sin(denoised_pha)
+            ),
+            dim=-1
+        )
+        cache = {
+            "dense_encoder_cache_pad_list": new_dense_encoder_cache_pad_list,
+            "mask_decoder_cache_pad_list": new_mask_decoder_cache_pad_list,
+            "phase_decoder_cache_pad_list": new_phase_decoder_cache_pad_list,
+            "ts_transformer_cache_att_list": new_ts_transformer_cache_att_list,
+            "max_att_cache_length": max_att_cache_length,
+        }
+        return denoised_amp, denoised_pha, denoised_com, cache
+    def forward_chunk_by_chunk(self,
+                               noisy_amp: torch.Tensor,
+                               noisy_pha: torch.Tensor,
+                               ):
+        """
+        :param noisy_amp: Tensor, shape: [b, f, t]
+        :param noisy_pha: Tensor, shape: [b, f, t]
+        :return:
+        """
+        b, f, t = noisy_amp.shape
+        max_att_cache_length = (self.config.tsfm_num_left_chunks + self.config.tsfm_num_right_chunks) * self.config.tsfm_chunk_size
+        cache = {
+            "dense_encoder_cache_pad_list": None,
+            "mask_decoder_cache_pad_list": None,
+            "phase_decoder_cache_pad_list": None,
+            "ts_transformer_cache_att_list": None,
+            "max_att_cache_length": max_att_cache_length,
+        }
+        denoised_amp_list = list()
+        denoised_pha_list = list()
+        denoised_com_list = list()
+        for idx in range(t):
+            chunk_noisy_amp = noisy_amp[:, :, idx:idx+1]
+            chunk_noisy_pha = noisy_pha[:, :, idx:idx+1]
+            denoised_amp, denoised_pha, denoised_com, cache = self.forward_chunk(chunk_noisy_amp, chunk_noisy_pha, cache)
+            denoised_amp_list.append(denoised_amp)
+            denoised_pha_list.append(denoised_pha)
+            denoised_com_list.append(denoised_com)
+        denoised_amp_list = torch.concat(denoised_amp_list, dim=2)
+        denoised_pha_list = torch.concat(denoised_pha_list, dim=2)
+        denoised_com_list = torch.concat(denoised_com_list, dim=2)
+        return denoised_amp_list, denoised_pha_list, denoised_com_list
 MODEL_FILE = "generator.pt"
     print(denoised_amp.shape)
     print(denoised_pha.shape)
     print(denoised_com.shape)
+    denoised_amp, denoised_pha, denoised_com = model.forward_chunk_by_chunk(noisy_amp, noisy_pha)
+    print(denoised_amp.shape)
+    print(denoised_pha.shape)
+    print(denoised_com.shape)
     return

toolbox/torchaudio/models/nx_mpnet/transformers/transformers.py CHANGED Viewed

@@ -361,13 +361,13 @@ class TSTransformerEncoder(nn.Module):
     def forward_chunk(self,
                       xs: torch.Tensor,
                       max_att_cache_length: int,
-                      attention_cache: torch.Tensor = None,
                       ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         :param xs:
         :param max_att_cache_length:
-        :param attention_cache: Tensor, shape: [num_layers, ...]
         :return:
         """
         # xs shape: [batch_size, channels, time_steps, input_size]
@@ -376,19 +376,17 @@ class TSTransformerEncoder(nn.Module):
         xs = xs.permute(0, 3, 2, 1)
         # xs shape: [batch_size, hidden_size, time_steps, input_size]
-        r_att_cache = []
         for idx, encoder_layer in enumerate(self.encoder_layer_list):
-            xs, new_att_cache = encoder_layer.forward(
-                x=xs, attention_cache=attention_cache[idx] if attention_cache is not None else None,
             )
             # new_att_cache shape: [b*f, n_heads, time_steps, dim]
-            if new_att_cache.size(2) > max_att_cache_length:
                 begin = (self.num_left_chunks + self.num_right_chunks) * self.chunk_size
                 end = self.num_right_chunks * self.chunk_size
-                new_att_cache = new_att_cache[:, :, -begin:-end, :]
-            r_att_cache.append(new_att_cache)
-        r_att_cache = torch.stack(r_att_cache, dim=0)
         # xs shape: [batch_size, hidden_size, time_steps, input_size]
         xs = xs.permute(0, 3, 2, 1)
@@ -396,7 +394,7 @@ class TSTransformerEncoder(nn.Module):
         xs = xs.permute(0, 3, 2, 1)
         # xs shape: [batch_size, channels, time_steps, input_size]
-        return xs, r_att_cache
     def forward_chunk_by_chunk(
             self,
@@ -406,7 +404,7 @@ class TSTransformerEncoder(nn.Module):
         batch_size, channels, time_steps, _ = xs.shape
         max_att_cache_length = (self.num_left_chunks + self.num_right_chunks) * self.chunk_size
-        attention_cache = None
         outputs = []
         for idx in range(0, time_steps, self.chunk_size):
@@ -415,10 +413,10 @@ class TSTransformerEncoder(nn.Module):
             chunk_xs = xs[:, :, begin:end, :]
             # chunk_xs shape: [batch_size, channels, self.chunk_size * (self.num_right_chunks + 1), input_size]
-            ys, attention_cache = self.forward_chunk(
                 xs=chunk_xs,
                 max_att_cache_length=max_att_cache_length,
-                attention_cache=attention_cache,
             )
             # ys shape: [batch_size, channels, self.chunk_size * (self.num_right_chunks + 1), input_size]
             ys = ys[:, :, :self.chunk_size, :]

     def forward_chunk(self,
                       xs: torch.Tensor,
                       max_att_cache_length: int,
+                      cache_att_list: List[torch.Tensor] = None,
                       ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         :param xs:
         :param max_att_cache_length:
+        :param cache_att_list: Tensor, shape: [num_layers, ...]
         :return:
         """
         # xs shape: [batch_size, channels, time_steps, input_size]
         xs = xs.permute(0, 3, 2, 1)
         # xs shape: [batch_size, hidden_size, time_steps, input_size]
+        new_cache_att_list = list()
         for idx, encoder_layer in enumerate(self.encoder_layer_list):
+            xs, new_cache_att = encoder_layer.forward(
+                x=xs, attention_cache=cache_att_list[idx] if cache_att_list is not None else None,
             )
             # new_att_cache shape: [b*f, n_heads, time_steps, dim]
+            if new_cache_att.size(2) > max_att_cache_length:
                 begin = (self.num_left_chunks + self.num_right_chunks) * self.chunk_size
                 end = self.num_right_chunks * self.chunk_size
+                new_cache_att = new_cache_att[:, :, -begin:-end, :]
+            new_cache_att_list.append(new_cache_att)
         # xs shape: [batch_size, hidden_size, time_steps, input_size]
         xs = xs.permute(0, 3, 2, 1)
         xs = xs.permute(0, 3, 2, 1)
         # xs shape: [batch_size, channels, time_steps, input_size]
+        return xs, new_cache_att_list
     def forward_chunk_by_chunk(
             self,
         batch_size, channels, time_steps, _ = xs.shape
         max_att_cache_length = (self.num_left_chunks + self.num_right_chunks) * self.chunk_size
+        cache_att_list = None
         outputs = []
         for idx in range(0, time_steps, self.chunk_size):
             chunk_xs = xs[:, :, begin:end, :]
             # chunk_xs shape: [batch_size, channels, self.chunk_size * (self.num_right_chunks + 1), input_size]
+            ys, cache_att_list = self.forward_chunk(
                 xs=chunk_xs,
                 max_att_cache_length=max_att_cache_length,
+                cache_att_list=cache_att_list,
             )
             # ys shape: [batch_size, channels, self.chunk_size * (self.num_right_chunks + 1), input_size]
             ys = ys[:, :, :self.chunk_size, :]