HoneyTian committed
Commit 9192cea · 1 Parent(s): f239cae
examples/dfnet/step_2_train_model.py CHANGED
@@ -187,18 +187,12 @@ def main():
     if last_step_idx != -1:
         logger.info(f"resume from steps-{last_step_idx}.")
         model_pt = serialization_dir / f"steps-{last_step_idx}/model.pt"
-        optimizer_pth = serialization_dir / f"steps-{last_step_idx}/optimizer.pth"
 
         logger.info(f"load state dict for model.")
         with open(model_pt.as_posix(), "rb") as f:
             state_dict = torch.load(f, map_location="cpu", weights_only=True)
         model.load_state_dict(state_dict, strict=True)
 
-        logger.info(f"load state dict for optimizer.")
-        with open(optimizer_pth.as_posix(), "rb") as f:
-            state_dict = torch.load(f, map_location="cpu", weights_only=True)
-        optimizer.load_state_dict(state_dict)
-
     if config.lr_scheduler == "CosineAnnealingLR":
         lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
             optimizer,
@@ -270,14 +264,14 @@ def main():
             clean_audios: torch.Tensor = clean_audios.to(device)
             noisy_audios: torch.Tensor = noisy_audios.to(device)
 
-            est_spec, est_wav, est_mask, lsnr = model.forward(noisy_audios)
+            est_spec, est_wav, est_mask, lsnr, erb_encoder_h = model.forward(noisy_audios)
 
             mr_stft_loss = mr_stft_loss_fn.forward(est_wav, clean_audios)
             neg_si_snr_loss = neg_si_snr_loss_fn.forward(est_wav, clean_audios)
             mask_loss = model.mask_loss_fn(est_mask, clean_audios, noisy_audios)
             lsnr_loss = model.lsnr_loss_fn(lsnr, clean_audios, noisy_audios)
 
-            loss = 1.0 * mr_stft_loss + 1.0 * neg_si_snr_loss + 1.0 * mask_loss + 1.0 * lsnr_loss
+            loss = 1.0 * mr_stft_loss + 1.0 * neg_si_snr_loss + 1.0 * mask_loss + 0.3 * lsnr_loss
             if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
                 logger.info(f"find nan or inf in loss.")
                 continue
@@ -341,14 +335,14 @@ def main():
             clean_audios: torch.Tensor = clean_audios.to(device)
             noisy_audios: torch.Tensor = noisy_audios.to(device)
 
-            est_spec, est_wav, est_mask, lsnr = model.forward(noisy_audios)
+            est_spec, est_wav, est_mask, lsnr, erb_encoder_h = model.forward(noisy_audios)
 
             mr_stft_loss = mr_stft_loss_fn.forward(est_wav, clean_audios)
             neg_si_snr_loss = neg_si_snr_loss_fn.forward(est_wav, clean_audios)
             mask_loss = model.mask_loss_fn(est_mask, clean_audios, noisy_audios)
             lsnr_loss = model.lsnr_loss_fn(lsnr, clean_audios, noisy_audios)
 
-            loss = 1.0 * mr_stft_loss + 1.0 * neg_si_snr_loss + 1.0 * mask_loss + 1.0 * lsnr_loss
+            loss = 1.0 * mr_stft_loss + 1.0 * neg_si_snr_loss + 1.0 * mask_loss + 0.3 * lsnr_loss
             if torch.any(torch.isnan(loss)) or torch.any(torch.isinf(loss)):
                 logger.info(f"find nan or inf in loss.")
                 continue
@@ -410,9 +404,6 @@ def main():
                 model_to_delete: Path = model_list.pop(0)
                 shutil.rmtree(model_to_delete.as_posix())
 
-            # save optim
-            torch.save(optimizer.state_dict(), (save_dir / "optimizer.pth").as_posix())
-
             # save metric
             if best_metric is None:
                 best_epoch_idx = epoch_idx
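With this change the run no longer writes or restores optimizer.pth: a resume reloads only the model weights, so the optimizer moments and the CosineAnnealingLR schedule restart from step 0. The lsnr loss weight also drops from 1.0 to 0.3, and model.forward now returns a fifth output, erb_encoder_h, which the training loop discards. A minimal sketch of one way to realign the cosine schedule with the resumed global step; the stand-in model, optimizer, and step count are hypothetical, and the replay loop is not part of this commit:

    import torch

    model = torch.nn.Linear(8, 8)    # stand-in for the DfNet model
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
    last_step_idx = 120000           # stand-in for the step parsed from the checkpoint name

    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=250000,                # T_max from the dfnet config added below
        eta_min=0.0001,
    )
    if last_step_idx != -1:
        for _ in range(last_step_idx):
            lr_scheduler.step()      # replay past steps so the LR matches the global step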
examples/dfnet/yaml/config.yaml CHANGED
@@ -68,7 +68,7 @@ seed: 1234
 
 num_workers: 8
 batch_size: 64
-eval_steps: 20000
+eval_steps: 10000
 
 # runtime
 use_post_filter: true
examples/dtln/yaml/config.yaml CHANGED
@@ -24,6 +24,6 @@ max_epochs: 100
 clip_grad_norm: 10.0
 seed: 1234
 
-batch_size: 128
+batch_size: 64
 num_workers: 4
-eval_steps: 25000
+eval_steps: 15000
main.py CHANGED
@@ -62,10 +62,10 @@ def shell(cmd: str):
 
 
 denoise_engines = {
-    "mpnet-nx-speech": {
-        "infer_cls": InferenceMPNet,
+    "dfnet-nx-dns3": {
+        "infer_cls": InferenceFRCRN,
         "kwargs": {
-            "pretrained_model_path_or_zip_file": (project_path / "trained_models/mpnet-nx-speech.zip").as_posix()
+            "pretrained_model_path_or_zip_file": (project_path / "trained_models/dfnet-nx-dns3.zip").as_posix()
         }
     },
     "frcrn-dns3": {
@@ -74,6 +74,12 @@ denoise_engines = {
             "pretrained_model_path_or_zip_file": (project_path / "trained_models/frcrn-dns3.zip").as_posix()
         }
     },
+    "mpnet-nx-speech": {
+        "infer_cls": InferenceMPNet,
+        "kwargs": {
+            "pretrained_model_path_or_zip_file": (project_path / "trained_models/mpnet-nx-speech.zip").as_posix()
+        }
+    },
 }
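Besides reordering the registry, the commit wires the new "dfnet-nx-dns3" engine to InferenceFRCRN rather than the InferenceDfNet class added below, which looks like a copy-paste slip worth double-checking. A hypothetical usage sketch of the registry as declared; the lookup pattern is inferred from the dict shape, not from code shown in this diff:

    engine = "dfnet-nx-dns3"
    entry = denoise_engines[engine]

    # Instantiate the engine class with its pretrained zip path.
    infer_engine = entry["infer_cls"](**entry["kwargs"])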
 
toolbox/torchaudio/models/dfnet/inference_dfnet.py ADDED
@@ -0,0 +1,115 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import logging
+from pathlib import Path
+import shutil
+import tempfile, time
+import zipfile
+
+import librosa
+import numpy as np
+import torch
+import torchaudio
+
+torch.set_num_threads(1)
+
+from project_settings import project_path
+from toolbox.torchaudio.models.dfnet.configuration_dfnet import DfNetConfig
+from toolbox.torchaudio.models.dfnet.modeling_dfnet import DfNetPretrainedModel, MODEL_FILE
+
+logger = logging.getLogger("toolbox")
+
+
+class InferenceDfNet(object):
+    def __init__(self, pretrained_model_path_or_zip_file: str, device: str = "cpu"):
+        self.pretrained_model_path_or_zip_file = pretrained_model_path_or_zip_file
+        self.device = torch.device(device)
+
+        logger.info(f"loading model; model_file: {self.pretrained_model_path_or_zip_file}")
+        config, model = self.load_models(self.pretrained_model_path_or_zip_file)
+        logger.info(f"model loading completed; model_file: {self.pretrained_model_path_or_zip_file}")
+
+        self.config = config
+        self.model = model
+        self.model.to(device)
+        self.model.eval()
+
+    def load_models(self, model_path: str):
+        model_path = Path(model_path)
+        if model_path.name.endswith(".zip"):
+            with zipfile.ZipFile(model_path.as_posix(), "r") as f_zip:
+                out_root = Path(tempfile.gettempdir()) / "nx_denoise"
+                out_root.mkdir(parents=True, exist_ok=True)
+                f_zip.extractall(path=out_root)
+            model_path = out_root / model_path.stem
+
+        config = DfNetConfig.from_pretrained(
+            pretrained_model_name_or_path=model_path.as_posix(),
+        )
+        model = DfNetPretrainedModel.from_pretrained(
+            pretrained_model_name_or_path=model_path.as_posix(),
+        )
+        model.to(self.device)
+        model.eval()
+
+        shutil.rmtree(model_path)
+        return config, model
+
+    def enhancement_by_ndarray(self, noisy_audio: np.ndarray) -> np.ndarray:
+        noisy_audio = torch.tensor(noisy_audio, dtype=torch.float32)
+        noisy_audio = noisy_audio.unsqueeze(dim=0)
+
+        # noisy_audio shape: [batch_size, n_samples]
+        enhanced_audio = self.enhancement_by_tensor(noisy_audio)
+        # enhanced_audio shape: [channels, num_samples]
+        enhanced_audio = enhanced_audio[0]
+        # enhanced_audio shape: [num_samples]
+        return enhanced_audio.cpu().numpy()
+
+    def enhancement_by_tensor(self, noisy_audio: torch.Tensor) -> torch.Tensor:
+        if torch.max(noisy_audio) > 1 or torch.min(noisy_audio) < -1:
+            raise AssertionError(f"The value range of audio samples should be between -1 and 1.")
+
+        # noisy_audio shape: [batch_size, num_samples]
+        noisy_audios = noisy_audio.to(self.device)
+
+        with torch.no_grad():
+            est_spec, est_wav, est_mask, lsnr = self.model.forward(noisy_audios)
+
+        # shape: [batch_size, num_samples]
+        enhanced_audio = torch.unsqueeze(est_wav, dim=1)
+        # shape: [batch_size, 1, num_samples]
+
+        enhanced_audio = enhanced_audio[0]
+        # shape: [channels, num_samples]
+        return enhanced_audio
+
+
+def main():
+    model_zip_file = project_path / "trained_models/dfnet-nx-dns3.zip"
+    infer_model = InferenceDfNet(model_zip_file)
+
+    sample_rate = 8000
+    noisy_audio_file = project_path / "data/examples/ai_agent/dfaaf264-b5e3-4ca2-b5cb-5b6d637d962d_section_3.wav"
+    noisy_audio, sample_rate = librosa.load(
+        noisy_audio_file.as_posix(),
+        sr=sample_rate,
+    )
+    duration = librosa.get_duration(y=noisy_audio, sr=sample_rate)
+    # noisy_audio = noisy_audio[int(7*sample_rate):int(9*sample_rate)]
+    noisy_audio = torch.tensor(noisy_audio, dtype=torch.float32)
+    noisy_audio = noisy_audio.unsqueeze(dim=0)
+
+    begin = time.time()
+    enhanced_audio = infer_model.enhancement_by_tensor(noisy_audio)
+    time_cost = time.time() - begin
+    print(f"enhanced_audio.shape: {enhanced_audio.shape}, time_cost: {time_cost:.4f}, audio_duration: {duration:.4f}, fpr: {time_cost / duration:.4f}")
+
+    filename = "enhanced_audio.wav"
+    torchaudio.save(filename, enhanced_audio.detach().cpu(), sample_rate)
+
+    return
+
+
+if __name__ == "__main__":
+    main()
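A minimal usage sketch for the new InferenceDfNet class via its ndarray entry point; the random input is a placeholder for real 8 kHz audio in [-1, 1]. Note that enhancement_by_tensor unpacks four outputs from model.forward while the updated training script unpacks five (including erb_encoder_h), so one of the two presumably tracks an older model signature:

    import numpy as np

    from project_settings import project_path
    from toolbox.torchaudio.models.dfnet.inference_dfnet import InferenceDfNet

    infer = InferenceDfNet((project_path / "trained_models/dfnet-nx-dns3.zip").as_posix())

    noisy = np.random.uniform(-0.5, 0.5, size=(8000,)).astype(np.float32)  # 1 s at 8 kHz
    enhanced = infer.enhancement_by_ndarray(noisy)  # returns a [num_samples] ndarray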
toolbox/torchaudio/models/dfnet/yaml/config.yaml ADDED
@@ -0,0 +1,74 @@
+model_name: "dfnet"
+
+# spec
+sample_rate: 8000
+nfft: 512
+win_size: 200
+hop_size: 80
+
+spec_bins: 256
+
+# model
+conv_channels: 64
+conv_kernel_size_input:
+- 3
+- 3
+conv_kernel_size_inner:
+- 1
+- 3
+conv_lookahead: 0
+
+convt_kernel_size_inner:
+- 1
+- 3
+
+embedding_hidden_size: 256
+encoder_combine_op: "concat"
+
+encoder_emb_skip_op: "none"
+encoder_emb_linear_groups: 16
+encoder_emb_hidden_size: 256
+
+encoder_linear_groups: 32
+
+decoder_emb_num_layers: 3
+decoder_emb_skip_op: "none"
+decoder_emb_linear_groups: 16
+decoder_emb_hidden_size: 256
+
+df_decoder_hidden_size: 256
+df_num_layers: 2
+df_order: 5
+df_bins: 96
+df_gru_skip: "grouped_linear"
+df_decoder_linear_groups: 16
+df_pathway_kernel_size_t: 5
+df_lookahead: 2
+
+# lsnr
+n_frame: 3
+lsnr_max: 30
+lsnr_min: -15
+norm_tau: 1.
+
+# data
+min_snr_db: -10
+max_snr_db: 20
+
+# train
+lr: 0.001
+lr_scheduler: "CosineAnnealingLR"
+lr_scheduler_kwargs:
+  T_max: 250000
+  eta_min: 0.0001
+
+max_epochs: 100
+clip_grad_norm: 10.0
+seed: 1234
+
+num_workers: 8
+batch_size: 64
+eval_steps: 10000
+
+# runtime
+use_post_filter: true
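A worked check of the STFT timing this config implies; the inputs are copied from the YAML and the derived numbers are plain arithmetic, not additional settings:

    sample_rate = 8000
    win_size = 200
    hop_size = 80

    print(win_size / sample_rate * 1000)  # 25.0 ms analysis window
    print(hop_size / sample_rate * 1000)  # 10.0 ms hop, i.e. 100 frames per second
    print(win_size - hop_size)            # 120-sample overlap carried between streaming chunks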
toolbox/torchaudio/modules/conv_stft.py CHANGED
@@ -141,6 +141,7 @@ class ConviSTFT(nn.Module):
         # waveform = waveform / coff
         return waveform
 
+    @torch.no_grad()
     def forward_chunk(self,
                       spec: torch.Tensor,
                       waveform_cache: torch.Tensor = None,
@@ -163,22 +164,14 @@
         overlap_size = self.win_size - self.hop_size
 
         if waveform_cache is not None:
-            waveform_overlap = waveform_current[:, :, :overlap_size] + waveform_cache
-            waveform_non_overlap = waveform_current[:, :, overlap_size:-self.hop_size]
-            waveform_output = torch.cat(tensors=[waveform_overlap, waveform_non_overlap], dim=-1)
-            new_waveform_cache = waveform_current[:, :, -self.hop_size:]
-        else:
-            waveform_output = waveform_current[:, :, :-self.hop_size]
-            new_waveform_cache = waveform_current[:, :, -self.hop_size:]
+            waveform_current[:, :, :overlap_size] += waveform_cache
+        waveform_output = waveform_current[:, :, :self.hop_size]
+        new_waveform_cache = waveform_current[:, :, self.hop_size:]
 
         if coff_cache is not None:
-            coff_overlap = coff_current[:, :, :overlap_size] + coff_cache
-            coff_non_overlap = coff_current[:, :, overlap_size:-self.hop_size]
-            coff_output = torch.cat(tensors=[coff_overlap, coff_non_overlap], dim=-1)
-            new_coff_cache = coff_current[:, :, -self.hop_size:]
-        else:
-            coff_output = coff_current[:, :, :-self.hop_size]
-            new_coff_cache = coff_current[:, :, -self.hop_size:]
+            coff_current[:, :, :overlap_size] += coff_cache
+        coff_output = coff_current[:, :, :self.hop_size]
+        new_coff_cache = coff_current[:, :, self.hop_size:]
 
         waveform_output = waveform_output / (coff_output + 1e-8)
         return waveform_output, new_waveform_cache, new_coff_cache
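The rewritten forward_chunk emits exactly hop_size samples per chunk and carries the remaining win_size - hop_size samples forward as cache, replacing the earlier variable-length output. A standalone numpy sketch of that cache discipline (one frame per chunk, synthesis window of ones, the coff normalization omitted), verifying that the streamed output matches one-shot overlap-add:

    import numpy as np

    win_size, hop_size = 200, 80
    overlap_size = win_size - hop_size  # 120 samples carried across chunks

    frames = [np.random.randn(win_size) for _ in range(5)]

    # Reference: plain overlap-add over all frames at once.
    full = np.zeros(hop_size * len(frames) + overlap_size)
    for i, frame in enumerate(frames):
        full[i * hop_size: i * hop_size + win_size] += frame

    # Streaming: add the cache onto the overlap region, emit hop_size samples,
    # keep the tail as the new cache -- the same shape as the revised forward_chunk.
    cache = None
    streamed = []
    for frame in frames:
        current = frame.copy()
        if cache is not None:
            current[:overlap_size] += cache
        streamed.append(current[:hop_size])
        cache = current[hop_size:]
    streamed = np.concatenate(streamed)

    assert np.allclose(streamed, full[:len(streamed)])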
toolbox/torchaudio/modules/utils/__init__.py ADDED
@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == "__main__":
+    pass
toolbox/torchaudio/modules/utils/ema.py ADDED
@@ -0,0 +1,12 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import torch.nn as nn
+
+
+class ExponentialMovingAverage(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+
+if __name__ == "__main__":
+    pass
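The added ExponentialMovingAverage is still an empty stub. For reference, a generic shadow-weight EMA of the kind such a module usually implements; this sketches the common pattern only, not this repo's eventual implementation:

    import torch
    import torch.nn as nn

    class ShadowEMA:
        def __init__(self, model: nn.Module, decay: float = 0.999):
            self.decay = decay
            self.shadow = {k: v.detach().clone() for k, v in model.state_dict().items()}

        @torch.no_grad()
        def update(self, model: nn.Module):
            # shadow <- decay * shadow + (1 - decay) * current weights
            for k, v in model.state_dict().items():
                if v.dtype.is_floating_point:
                    self.shadow[k].mul_(self.decay).add_(v, alpha=1.0 - self.decay)
                else:
                    self.shadow[k].copy_(v)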