HoneyTian committed
Commit 85a1b16 · 1 Parent(s): 8c3c188

add microphone audio input
Dockerfile CHANGED
@@ -4,6 +4,9 @@ WORKDIR /code
 
  COPY . /code
 
+ RUN apt-get update
+ RUN apt-get install -y ffmpeg build-essential
+
  RUN pip install --upgrade pip
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
 
examples/dfnet/step_2_train_model.py CHANGED
@@ -15,6 +15,8 @@ import sys
  import shutil
  from typing import List
 
+ from fontTools.varLib.plot import stops
+
  pwd = os.path.abspath(os.path.dirname(__file__))
  sys.path.append(os.path.join(pwd, "../../"))
 
@@ -243,7 +245,11 @@ def main():
  step_idx = 0 if last_step_idx == -1 else last_step_idx
 
  logger.info("training")
+ early_stop_flag = False
  for epoch_idx in range(max(0, last_epoch+1), config.max_epochs):
+ if early_stop_flag:
+ break
+
  # train
  model.train()
 
examples/dfnet/yaml/config.yaml CHANGED
@@ -68,7 +68,7 @@ seed: 1234
 
  num_workers: 8
  batch_size: 32
- eval_steps: 10000
+ eval_steps: 25000
 
  # runtime
  use_post_filter: true
examples/dtln/step_2_train_model.py CHANGED
@@ -235,7 +235,11 @@ def main():
  step_idx = 0 if last_step_idx == -1 else last_step_idx
 
  logger.info("training")
+ early_stop_flag = False
  for epoch_idx in range(max(0, last_epoch+1), config.max_epochs):
+ if early_stop_flag:
+ break
+
  # train
  model.train()
 
examples/dtln/yaml/config.yaml CHANGED
@@ -1,23 +1,29 @@
  model_name: "DTLN"
 
+ # spec
  sample_rate: 8000
  fft_size: 256
  hop_size: 128
  win_type: hann
 
+ # data
  max_snr_db: 20
  min_snr_db: -10
 
+ # model
  encoder_size: 256
 
- max_epochs: 100
- batch_size: 4
- num_workers: 4
- seed: 1234
- eval_steps: 25000
-
+ # train
  lr: 0.001
- lr_scheduler: CosineAnnealingLR
- lr_scheduler_kwargs: {}
+ lr_scheduler: "CosineAnnealingLR"
+ lr_scheduler_kwargs:
+   T_max: 250000
+   eta_min: 0.0001
 
+ max_epochs: 100
  clip_grad_norm: 10.0
+ seed: 1234
+
+ batch_size: 32
+ num_workers: 4
+ eval_steps: 25000
examples/frcrn/step_2_train_model.py CHANGED
@@ -238,7 +238,11 @@ def main():
  step_idx = 0 if last_step_idx == -1 else last_step_idx
 
  logger.info("training")
+ early_stop_flag = False
  for epoch_idx in range(max(0, last_epoch+1), config.max_epochs):
+ if early_stop_flag:
+ break
+
  # train
  model.train()
 
examples/{simple_lstm_irm → lstm}/run.sh RENAMED
File without changes
examples/{simple_lstm_irm → lstm}/step_1_prepare_data.py RENAMED
File without changes
examples/lstm/step_2_train_model.py ADDED
@@ -0,0 +1,476 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://github.com/WenzheLiu-Speech/awesome-speech-enhancement
5
+ """
6
+ import argparse
7
+ import json
8
+ import logging
9
+ from logging.handlers import TimedRotatingFileHandler
10
+ import os
11
+ import platform
12
+ from pathlib import Path
13
+ import random
14
+ import sys
15
+ import shutil
16
+ from typing import List
17
+
18
+ pwd = os.path.abspath(os.path.dirname(__file__))
19
+ sys.path.append(os.path.join(pwd, "../../"))
20
+
21
+ import numpy as np
22
+ import torch
23
+ import torch.nn as nn
24
+ from torch.utils.data.dataloader import DataLoader
25
+ import torchaudio
26
+ from tqdm import tqdm
27
+
28
+ from toolbox.torch.utils.data.dataset.denoise_jsonl_dataset import DenoiseJsonlDataset
29
+ from toolbox.torchaudio.metrics.pesq import run_pesq_score
30
+ from toolbox.torchaudio.models.lstm.configuration_lstm import LstmConfig
31
+ from toolbox.torchaudio.models.lstm.modeling_lstm import LstmPretrainedModel
32
+
33
+
34
+ def get_args():
35
+ parser = argparse.ArgumentParser()
36
+ parser.add_argument("--train_dataset", default="train.jsonl", type=str)
37
+ parser.add_argument("--valid_dataset", default="valid.jsonl", type=str)
38
+ parser.add_argument("--max_epochs", default=100, type=int)
39
+
40
+ parser.add_argument("--batch_size", default=64, type=int)
41
+ parser.add_argument("--learning_rate", default=1e-3, type=float)
42
+ parser.add_argument("--num_serialized_models_to_keep", default=15, type=int)
43
+ parser.add_argument("--patience", default=10, type=int)
44
+ parser.add_argument("--serialization_dir", default="serialization_dir", type=str)
45
+ parser.add_argument("--seed", default=0, type=int)
46
+
47
+ parser.add_argument("--config_file", default="config.yaml", type=str)
48
+
49
+ args = parser.parse_args()
50
+ return args
51
+
52
+
53
+ def logging_config(file_dir: str):
54
+ fmt = "%(asctime)s - %(name)s - %(levelname)s %(filename)s:%(lineno)d > %(message)s"
55
+
56
+ logging.basicConfig(format=fmt,
57
+ datefmt="%m/%d/%Y %H:%M:%S",
58
+ level=logging.INFO)
59
+ file_handler = TimedRotatingFileHandler(
60
+ filename=os.path.join(file_dir, "main.log"),
61
+ encoding="utf-8",
62
+ when="D",
63
+ interval=1,
64
+ backupCount=7
65
+ )
66
+ file_handler.setLevel(logging.INFO)
67
+ file_handler.setFormatter(logging.Formatter(fmt))
68
+ logger = logging.getLogger(__name__)
69
+ logger.addHandler(file_handler)
70
+
71
+ return logger
72
+
73
+
74
+ class CollateFunction(object):
75
+ def __init__(self,
76
+ n_fft: int = 512,
77
+ win_length: int = 200,
78
+ hop_length: int = 80,
79
+ window_fn: str = "hamming",
80
+ irm_beta: float = 1.0,
81
+ epsilon: float = 1e-8,
82
+ ):
83
+ self.n_fft = n_fft
84
+ self.win_length = win_length
85
+ self.hop_length = hop_length
86
+ self.window_fn = window_fn
87
+ self.irm_beta = irm_beta
88
+ self.epsilon = epsilon
89
+
90
+ self.stft_mag = torchaudio.transforms.Spectrogram(
91
+ n_fft=self.n_fft,
92
+ win_length=self.win_length,
93
+ hop_length=self.hop_length,
94
+ power=1.0,
95
+ window_fn=torch.hamming_window if window_fn == "hamming" else torch.hann_window,
96
+ )
97
+ self.stft_complex = torchaudio.transforms.Spectrogram(
98
+ n_fft=self.n_fft,
99
+ win_length=self.win_length,
100
+ hop_length=self.hop_length,
101
+ power=None,
102
+ window_fn=torch.hamming_window if window_fn == "hamming" else torch.hann_window,
103
+ )
104
+
105
+ self.istft = torchaudio.transforms.InverseSpectrogram(
106
+ n_fft=self.n_fft,
107
+ win_length=self.win_length,
108
+ hop_length=self.hop_length,
109
+ window_fn=torch.hamming_window if window_fn == "hamming" else torch.hann_window,
110
+ )
111
+
112
+ def __call__(self, batch: List[dict]):
113
+ mag_noisy_audios = list()
114
+ pha_noisy_audios = list()
115
+ irm_gth = list()
116
+
117
+ clean_audios = list()
118
+
119
+ for sample in batch:
120
+ noise_audio: torch.Tensor = sample["noise_wave"]
121
+ clean_audio: torch.Tensor = sample["speech_wave"]
122
+ noisy_audio: torch.Tensor = sample["mix_wave"]
123
+ snr_db: float = sample["snr_db"]
124
+
125
+ mag_noise = self.stft_mag.forward(noise_audio)
126
+ mag_clean = self.stft_mag.forward(clean_audio)
127
+ stft_noisy = self.stft_complex.forward(noisy_audio)
128
+
129
+ irm_clean = mag_clean / (mag_noise + mag_clean + self.epsilon)
130
+ irm_clean = torch.pow(irm_clean, self.irm_beta)
131
+
132
+ real = torch.real(stft_noisy)
133
+ imag = torch.imag(stft_noisy)
134
+ mag_noisy = torch.sqrt(real ** 2 + imag ** 2)
135
+ pha_noisy = torch.atan2(imag, real)
136
+
137
+ mag_noisy_audios.append(mag_noisy)
138
+ pha_noisy_audios.append(pha_noisy)
139
+ irm_gth.append(irm_clean)
140
+ clean_audios.append(clean_audio)
141
+
142
+ mag_noisy_audios = torch.stack(mag_noisy_audios)
143
+ pha_noisy_audios = torch.stack(pha_noisy_audios)
144
+ irm_gth = torch.stack(irm_gth)
145
+ clean_audios = torch.stack(clean_audios)
146
+
147
+ # assert
148
+ if torch.any(torch.isnan(mag_noisy_audios)):
149
+ raise AssertionError("nan in mag_noisy_audios Tensor")
150
+ if torch.any(torch.isnan(pha_noisy_audios)):
151
+ raise AssertionError("nan in pha_noisy_audios Tensor")
152
+ if torch.any(torch.isnan(irm_gth)):
153
+ raise AssertionError("nan in irm_gth Tensor")
154
+ if torch.any(torch.isnan(clean_audios)):
155
+ raise AssertionError("nan in clean_audios Tensor")
156
+
157
+ return mag_noisy_audios, pha_noisy_audios, irm_gth, clean_audios
158
+
159
+ def enhance(self, mag_noisy: torch.Tensor, pha_noisy: torch.Tensor, irm_speech: torch.Tensor):
160
+ mag_denoise = mag_noisy * irm_speech
161
+ stft_denoise = mag_denoise * torch.exp((1j * pha_noisy))
162
+ denoise = self.istft.forward(stft_denoise)
163
+ return denoise
164
+
165
+
166
+ collate_fn = CollateFunction()
167
+
168
+
169
+ def main():
170
+ args = get_args()
171
+
172
+ config = LstmConfig.from_pretrained(
173
+ pretrained_model_name_or_path=args.config_file,
174
+ )
175
+
176
+ serialization_dir = Path(args.serialization_dir)
177
+ serialization_dir.mkdir(parents=True, exist_ok=True)
178
+
179
+ logger = logging_config(serialization_dir)
180
+
181
+ random.seed(args.seed)
182
+ np.random.seed(args.seed)
183
+ torch.manual_seed(args.seed)
184
+ logger.info("set seed: {}".format(args.seed))
185
+
186
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
187
+ n_gpu = torch.cuda.device_count()
188
+ logger.info("GPU available count: {}; device: {}".format(n_gpu, device))
189
+
190
+ # datasets
191
+ logger.info("prepare datasets")
192
+ train_dataset = DenoiseJsonlDataset(
193
+ jsonl_file=args.train_dataset,
194
+ expected_sample_rate=config.sample_rate,
195
+ max_wave_value=32768.0,
196
+ min_snr_db=config.min_snr_db,
197
+ max_snr_db=config.max_snr_db,
198
+ # skip=225000,
199
+ )
200
+ valid_dataset = DenoiseJsonlDataset(
201
+ jsonl_file=args.valid_dataset,
202
+ expected_sample_rate=config.sample_rate,
203
+ max_wave_value=32768.0,
204
+ min_snr_db=config.min_snr_db,
205
+ max_snr_db=config.max_snr_db,
206
+ )
207
+ train_data_loader = DataLoader(
208
+ dataset=train_dataset,
209
+ batch_size=config.batch_size,
210
+ # shuffle=True,
211
+ sampler=None,
212
+ # Linux 系统中可以使用多个子进程加载数据, 而在 Windows 系统中不能.
213
+ num_workers=0 if platform.system() == "Windows" else os.cpu_count() // 2,
214
+ collate_fn=collate_fn,
215
+ pin_memory=False,
216
+ prefetch_factor=None if platform.system() == "Windows" else 2,
217
+ )
218
+ valid_data_loader = DataLoader(
219
+ dataset=valid_dataset,
220
+ batch_size=config.batch_size,
221
+ # shuffle=True,
222
+ sampler=None,
223
+ # Linux 系统中可以使用多个子进程加载数据, 而在 Windows 系统中不能.
224
+ num_workers=0 if platform.system() == "Windows" else os.cpu_count() // 2,
225
+ collate_fn=collate_fn,
226
+ pin_memory=False,
227
+ prefetch_factor=None if platform.system() == "Windows" else 2,
228
+ )
229
+
230
+ # models
231
+ logger.info(f"prepare models. config_file: {args.config_file}")
232
+ model = LstmPretrainedModel(
233
+ config=config,
234
+ )
235
+ model.to(device)
236
+ model.train()
237
+
238
+ # optimizer
239
+ logger.info("prepare optimizer, lr_scheduler, loss_fn, evaluation_metric")
240
+ optimizer = torch.optim.AdamW(model.parameters(), config.lr)
241
+
242
+ # resume training
243
+ last_step_idx = -1
244
+ last_epoch = -1
245
+ for step_idx_str in serialization_dir.glob("steps-*"):
246
+ step_idx_str = Path(step_idx_str)
247
+ step_idx = step_idx_str.stem.split("-")[1]
248
+ step_idx = int(step_idx)
249
+ if step_idx > last_step_idx:
250
+ last_step_idx = step_idx
251
+ # last_epoch = 1
252
+
253
+ if last_step_idx != -1:
254
+ logger.info(f"resume from steps-{last_step_idx}.")
255
+ model_pt = serialization_dir / f"steps-{last_step_idx}/model.pt"
256
+ optimizer_pth = serialization_dir / f"steps-{last_step_idx}/optimizer.pth"
257
+
258
+ logger.info(f"load state dict for model.")
259
+ with open(model_pt.as_posix(), "rb") as f:
260
+ state_dict = torch.load(f, map_location="cpu", weights_only=True)
261
+ model.load_state_dict(state_dict, strict=True)
262
+
263
+ logger.info(f"load state dict for optimizer.")
264
+ with open(optimizer_pth.as_posix(), "rb") as f:
265
+ state_dict = torch.load(f, map_location="cpu", weights_only=True)
266
+ optimizer.load_state_dict(state_dict)
267
+
268
+ if config.lr_scheduler == "CosineAnnealingLR":
269
+ lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
270
+ optimizer,
271
+ last_epoch=last_epoch,
272
+ # T_max=10 * config.eval_steps,
273
+ # eta_min=0.01 * config.lr,
274
+ **config.lr_scheduler_kwargs,
275
+ )
276
+ elif config.lr_scheduler == "MultiStepLR":
277
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
278
+ optimizer,
279
+ last_epoch=last_epoch,
280
+ milestones=[10000, 20000, 30000, 40000, 50000], gamma=0.5
281
+ )
282
+ else:
283
+ raise AssertionError(f"invalid lr_scheduler: {config.lr_scheduler}")
284
+
285
+ mse_loss_fn = nn.MSELoss(
286
+ reduction="mean",
287
+ ).to(device)
288
+
289
+ # training loop
290
+ logger.info("training")
291
+
292
+ average_pesq_score = 1000000000
293
+ average_loss = 1000000000
294
+
295
+ model_list = list()
296
+ best_epoch_idx = None
297
+ best_step_idx = None
298
+ best_metric = None
299
+ patience_count = 0
300
+
301
+ step_idx = 0 if last_step_idx == -1 else last_step_idx
302
+
303
+ logger.info("training")
304
+ early_stop_flag = False
305
+ for epoch_idx in range(max(0, last_epoch+1), config.max_epochs):
306
+ if early_stop_flag:
307
+ break
308
+
309
+ # train
310
+ model.train()
311
+
312
+ total_pesq_score = 0.
313
+ total_loss = 0.
314
+ total_batches = 0.
315
+
316
+ progress_bar_train = tqdm(
317
+ initial=step_idx,
318
+ desc="Training; epoch: {}".format(epoch_idx),
319
+ )
320
+ for train_batch in train_data_loader:
321
+ mag_noisy_audios, pha_noisy_audios, irm_gth, clean_audios = train_batch
322
+ mag_noisy_audios = mag_noisy_audios.to(device)
323
+ pha_noisy_audios = pha_noisy_audios.to(device)
324
+ irm_gth = irm_gth.to(device)
325
+ clean_audios = clean_audios.to(device)
326
+
327
+ irm = model.forward(mag_noisy_audios)
328
+ denoise_audios = collate_fn.enhance(mag_noisy_audios, pha_noisy_audios, irm)
329
+ loss = mse_loss_fn.forward(irm, irm_gth)
330
+
331
+ denoise_audios_list_r = list(denoise_audios.detach().cpu().numpy())
332
+ clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
333
+ pesq_score = run_pesq_score(clean_audios_list_r, denoise_audios_list_r, sample_rate=config.sample_rate, mode="nb")
334
+
335
+ optimizer.zero_grad()
336
+ loss.backward()
337
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.clip_grad_norm)
338
+ optimizer.step()
339
+ lr_scheduler.step()
340
+
341
+ total_pesq_score += pesq_score
342
+ total_loss += loss.item()
343
+ total_batches += 1
344
+
345
+ average_pesq_score = round(total_pesq_score / total_batches, 4)
346
+ average_loss = round(total_loss / total_batches, 4)
347
+
348
+ progress_bar_train.update(1)
349
+ progress_bar_train.set_postfix({
350
+ "lr": lr_scheduler.get_last_lr()[0],
351
+ "pesq_score": average_pesq_score,
352
+ "loss": average_loss,
353
+ })
354
+
355
+ # evaluation
356
+ step_idx += 1
357
+ if step_idx % config.eval_steps == 0:
358
+ with torch.no_grad():
359
+ torch.cuda.empty_cache()
360
+
361
+ total_pesq_score = 0.
362
+ total_loss = 0.
363
+ total_batches = 0.
364
+
365
+ progress_bar_train.close()
366
+ progress_bar_eval = tqdm(
367
+ desc="Evaluation; steps-{}k".format(int(step_idx / 1000)),
368
+ )
369
+
370
+ for eval_batch in valid_data_loader:
371
+ mag_noisy_audios, pha_noisy_audios, irm_gth, clean_audios = eval_batch
372
+ mag_noisy_audios = mag_noisy_audios.to(device)
373
+ pha_noisy_audios = pha_noisy_audios.to(device)
374
+ irm_gth = irm_gth.to(device)
375
+ clean_audios = clean_audios.to(device)
376
+
377
+ with torch.no_grad():
378
+ irm = model.forward(mag_noisy_audios)
379
+ denoise_audios = collate_fn.enhance(mag_noisy_audios, pha_noisy_audios, irm)
380
+ loss = mse_loss_fn.forward(irm, irm_gth)
381
+
382
+ denoise_audios_list_r = list(denoise_audios.detach().cpu().numpy())
383
+ clean_audios_list_r = list(clean_audios.detach().cpu().numpy())
384
+ pesq_score = run_pesq_score(clean_audios_list_r, denoise_audios_list_r, sample_rate=config.sample_rate, mode="nb")
385
+
386
+ optimizer.zero_grad()
387
+ loss.backward()
388
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.clip_grad_norm)
389
+ optimizer.step()
390
+ lr_scheduler.step()
391
+
392
+ total_pesq_score += pesq_score
393
+ total_loss += loss.item()
394
+ total_batches += 1
395
+
396
+ average_pesq_score = round(total_pesq_score / total_batches, 4)
397
+ average_loss = round(total_loss / total_batches, 4)
398
+
399
+ progress_bar_eval.update(1)
400
+ progress_bar_eval.set_postfix({
401
+ "lr": lr_scheduler.get_last_lr()[0],
402
+ "pesq_score": average_pesq_score,
403
+ "loss": average_loss,
404
+ })
405
+
406
+ total_pesq_score = 0.
407
+ total_loss = 0.
408
+ total_batches = 0.
409
+
410
+ progress_bar_eval.close()
411
+ progress_bar_train = tqdm(
412
+ initial=progress_bar_train.n,
413
+ postfix=progress_bar_train.postfix,
414
+ desc=progress_bar_train.desc,
415
+ )
416
+
417
+ # save path
418
+ epoch_dir = serialization_dir / "epoch-{}".format(epoch_idx)
419
+ epoch_dir.mkdir(parents=True, exist_ok=False)
420
+
421
+ # save models
422
+ model.save_pretrained(epoch_dir.as_posix())
423
+
424
+ model_list.append(epoch_dir)
425
+ if len(model_list) >= args.num_serialized_models_to_keep:
426
+ model_to_delete: Path = model_list.pop(0)
427
+ shutil.rmtree(model_to_delete.as_posix())
428
+
429
+ # save metric
430
+ if best_metric is None:
431
+ best_epoch_idx = epoch_idx
432
+ best_step_idx = step_idx
433
+ best_metric = average_pesq_score
434
+ elif average_pesq_score >= best_metric:
435
+ # great is better.
436
+ best_epoch_idx = epoch_idx
437
+ best_step_idx = step_idx
438
+ best_metric = average_pesq_score
439
+ else:
440
+ pass
441
+
442
+ metrics = {
443
+ "epoch_idx": epoch_idx,
444
+ "best_epoch_idx": best_epoch_idx,
445
+ "best_step_idx": best_step_idx,
446
+ "pesq_score": average_pesq_score,
447
+ "loss": average_loss,
448
+ }
449
+ metrics_filename = epoch_dir / "metrics_epoch.json"
450
+ with open(metrics_filename, "w", encoding="utf-8") as f:
451
+ json.dump(metrics, f, indent=4, ensure_ascii=False)
452
+
453
+ # save best
454
+ best_dir = serialization_dir / "best"
455
+ if best_epoch_idx == epoch_idx:
456
+ if best_dir.exists():
457
+ shutil.rmtree(best_dir)
458
+ shutil.copytree(epoch_dir, best_dir)
459
+
460
+ # early stop
461
+ early_stop_flag = False
462
+ if best_epoch_idx == epoch_idx and best_step_idx == step_idx:
463
+ patience_count = 0
464
+ else:
465
+ patience_count += 1
466
+ if patience_count >= args.patience:
467
+ early_stop_flag = True
468
+
469
+ # early stop
470
+ if early_stop_flag:
471
+ break
472
+ return
473
+
474
+
475
+ if __name__ == '__main__':
476
+ main()
examples/{simple_lstm_irm → lstm}/step_3_evaluation.py RENAMED
@@ -19,7 +19,7 @@ import torch.nn as nn
  import torchaudio
  from tqdm import tqdm
 
- from toolbox.torchaudio.models.simple_lstm_irm.modeling_simple_lstm_irm import SimpleLstmIRMPretrainedModel
+ from toolbox.torchaudio.models.lstm.modeling_lstm import LstmPretrainedModel
 
 
  def get_args():
@@ -147,7 +147,7 @@ def main():
  logger.info("GPU available count: {}; device: {}".format(n_gpu, device))
 
  logger.info("prepare model")
- model = SimpleLstmIRMPretrainedModel.from_pretrained(
+ model = LstmPretrainedModel.from_pretrained(
  pretrained_model_name_or_path=args.model_dir,
  )
  model.to(device)
examples/mpnet/step_2_train_model.py CHANGED
@@ -225,7 +225,11 @@ def main():
  patience_count = 0
 
  logger.info("training")
+ early_stop_flag = False
  for idx_epoch in range(max(0, last_epoch+1), args.max_epochs):
+ if early_stop_flag:
+ break
+
  # train
  generator.train()
  discriminator.train()
examples/simple_lstm_irm/step_2_train_model.py DELETED
@@ -1,346 +0,0 @@
1
- #!/usr/bin/python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- https://github.com/WenzheLiu-Speech/awesome-speech-enhancement
5
- """
6
- import argparse
7
- import json
8
- import logging
9
- from logging.handlers import TimedRotatingFileHandler
10
- import os
11
- import platform
12
- from pathlib import Path
13
- import random
14
- import sys
15
- import shutil
16
- from typing import List
17
-
18
- pwd = os.path.abspath(os.path.dirname(__file__))
19
- sys.path.append(os.path.join(pwd, "../../"))
20
-
21
- import numpy as np
22
- import torch
23
- import torch.nn as nn
24
- from torch.utils.data.dataloader import DataLoader
25
- import torchaudio
26
- from tqdm import tqdm
27
-
28
- from toolbox.torch.utils.data.dataset.denoise_excel_dataset import DenoiseExcelDataset
29
- from toolbox.torchaudio.models.simple_lstm_irm.configuration_simple_lstm_irm import SimpleLstmIRMConfig
30
- from toolbox.torchaudio.models.simple_lstm_irm.modeling_simple_lstm_irm import SimpleLstmIRMPretrainedModel
31
-
32
-
33
- def get_args():
34
- parser = argparse.ArgumentParser()
35
- parser.add_argument("--train_dataset", default="train.xlsx", type=str)
36
- parser.add_argument("--valid_dataset", default="valid.xlsx", type=str)
37
-
38
- parser.add_argument("--max_epochs", default=100, type=int)
39
-
40
- parser.add_argument("--batch_size", default=64, type=int)
41
- parser.add_argument("--learning_rate", default=1e-3, type=float)
42
- parser.add_argument("--num_serialized_models_to_keep", default=10, type=int)
43
- parser.add_argument("--patience", default=5, type=int)
44
- parser.add_argument("--serialization_dir", default="serialization_dir", type=str)
45
- parser.add_argument("--seed", default=0, type=int)
46
-
47
- parser.add_argument("--config_file", default="config.yaml", type=str)
48
-
49
- args = parser.parse_args()
50
- return args
51
-
52
-
53
- def logging_config(file_dir: str):
54
- fmt = "%(asctime)s - %(name)s - %(levelname)s %(filename)s:%(lineno)d > %(message)s"
55
-
56
- logging.basicConfig(format=fmt,
57
- datefmt="%m/%d/%Y %H:%M:%S",
58
- level=logging.INFO)
59
- file_handler = TimedRotatingFileHandler(
60
- filename=os.path.join(file_dir, "main.log"),
61
- encoding="utf-8",
62
- when="D",
63
- interval=1,
64
- backupCount=7
65
- )
66
- file_handler.setLevel(logging.INFO)
67
- file_handler.setFormatter(logging.Formatter(fmt))
68
- logger = logging.getLogger(__name__)
69
- logger.addHandler(file_handler)
70
-
71
- return logger
72
-
73
-
74
- class CollateFunction(object):
75
- def __init__(self,
76
- n_fft: int = 512,
77
- win_length: int = 200,
78
- hop_length: int = 80,
79
- window_fn: str = "hamming",
80
- irm_beta: float = 1.0,
81
- epsilon: float = 1e-8,
82
- ):
83
- self.n_fft = n_fft
84
- self.win_length = win_length
85
- self.hop_length = hop_length
86
- self.window_fn = window_fn
87
- self.irm_beta = irm_beta
88
- self.epsilon = epsilon
89
-
90
- self.transform = torchaudio.transforms.Spectrogram(
91
- n_fft=self.n_fft,
92
- win_length=self.win_length,
93
- hop_length=self.hop_length,
94
- power=2.0,
95
- window_fn=torch.hamming_window if window_fn == "hamming" else torch.hann_window,
96
- )
97
-
98
- def __call__(self, batch: List[dict]):
99
- mix_spec_list = list()
100
- speech_irm_list = list()
101
- snr_db_list = list()
102
- for sample in batch:
103
- noise_wave: torch.Tensor = sample["noise_wave"]
104
- speech_wave: torch.Tensor = sample["speech_wave"]
105
- mix_wave: torch.Tensor = sample["mix_wave"]
106
- snr_db: float = sample["snr_db"]
107
-
108
- noise_spec = self.transform.forward(noise_wave)
109
- speech_spec = self.transform.forward(speech_wave)
110
- mix_spec = self.transform.forward(mix_wave)
111
-
112
- # noise_irm = noise_spec / (noise_spec + speech_spec)
113
- speech_irm = speech_spec / (noise_spec + speech_spec + self.epsilon)
114
- speech_irm = torch.pow(speech_irm, self.irm_beta)
115
-
116
- mix_spec_list.append(mix_spec)
117
- speech_irm_list.append(speech_irm)
118
- snr_db_list.append(torch.tensor(snr_db, dtype=torch.float32))
119
-
120
- mix_spec_list = torch.stack(mix_spec_list)
121
- speech_irm_list = torch.stack(speech_irm_list)
122
- snr_db_list = torch.stack(snr_db_list) # shape: (batch_size,)
123
-
124
- # assert
125
- if torch.any(torch.isnan(mix_spec_list)):
126
- raise AssertionError("nan in mix_spec Tensor")
127
- if torch.any(torch.isnan(speech_irm_list)):
128
- raise AssertionError("nan in speech_irm Tensor")
129
- if torch.any(torch.isnan(snr_db_list)):
130
- raise AssertionError("nan in snr_db Tensor")
131
-
132
- return mix_spec_list, speech_irm_list, snr_db_list
133
-
134
-
135
- collate_fn = CollateFunction()
136
-
137
-
138
- def main():
139
- args = get_args()
140
-
141
- serialization_dir = Path(args.serialization_dir)
142
- serialization_dir.mkdir(parents=True, exist_ok=True)
143
-
144
- logger = logging_config(serialization_dir)
145
-
146
- random.seed(args.seed)
147
- np.random.seed(args.seed)
148
- torch.manual_seed(args.seed)
149
- logger.info("set seed: {}".format(args.seed))
150
-
151
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
152
- n_gpu = torch.cuda.device_count()
153
- logger.info("GPU available count: {}; device: {}".format(n_gpu, device))
154
-
155
- # datasets
156
- logger.info("prepare datasets")
157
- train_dataset = DenoiseExcelDataset(
158
- excel_file=args.train_dataset,
159
- expected_sample_rate=8000,
160
- max_wave_value=32768.0,
161
- )
162
- valid_dataset = DenoiseExcelDataset(
163
- excel_file=args.valid_dataset,
164
- expected_sample_rate=8000,
165
- max_wave_value=32768.0,
166
- )
167
- train_data_loader = DataLoader(
168
- dataset=train_dataset,
169
- batch_size=args.batch_size,
170
- shuffle=True,
171
- # Linux 系统中可以使用多个子进程加载数据, 而在 Windows 系统中不能.
172
- num_workers=0 if platform.system() == "Windows" else os.cpu_count() // 2,
173
- collate_fn=collate_fn,
174
- pin_memory=False,
175
- # prefetch_factor=64,
176
- )
177
- valid_data_loader = DataLoader(
178
- dataset=valid_dataset,
179
- batch_size=args.batch_size,
180
- shuffle=True,
181
- # Linux 系统中可以使用多个子进程加载数据, 而在 Windows 系统中不能.
182
- num_workers=0 if platform.system() == "Windows" else os.cpu_count() // 2,
183
- collate_fn=collate_fn,
184
- pin_memory=False,
185
- # prefetch_factor=64,
186
- )
187
-
188
- # models
189
- logger.info(f"prepare models. config_file: {args.config_file}")
190
- config = SimpleLstmIRMConfig.from_pretrained(
191
- pretrained_model_name_or_path=args.config_file,
192
- # num_labels=vocabulary.get_vocab_size(namespace="labels")
193
- )
194
- model = SimpleLstmIRMPretrainedModel(
195
- config=config,
196
- )
197
- model.to(device)
198
- model.train()
199
-
200
- # optimizer
201
- logger.info("prepare optimizer, lr_scheduler, loss_fn, categorical_accuracy")
202
- param_optimizer = model.parameters()
203
- optimizer = torch.optim.Adam(
204
- param_optimizer,
205
- lr=args.learning_rate,
206
- )
207
- # lr_scheduler = torch.optim.lr_scheduler.StepLR(
208
- # optimizer,
209
- # step_size=2000
210
- # )
211
- lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
212
- optimizer,
213
- milestones=[10000, 20000, 30000, 40000, 50000], gamma=0.5
214
- )
215
- mse_loss = nn.MSELoss(
216
- reduction="mean",
217
- )
218
-
219
- # training loop
220
- logger.info("training")
221
-
222
- training_loss = 10000000000
223
- evaluation_loss = 10000000000
224
-
225
- model_list = list()
226
- best_idx_epoch = None
227
- best_metric = None
228
- patience_count = 0
229
-
230
- for idx_epoch in range(args.max_epochs):
231
- total_loss = 0.
232
- total_examples = 0.
233
- progress_bar = tqdm(
234
- total=len(train_data_loader),
235
- desc="Training; epoch: {}".format(idx_epoch),
236
- )
237
-
238
- for batch in train_data_loader:
239
- mix_spec, speech_irm, snr_db = batch
240
- mix_spec = mix_spec.to(device)
241
- speech_irm_target = speech_irm.to(device)
242
- snr_db_target = snr_db.to(device)
243
-
244
- speech_irm_prediction = model.forward(mix_spec)
245
- loss = mse_loss.forward(speech_irm_prediction, speech_irm_target)
246
-
247
- total_loss += loss.item()
248
- total_examples += mix_spec.size(0)
249
-
250
- optimizer.zero_grad()
251
- loss.backward()
252
- optimizer.step()
253
- lr_scheduler.step()
254
-
255
- training_loss = total_loss / total_examples
256
- training_loss = round(training_loss, 4)
257
-
258
- progress_bar.update(1)
259
- progress_bar.set_postfix({
260
- "training_loss": training_loss,
261
- })
262
-
263
- total_loss = 0.
264
- total_examples = 0.
265
- progress_bar = tqdm(
266
- total=len(valid_data_loader),
267
- desc="Evaluation; epoch: {}".format(idx_epoch),
268
- )
269
- for batch in valid_data_loader:
270
- mix_spec, speech_irm, snr_db = batch
271
- mix_spec = mix_spec.to(device)
272
- speech_irm_target = speech_irm.to(device)
273
- snr_db_target = snr_db.to(device)
274
-
275
- with torch.no_grad():
276
- speech_irm_prediction = model.forward(mix_spec)
277
- loss = mse_loss.forward(speech_irm_prediction, speech_irm_target)
278
-
279
- total_loss += loss.item()
280
- total_examples += mix_spec.size(0)
281
-
282
- evaluation_loss = total_loss / total_examples
283
- evaluation_loss = round(evaluation_loss, 4)
284
-
285
- progress_bar.update(1)
286
- progress_bar.set_postfix({
287
- "evaluation_loss": evaluation_loss,
288
- })
289
-
290
- # save path
291
- epoch_dir = serialization_dir / "epoch-{}".format(idx_epoch)
292
- epoch_dir.mkdir(parents=True, exist_ok=False)
293
-
294
- # save models
295
- model.save_pretrained(epoch_dir.as_posix())
296
-
297
- model_list.append(epoch_dir)
298
- if len(model_list) >= args.num_serialized_models_to_keep:
299
- model_to_delete: Path = model_list.pop(0)
300
- shutil.rmtree(model_to_delete.as_posix())
301
-
302
- # save metric
303
- if best_metric is None:
304
- best_idx_epoch = idx_epoch
305
- best_metric = evaluation_loss
306
- elif evaluation_loss < best_metric:
307
- best_idx_epoch = idx_epoch
308
- best_metric = evaluation_loss
309
- else:
310
- pass
311
-
312
- metrics = {
313
- "idx_epoch": idx_epoch,
314
- "best_idx_epoch": best_idx_epoch,
315
- "training_loss": training_loss,
316
- "evaluation_loss": evaluation_loss,
317
- "learning_rate": optimizer.param_groups[0]["lr"],
318
- }
319
- metrics_filename = epoch_dir / "metrics_epoch.json"
320
- with open(metrics_filename, "w", encoding="utf-8") as f:
321
- json.dump(metrics, f, indent=4, ensure_ascii=False)
322
-
323
- # save best
324
- best_dir = serialization_dir / "best"
325
- if best_idx_epoch == idx_epoch:
326
- if best_dir.exists():
327
- shutil.rmtree(best_dir)
328
- shutil.copytree(epoch_dir, best_dir)
329
-
330
- # early stop
331
- early_stop_flag = False
332
- if best_idx_epoch == idx_epoch:
333
- patience_count = 0
334
- else:
335
- patience_count += 1
336
- if patience_count >= args.patience:
337
- early_stop_flag = True
338
-
339
- # early stop
340
- if early_stop_flag:
341
- break
342
- return
343
-
344
-
345
- if __name__ == '__main__':
346
- main()
toolbox/torchaudio/models/dfnet/configuration_dfnet.py CHANGED
@@ -14,6 +14,8 @@ class DfNetConfig(PretrainedConfig):
  win_type: str = "hann",
 
  spec_bins: int = 256,
+ erb_bins: int = 32,
+ min_freq_bins_for_erb: int = 2,
 
  conv_channels: int = 64,
  conv_kernel_size_input: Tuple[int, int] = (3, 3),
@@ -79,6 +81,8 @@ class DfNetConfig(PretrainedConfig):
 
  # spectrum
  self.spec_bins = spec_bins
+ self.erb_bins = erb_bins
+ self.min_freq_bins_for_erb = min_freq_bins_for_erb
 
  # conv
  self.conv_channels = conv_channels
toolbox/torchaudio/models/dfnet/conv_stft.py DELETED
@@ -1,148 +0,0 @@
1
- #!/usr/bin/python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- https://github.com/modelscope/modelscope/blob/master/modelscope/models/audio/ans/conv_stft.py
5
- """
6
- import numpy as np
7
- import torch
8
- import torch.nn as nn
9
- import torch.nn.functional as F
10
- from scipy.signal import get_window
11
-
12
-
13
- def init_kernels(nfft: int, win_size: int, hop_size: int, win_type: str = None, inverse=False):
14
- if win_type == "None" or win_type is None:
15
- window = np.ones(win_size)
16
- else:
17
- window = get_window(win_type, win_size, fftbins=True)**0.5
18
-
19
- fourier_basis = np.fft.rfft(np.eye(nfft))[:win_size]
20
- real_kernel = np.real(fourier_basis)
21
- image_kernel = np.imag(fourier_basis)
22
- kernel = np.concatenate([real_kernel, image_kernel], 1).T
23
-
24
- if inverse:
25
- kernel = np.linalg.pinv(kernel).T
26
-
27
- kernel = kernel * window
28
- kernel = kernel[:, None, :]
29
- result = (
30
- torch.from_numpy(kernel.astype(np.float32)),
31
- torch.from_numpy(window[None, :, None].astype(np.float32))
32
- )
33
- return result
34
-
35
-
36
- class ConvSTFT(nn.Module):
37
-
38
- def __init__(self,
39
- nfft: int,
40
- win_size: int,
41
- hop_size: int,
42
- win_type: str = "hamming",
43
- power: int = None,
44
- requires_grad: bool = False):
45
- super(ConvSTFT, self).__init__()
46
-
47
- if nfft is None:
48
- self.nfft = int(2**np.ceil(np.log2(win_size)))
49
- else:
50
- self.nfft = nfft
51
-
52
- kernel, _ = init_kernels(self.nfft, win_size, hop_size, win_type)
53
- self.weight = nn.Parameter(kernel, requires_grad=requires_grad)
54
-
55
- self.win_size = win_size
56
- self.hop_size = hop_size
57
-
58
- self.stride = hop_size
59
- self.dim = self.nfft
60
- self.power = power
61
-
62
- def forward(self, inputs: torch.Tensor):
63
- if inputs.dim() == 2:
64
- inputs = torch.unsqueeze(inputs, 1)
65
-
66
- matrix = F.conv1d(inputs, self.weight, stride=self.stride)
67
- dim = self.dim // 2 + 1
68
- real = matrix[:, :dim, :]
69
- imag = matrix[:, dim:, :]
70
- spec = torch.complex(real, imag)
71
-
72
- if self.power is None:
73
- return spec
74
- elif self.power == 1:
75
- mags = torch.sqrt(real**2 + imag**2)
76
- # phase = torch.atan2(imag, real)
77
- return mags
78
- elif self.power == 2:
79
- power = real**2 + imag**2
80
- return power
81
- else:
82
- raise AssertionError
83
-
84
-
85
- class ConviSTFT(nn.Module):
86
-
87
- def __init__(self,
88
- win_size: int,
89
- hop_size: int,
90
- nfft: int = None,
91
- win_type: str = "hamming",
92
- requires_grad: bool = False):
93
- super(ConviSTFT, self).__init__()
94
- if nfft is None:
95
- self.nfft = int(2**np.ceil(np.log2(win_size)))
96
- else:
97
- self.nfft = nfft
98
-
99
- kernel, window = init_kernels(self.nfft, win_size, hop_size, win_type, inverse=True)
100
- self.weight = nn.Parameter(kernel, requires_grad=requires_grad)
101
-
102
- self.win_size = win_size
103
- self.hop_size = hop_size
104
- self.win_type = win_type
105
-
106
- self.stride = hop_size
107
- self.dim = self.nfft
108
-
109
- self.register_buffer("window", window)
110
- self.register_buffer("enframe", torch.eye(win_size)[:, None, :])
111
-
112
- def forward(self,
113
- inputs: torch.Tensor):
114
- """
115
- :param inputs: torch.Tensor, shape: [b, f, t]
116
- :return:
117
- """
118
- inputs = torch.view_as_real(inputs)
119
- matrix = torch.concat(tensors=[inputs[..., 0], inputs[..., 1]], dim=1)
120
-
121
- waveform = F.conv_transpose1d(matrix, self.weight, stride=self.stride)
122
-
123
- # this is from torch-stft: https://github.com/pseeth/torch-stft
124
- t = self.window.repeat(1, 1, matrix.size(-1))**2
125
- coff = F.conv_transpose1d(t, self.enframe, stride=self.stride)
126
- waveform = waveform / (coff + 1e-8)
127
- return waveform
128
-
129
-
130
- def main():
131
- stft = ConvSTFT(nfft=512, win_size=512, hop_size=200, power=None)
132
- istft = ConviSTFT(nfft=512, win_size=512, hop_size=200)
133
-
134
- mixture = torch.rand(size=(1, 8000*40), dtype=torch.float32)
135
-
136
- spec = stft.forward(mixture)
137
- # shape: [batch_size, freq_bins, time_steps]
138
- print(f"spec.shape: {spec.shape}, spec.dtype: {spec.dtype}")
139
-
140
- waveform = istft.forward(spec)
141
- # shape: [batch_size, channels, num_samples]
142
- print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
143
-
144
- return
145
-
146
-
147
- if __name__ == "__main__":
148
- main()
toolbox/torchaudio/models/dfnet/modeling_dfnet.py CHANGED
@@ -12,8 +12,9 @@ import torchaudio
12
 
13
  from toolbox.torchaudio.configuration_utils import CONFIG_FILE
14
  from toolbox.torchaudio.models.dfnet.configuration_dfnet import DfNetConfig
15
- from toolbox.torchaudio.models.dfnet.conv_stft import ConvSTFT, ConviSTFT
16
  from toolbox.torchaudio.modules.local_snr_target import LocalSnrTarget
 
17
 
18
 
19
  MODEL_FILE = "model.pt"
@@ -225,7 +226,8 @@ class GroupedLinear(nn.Module):
225
  # The better way, but not supported by torchscript
226
  # x = x.unflatten(-1, (self.groups, self.ws)) # [..., G, I/G]
227
  x = torch.einsum("btgi,gih->btgh", x, self.weight) # [..., G, H/G]
228
- x = x.flatten(2, 3) # [B, T, H]
 
229
  return x
230
 
231
  def __repr__(self):
@@ -302,7 +304,8 @@ class SqueezedGRU_S(nn.Module):
302
  self.linear_out = nn.Identity()
303
 
304
  def forward(self, inputs: torch.Tensor, h=None) -> Tuple[torch.Tensor, torch.Tensor]:
305
- x = self.linear_in(inputs)
 
306
 
307
  x, h = self.gru.forward(x, h)
308
 
@@ -327,8 +330,8 @@ class Concat(nn.Module):
327
  class Encoder(nn.Module):
328
  def __init__(self, config: DfNetConfig):
329
  super(Encoder, self).__init__()
330
- self.embedding_input_size = config.conv_channels * config.spec_bins // 4
331
- self.embedding_output_size = config.conv_channels * config.spec_bins // 4
332
  self.embedding_hidden_size = config.embedding_hidden_size
333
 
334
  self.spec_conv0 = CausalConv2d(
@@ -423,49 +426,55 @@ class Encoder(nn.Module):
423
  self.lsnr_offset = config.min_local_snr
424
 
425
  def forward(self,
426
- feat_power: torch.Tensor,
427
  feat_spec: torch.Tensor,
428
  hidden_state: torch.Tensor = None,
429
  ):
430
- # feat_power shape: (batch_size, 1, time_steps, spec_dim)
431
- e0 = self.spec_conv0.forward(feat_power)
432
  e1 = self.spec_conv1.forward(e0)
433
  e2 = self.spec_conv2.forward(e1)
434
  e3 = self.spec_conv3.forward(e2)
435
- # e0 shape: [batch_size, channels, time_steps, spec_dim]
436
- # e1 shape: [batch_size, channels, time_steps, spec_dim // 2]
437
- # e2 shape: [batch_size, channels, time_steps, spec_dim // 4]
438
- # e3 shape: [batch_size, channels, time_steps, spec_dim // 4]
 
439
 
440
- # feat_spec, shape: (batch_size, 2, time_steps, df_bins)
441
  c0 = self.df_conv0(feat_spec)
442
  c1 = self.df_conv1(c0)
443
- # c0 shape: [batch_size, channels, time_steps, df_bins]
444
- # c1 shape: [batch_size, channels, time_steps, df_bins // 2]
 
445
 
446
  cemb = c1.permute(0, 2, 3, 1)
447
- # cemb shape: [batch_size, time_steps, df_bins // 2, channels]
448
  cemb = cemb.flatten(2)
449
- # cemb shape: [batch_size, time_steps, df_bins // 2 * channels]
450
- cemb = self.df_fc_emb(cemb)
451
- # cemb shape: [batch_size, time_steps, spec_dim // 4 * channels]
 
 
452
 
453
- # e3 shape: [batch_size, channels, time_steps, spec_dim // 4]
454
  emb = e3.permute(0, 2, 3, 1)
455
- # emb shape: [batch_size, time_steps, spec_dim // 4, channels]
456
  emb = emb.flatten(2)
457
- # emb shape: [batch_size, time_steps, spec_dim // 4 * channels]
 
458
 
459
  emb = self.combine(emb, cemb)
460
- # if concat; emb shape: [batch_size, time_steps, spec_dim // 4 * channels * 2]
461
- # if add; emb shape: [batch_size, time_steps, spec_dim // 4 * channels]
462
 
463
  emb, h = self.emb_gru.forward(emb, hidden_state)
464
- # emb shape: [batch_size, time_steps, spec_dim // 4 * channels]
465
- # h shape: [batch_size, 1, spec_dim]
 
466
 
467
  lsnr = self.lsnr_fc(emb) * self.lsnr_scale + self.lsnr_offset
468
- # lsnr shape: [batch_size, time_steps, 1]
469
 
470
  return e0, e1, e2, e3, emb, c0, lsnr, h
471
 
@@ -477,8 +486,8 @@ class Decoder(nn.Module):
477
  if config.spec_bins % 8 != 0:
478
  raise AssertionError("spec_bins should be divisible by 8")
479
 
480
- self.emb_in_dim = config.conv_channels * config.spec_bins // 4
481
- self.emb_out_dim = config.conv_channels * config.spec_bins // 4
482
  self.emb_hidden_dim = config.decoder_emb_hidden_size
483
 
484
  self.emb_gru = SqueezedGRU_S(
@@ -570,7 +579,7 @@ class Decoder(nn.Module):
570
  b, _, t, f8 = e3.shape
571
 
572
  # emb shape: [batch_size, time_steps, (freq_dim // 4) * conv_channels]
573
- emb, _ = self.emb_gru(emb)
574
  # emb shape: [batch_size, conv_channels, time_steps, freq_dim // 4]
575
  emb = emb.view(b, t, f8, -1).permute(0, 3, 1, 2)
576
  e3 = self.convt3(self.conv3p(e3) + emb)
@@ -588,7 +597,7 @@ class DfDecoder(nn.Module):
588
  def __init__(self, config: DfNetConfig):
589
  super(DfDecoder, self).__init__()
590
 
591
- self.embedding_input_size = config.conv_channels * config.spec_bins // 4
592
  self.df_decoder_hidden_size = config.df_decoder_hidden_size
593
  self.df_num_layers = config.df_num_layers
594
 
@@ -712,14 +721,14 @@ class Mask(nn.Module):
712
  return mask_pf
713
 
714
  def forward(self, spec: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
715
- # spec shape: [batch_size, 1, time_steps, spec_bins, 2]
716
 
717
  if not self.training and self.use_post_filter:
718
  mask = self.post_filter(mask)
719
 
720
- # mask shape: [batch_size, 1, time_steps, spec_bins]
721
  mask = mask.unsqueeze(4)
722
- # mask shape: [batch_size, 1, time_steps, spec_bins, 1]
723
  return spec * mask
724
 
725
 
@@ -803,6 +812,13 @@ class DfNet(nn.Module):
803
  self.hop_size = config.hop_size
804
  self.win_type = config.win_type
805
 
 
 
 
 
 
 
 
806
  self.stft = ConvSTFT(
807
  nfft=config.nfft,
808
  win_size=config.win_size,
@@ -867,37 +883,42 @@ class DfNet(nn.Module):
867
  noisy, n_samples = self.signal_prepare(noisy)
868
 
869
  # noisy shape: [b, num_samples_pad]
870
- cmp_spec = self.stft.forward(noisy)
871
- # cmp_spec shape: [b, f, t], torch.complex64
872
- cmp_spec = torch.view_as_real(cmp_spec)
873
- # cmp_spec shape: [b, f, t, 2]
874
- cmp_spec = cmp_spec.permute(0, 3, 1, 2)
875
- # cmp_spec shape: [b, 2, f, t]
876
- cmp_spec = cmp_spec[:, :, :-1, :]
877
- # cmp_spec shape: [b, 2, spec_bins, t]
878
- # n//2+1 -> n//2; 257 -> 256
879
-
880
- spec = torch.unsqueeze(cmp_spec, dim=4)
881
- # spec shape: [b, 2, spec_bins, t, 1]
882
- spec = spec.permute(0, 4, 3, 2, 1)
883
- # spec shape: [b, 1, t, spec_bins, 2]
884
 
885
- feat_power = torch.sum(torch.square(spec), dim=-1)
886
- # feat_power shape: [b, 1, t, spec_bins]
 
 
887
 
888
- feat_spec = torch.transpose(cmp_spec, dim0=2, dim1=3)
889
- # feat_spec shape: [b, 2, t, spec_bins]
890
  feat_spec = feat_spec[..., :self.df_decoder.df_bins]
891
  # feat_spec shape: [b, 2, t, df_bins]
892
 
893
- e0, e1, e2, e3, emb, c0, lsnr, h = self.encoder.forward(feat_power, feat_spec)
894
 
895
  mask = self.decoder.forward(emb, e3, e2, e1, e0)
896
- # mask shape: [b, 1, t, spec_bins]
 
 
897
  if torch.any(mask > 1) or torch.any(mask < 0):
898
  raise AssertionError
899
 
900
  spec_m = self.mask.forward(spec, mask)
 
 
 
901
 
902
  # lsnr shape: [b, t, 1]
903
  lsnr = torch.transpose(lsnr, dim0=2, dim1=1)
@@ -907,8 +928,10 @@ class DfNet(nn.Module):
907
  df_coefs = self.df_out_transform(df_coefs)
908
  # df_coefs shape: [b, df_order, t, df_bins, 2]
909
 
910
- spec_e = self.df_op.forward(spec.clone(), df_coefs)
911
- # est_spec shape: [b, 1, t, spec_bins, 2]
 
 
912
 
913
  spec_e[..., self.df_decoder.df_bins:, :] = spec_m[..., self.df_decoder.df_bins:, :]
914
 
@@ -916,14 +939,10 @@ class DfNet(nn.Module):
916
  spec_e = spec_e.permute(0, 2, 1, 3)
917
  # spec_e shape: [b, spec_bins, t, 2]
918
 
919
- mask = torch.squeeze(mask, dim=1)
920
- mask = mask.permute(0, 2, 1)
921
- # mask shape: [b, spec_bins, t]
922
- est_mask = self.mask_transfer(mask)
923
- # est_mask shape: [b, f, t]
924
-
925
  # spec_e shape: [b, spec_bins, t, 2]
926
- est_spec = self.spec_transfer(spec_e)
 
 
927
  # est_spec shape: [b, f, t], torch.complex64
928
 
929
  est_wav = self.istft.forward(est_spec)
@@ -931,33 +950,11 @@ class DfNet(nn.Module):
931
  est_wav = est_wav[:, :n_samples]
932
  # est_wav shape: [b, n_samples]
933
 
934
- return est_spec, est_wav, est_mask, lsnr
 
 
935
 
936
- def spec_transfer(self, spec_e: torch.Tensor) -> torch.Tensor:
937
- # spec_e shape: [b, spec_bins, t, 2]
938
- b, _, t, _ = spec_e.shape
939
- est_spec = torch.complex(
940
- real=torch.concat(tensors=[
941
- spec_e[..., 0],
942
- torch.zeros(size=(b, 1, t), dtype=spec_e.dtype).to(spec_e.device)
943
- ], dim=1),
944
- imag=torch.concat(tensors=[
945
- spec_e[..., 1],
946
- torch.zeros(size=(b, 1, t), dtype=spec_e.dtype).to(spec_e.device)
947
- ], dim=1),
948
- )
949
- # est_spec shape: [b, f, t]
950
- return est_spec
951
-
952
- def mask_transfer(self, mask: torch.Tensor) -> torch.Tensor:
953
- # mask shape: [b, 256, t]
954
- b, _, t = mask.shape
955
- est_mask = torch.concat(tensors=[
956
- mask,
957
- torch.zeros(size=(b, 1, t), dtype=mask.dtype).to(mask.device)
958
- ], dim=1)
959
- # est_mask shape: [b, 257, t]
960
- return est_mask
961
 
962
  def mask_loss_fn(self, est_mask: torch.Tensor, clean: torch.Tensor, noisy: torch.Tensor):
963
  """
 
12
 
13
  from toolbox.torchaudio.configuration_utils import CONFIG_FILE
14
  from toolbox.torchaudio.models.dfnet.configuration_dfnet import DfNetConfig
15
+ from toolbox.torchaudio.modules.conv_stft import ConvSTFT, ConviSTFT
16
  from toolbox.torchaudio.modules.local_snr_target import LocalSnrTarget
17
+ from toolbox.torchaudio.modules.freq_bands.erb_bands import ErbBands
18
 
19
 
20
  MODEL_FILE = "model.pt"
 
226
  # The better way, but not supported by torchscript
227
  # x = x.unflatten(-1, (self.groups, self.ws)) # [..., G, I/G]
228
  x = torch.einsum("btgi,gih->btgh", x, self.weight) # [..., G, H/G]
229
+ x = x.flatten(2, 3)
230
+ # x: [b, t, h]
231
  return x
232
 
233
  def __repr__(self):
 
304
  self.linear_out = nn.Identity()
305
 
306
  def forward(self, inputs: torch.Tensor, h=None) -> Tuple[torch.Tensor, torch.Tensor]:
307
+ # inputs: shape: [b, t, h]
308
+ x = self.linear_in.forward(inputs)
309
 
310
  x, h = self.gru.forward(x, h)
311
 
 
330
  class Encoder(nn.Module):
331
  def __init__(self, config: DfNetConfig):
332
  super(Encoder, self).__init__()
333
+ self.embedding_input_size = config.conv_channels * config.erb_bins // 4
334
+ self.embedding_output_size = config.conv_channels * config.erb_bins // 4
335
  self.embedding_hidden_size = config.embedding_hidden_size
336
 
337
  self.spec_conv0 = CausalConv2d(
 
426
  self.lsnr_offset = config.min_local_snr
427
 
428
  def forward(self,
429
+ feat_erb: torch.Tensor,
430
  feat_spec: torch.Tensor,
431
  hidden_state: torch.Tensor = None,
432
  ):
433
+ # feat_erb shape: (b, 1, t, erb_bins)
434
+ e0 = self.spec_conv0.forward(feat_erb)
435
  e1 = self.spec_conv1.forward(e0)
436
  e2 = self.spec_conv2.forward(e1)
437
  e3 = self.spec_conv3.forward(e2)
438
+ # e0 shape: [b, c, t, erb_bins]
439
+ # e1 shape: [b, c, t, erb_bins // 2]
440
+ # e2 shape: [b, c, t, erb_bins // 4]
441
+ # e3 shape: [b, c, t, erb_bins // 4]
442
+ # e3 shape: [b, 64, t, 32/4=8]
443
 
444
+ # feat_spec, shape: (b, 2, t, df_bins)
445
  c0 = self.df_conv0(feat_spec)
446
  c1 = self.df_conv1(c0)
447
+ # c0 shape: [b, c, t, df_bins]
448
+ # c1 shape: [b, c, t, df_bins // 2]
449
+ # c1 shape: [b, 64, t, 96/2=48]
450
 
451
  cemb = c1.permute(0, 2, 3, 1)
452
+ # cemb shape: [b, t, df_bins // 2, c]
453
  cemb = cemb.flatten(2)
454
+ # cemb shape: [b, t, df_bins // 2 * c]
455
+ # cemb shape: [b, t, 96/2*64=3072]
456
+ cemb = self.df_fc_emb.forward(cemb)
457
+ # cemb shape: [b, t, erb_bins // 4 * c]
458
+ # cemb shape: [b, t, 32/4*64=512]
459
 
460
+ # e3 shape: [b, c, t, erb_bins // 4]
461
  emb = e3.permute(0, 2, 3, 1)
462
+ # emb shape: [b, t, erb_bins // 4, c]
463
  emb = emb.flatten(2)
464
+ # emb shape: [b, t, erb_bins // 4 * c]
465
+ # emb shape: [b, t, 32/4*64=512]
466
 
467
  emb = self.combine(emb, cemb)
468
+ # if concat; emb shape: [b, t, spec_bins // 4 * c * 2]
469
+ # if add; emb shape: [b, t, spec_bins // 4 * c]
470
 
471
  emb, h = self.emb_gru.forward(emb, hidden_state)
472
+
473
+ # emb shape: [b, t, spec_dim // 4 * c]
474
+ # h shape: [b, 1, spec_dim]
475
 
476
  lsnr = self.lsnr_fc(emb) * self.lsnr_scale + self.lsnr_offset
477
+ # lsnr shape: [b, t, 1]
478
 
479
  return e0, e1, e2, e3, emb, c0, lsnr, h
480
 
 
486
  if config.spec_bins % 8 != 0:
487
  raise AssertionError("spec_bins should be divisible by 8")
488
 
489
+ self.emb_in_dim = config.conv_channels * config.erb_bins // 4
490
+ self.emb_out_dim = config.conv_channels * config.erb_bins // 4
491
  self.emb_hidden_dim = config.decoder_emb_hidden_size
492
 
493
  self.emb_gru = SqueezedGRU_S(
 
579
  b, _, t, f8 = e3.shape
580
 
581
  # emb shape: [batch_size, time_steps, (freq_dim // 4) * conv_channels]
582
+ emb, _ = self.emb_gru.forward(emb)
583
  # emb shape: [batch_size, conv_channels, time_steps, freq_dim // 4]
584
  emb = emb.view(b, t, f8, -1).permute(0, 3, 1, 2)
585
  e3 = self.convt3(self.conv3p(e3) + emb)
 
597
  def __init__(self, config: DfNetConfig):
598
  super(DfDecoder, self).__init__()
599
 
600
+ self.embedding_input_size = config.conv_channels * config.erb_bins // 4
601
  self.df_decoder_hidden_size = config.df_decoder_hidden_size
602
  self.df_num_layers = config.df_num_layers
603
 
 
721
  return mask_pf
722
 
723
  def forward(self, spec: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
724
+ # spec shape: [b, 1, t, spec_bins, 2]
725
 
726
  if not self.training and self.use_post_filter:
727
  mask = self.post_filter(mask)
728
 
729
+ # mask shape: [b, 1, t, spec_bins]
730
  mask = mask.unsqueeze(4)
731
+ # mask shape: [b, 1, t, spec_bins, 1]
732
  return spec * mask
733
 
734
 
 
812
  self.hop_size = config.hop_size
813
  self.win_type = config.win_type
814
 
815
+ self.erb_bands = ErbBands(
816
+ sample_rate=config.sample_rate,
817
+ nfft=config.nfft,
818
+ erb_bins=config.erb_bins,
819
+ min_freq_bins_for_erb=config.min_freq_bins_for_erb,
820
+ )
821
+
822
  self.stft = ConvSTFT(
823
  nfft=config.nfft,
824
  win_size=config.win_size,
 
883
  noisy, n_samples = self.signal_prepare(noisy)
884
 
885
  # noisy shape: [b, num_samples_pad]
886
+ spec_cmp = self.stft.forward(noisy)
887
+ # spec_complex shape: [b, f, t], torch.complex64
888
+ spec_cmp = torch.transpose(spec_cmp, dim0=1, dim1=2)
889
+ # spec_complex shape: [b, t, f], torch.complex64
890
+ spec_cmp_real = torch.view_as_real(spec_cmp)
891
+ # spec_cmp_real shape: [b, t, f, 2]
892
+ spec_mag = torch.abs(spec_cmp)
893
+ spec_pow = torch.square(spec_mag)
894
+ # shape: [b, t, f]
895
+
896
+ spec = torch.unsqueeze(spec_cmp_real, dim=1)
897
+ # spec shape: [b, 1, t, f, 2]
 
 
898
 
899
+ feat_erb = self.erb_bands.erb_scale(spec_pow, db=True)
900
+ # feat_erb shape: [b, t, erb_bins]
901
+ feat_erb = torch.unsqueeze(feat_erb, dim=1)
902
+ # feat_erb shape: [b, 1, t, erb_bins]
903
 
904
+ feat_spec = spec_cmp_real.permute(0, 3, 1, 2)
905
+ # feat_spec shape: [b, 2, t, f]
906
  feat_spec = feat_spec[..., :self.df_decoder.df_bins]
907
  # feat_spec shape: [b, 2, t, df_bins]
908
 
909
+ e0, e1, e2, e3, emb, c0, lsnr, h = self.encoder.forward(feat_erb, feat_spec)
910
 
911
  mask = self.decoder.forward(emb, e3, e2, e1, e0)
912
+ # mask shape: [b, 1, t, erb_bins]
913
+ mask = self.erb_bands.erb_scale_inv(mask)
914
+ # mask shape: [b, 1, t, f]
915
  if torch.any(mask > 1) or torch.any(mask < 0):
916
  raise AssertionError
917
 
918
  spec_m = self.mask.forward(spec, mask)
919
+ # spec_m shape: [b, 1, t, f, 2]
920
+ spec_m = spec_m[:, :, :, :self.config.spec_bins, :]
921
+ # spec_m shape: [b, 1, t, spec_bins, 2]
922
 
923
  # lsnr shape: [b, t, 1]
924
  lsnr = torch.transpose(lsnr, dim0=2, dim1=1)
 
928
  df_coefs = self.df_out_transform(df_coefs)
929
  # df_coefs shape: [b, df_order, t, df_bins, 2]
930
 
931
+ spec_ = spec[:, :, :, :self.config.spec_bins, :]
932
+ # spec shape: [b, 1, t, spec_bins, 2]
933
+ spec_e = self.df_op.forward(spec_, df_coefs)
934
+ # spec_e shape: [b, 1, t, spec_bins, 2]
935
 
936
  spec_e[..., self.df_decoder.df_bins:, :] = spec_m[..., self.df_decoder.df_bins:, :]
937
 
 
939
  spec_e = spec_e.permute(0, 2, 1, 3)
940
  # spec_e shape: [b, spec_bins, t, 2]
941
 
 
 
 
 
 
 
942
  # spec_e shape: [b, spec_bins, t, 2]
943
+ est_spec = torch.complex(real=spec_e[..., 0], imag=spec_e[..., 1])
944
+ # est_spec shape: [b, spec_bins, t], torch.complex64
945
+ est_spec = torch.concat(tensors=[est_spec, est_spec[:, -1:, :]], dim=1)
946
  # est_spec shape: [b, f, t], torch.complex64
947
 
948
  est_wav = self.istft.forward(est_spec)
 
950
  est_wav = est_wav[:, :n_samples]
951
  # est_wav shape: [b, n_samples]
952
 
953
+ est_mask = torch.squeeze(mask, dim=1)
954
+ est_mask = est_mask.permute(0, 2, 1)
955
+ # est_mask shape: [b, f, t]
956
 
957
+ return est_spec, est_wav, est_mask, lsnr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
958
 
959
  def mask_loss_fn(self, est_mask: torch.Tensor, clean: torch.Tensor, noisy: torch.Tensor):
960
  """
toolbox/torchaudio/models/frcrn/conv_stft.py CHANGED
@@ -127,8 +127,8 @@ class ConviSTFT(nn.Module):
 
 
  def main():
- stft = ConvSTFT(win_size=512, hop_size=200, feature_type="complex")
- istft = ConviSTFT(win_size=512, hop_size=200, feature_type="complex")
+ stft = ConvSTFT(nfft=512, win_size=512, hop_size=200, feature_type="complex")
+ istft = ConviSTFT(nfft=512, win_size=512, hop_size=200, feature_type="complex")
 
  mixture = torch.rand(size=(1, 8000*40), dtype=torch.float32)
 
toolbox/torchaudio/models/{simple_lstm_irm → lstm}/__init__.py RENAMED
File without changes
toolbox/torchaudio/models/lstm/configuration_lstm.py ADDED
@@ -0,0 +1,73 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ from toolbox.torchaudio.configuration_utils import PretrainedConfig
4
+
5
+
6
+ class LstmConfig(PretrainedConfig):
7
+ def __init__(self,
8
+ sample_rate: int = 8000,
9
+ segment_size: int = 32000,
10
+ nfft: int = 512,
11
+ win_size: int = 512,
12
+ hop_size: int = 256,
13
+ win_type: str = "hann",
14
+
15
+ hidden_size: int = 1024,
16
+ num_layers: int = 2,
17
+ dropout: float = 0.2,
18
+
19
+ min_snr_db: float = -10,
20
+ max_snr_db: float = 20,
21
+
22
+ max_epochs: int = 100,
23
+ batch_size: int = 4,
24
+ num_workers: int = 4,
25
+ seed: int = 1234,
26
+
27
+ lr: float = 0.001,
28
+ lr_scheduler: str = "CosineAnnealingLR",
29
+ lr_scheduler_kwargs: dict = None,
30
+
31
+ weight_decay: float = 0.00001,
32
+ clip_grad_norm: float = 10.,
33
+ eval_steps: int = 25000,
34
+
35
+ **kwargs
36
+ ):
37
+ super(LstmConfig, self).__init__(**kwargs)
38
+ self.sample_rate = sample_rate
39
+ self.segment_size = segment_size
40
+ self.nfft = nfft
41
+ self.win_size = win_size
42
+ self.hop_size = hop_size
43
+ self.win_type = win_type
44
+
45
+ self.hidden_size = hidden_size
46
+ self.num_layers = num_layers
47
+ self.dropout = dropout
48
+
49
+ self.min_snr_db = min_snr_db
50
+ self.max_snr_db = max_snr_db
51
+
52
+ self.max_epochs = max_epochs
53
+ self.batch_size = batch_size
54
+ self.num_workers = num_workers
55
+ self.seed = seed
56
+
57
+ self.lr = lr
58
+ self.lr_scheduler = lr_scheduler
59
+ self.lr_scheduler_kwargs = lr_scheduler_kwargs or dict()
60
+
61
+ self.weight_decay = weight_decay
62
+ self.clip_grad_norm = clip_grad_norm
63
+ self.eval_steps = eval_steps
64
+
65
+
66
+ def main():
67
+ config = LstmConfig()
68
+ config.to_yaml_file("config.yaml")
69
+ return
70
+
71
+
72
+ if __name__ == "__main__":
73
+ main()
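A minimal usage sketch for the new config class, assuming `PretrainedConfig.from_pretrained` resolves a directory containing the YAML written by `to_yaml_file` (as `modeling_lstm.py` below relies on); the round-trip call is left commented because the exact path handling is an assumption:

from toolbox.torchaudio.models.lstm.configuration_lstm import LstmConfig

config = LstmConfig(nfft=320, win_size=320, hop_size=160, hidden_size=512, num_layers=3)
config.to_yaml_file("config.yaml")            # serialize to YAML, same as main() above
# reloaded = LstmConfig.from_pretrained(".")  # round trip; directory resolution is an assumption
print(config.nfft, config.win_size, config.hop_size)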
toolbox/torchaudio/models/lstm/modeling_lstm.py ADDED
@@ -0,0 +1,260 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://github.com/haoxiangsnr/IRM-based-Speech-Enhancement-using-LSTM/blob/master/model/lstm_model.py
5
+ """
6
+ import os
7
+ from typing import Optional, Union, Tuple
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from torch.nn import functional as F
12
+ import torchaudio
13
+
14
+ from toolbox.torchaudio.models.lstm.configuration_lstm import LstmConfig
15
+ from toolbox.torchaudio.configuration_utils import CONFIG_FILE
16
+ from toolbox.torchaudio.modules.conv_stft import ConvSTFT, ConviSTFT
17
+
18
+
19
+ MODEL_FILE = "model.pt"
20
+
21
+
22
+ class Transpose(nn.Module):
23
+ def __init__(self, dim0: int, dim1: int):
24
+ super(Transpose, self).__init__()
25
+ self.dim0 = dim0
26
+ self.dim1 = dim1
27
+
28
+ def forward(self, inputs: torch.Tensor):
29
+ inputs = torch.transpose(inputs, dim0=self.dim0, dim1=self.dim1)
30
+ return inputs
31
+
32
+
33
+ class LstmModel(nn.Module):
34
+ def __init__(self,
35
+ nfft: int = 512,
36
+ win_size: int = 512,
37
+ hop_size: int = 256,
38
+ win_type: str = "hann",
39
+ hidden_size=1024,
40
+ num_layers: int = 2,
41
+ batch_first: bool = True,
42
+ dropout: float = 0.2,
43
+ ):
44
+ super(LstmModel, self).__init__()
45
+ self.nfft = nfft
46
+ self.win_size = win_size
47
+ self.hop_size = hop_size
48
+ self.win_type = win_type
49
+
50
+ self.spec_bins = nfft // 2 + 1
51
+ self.hidden_size = hidden_size
52
+
53
+ self.eps = 1e-8
54
+
55
+ self.stft = ConvSTFT(
56
+ nfft=self.nfft,
57
+ win_size=self.win_size,
58
+ hop_size=self.hop_size,
59
+ win_type=self.win_type,
60
+ power=None,
61
+ requires_grad=False
62
+ )
63
+ self.istft = ConviSTFT(
64
+ nfft=self.nfft,
65
+ win_size=self.win_size,
66
+ hop_size=self.hop_size,
67
+ win_type=self.win_type,
68
+ requires_grad=False
69
+ )
70
+
71
+ self.lstm = nn.LSTM(input_size=self.spec_bins,
72
+ hidden_size=hidden_size,
73
+ num_layers=num_layers,
74
+ batch_first=batch_first,
75
+ dropout=dropout,
76
+ )
77
+ self.linear = nn.Linear(in_features=hidden_size, out_features=self.spec_bins)
78
+ self.activation = nn.Sigmoid()
79
+
80
+ def signal_prepare(self, signal: torch.Tensor) -> Tuple[torch.Tensor, int]:
81
+ if signal.dim() == 2:
82
+ signal = torch.unsqueeze(signal, dim=1)
83
+ _, _, n_samples = signal.shape
84
+ remainder = (n_samples - self.win_size) % self.hop_size
85
+ if remainder > 0:
86
+ n_samples_pad = self.hop_size - remainder
87
+ signal = F.pad(signal, pad=(0, n_samples_pad), mode="constant", value=0)
88
+ return signal, n_samples
89
+
90
+ def forward(self,
91
+ noisy: torch.Tensor,
92
+ h_state: Tuple[torch.Tensor, torch.Tensor] = None,
93
+ ):
94
+ noisy, num_samples = self.signal_prepare(noisy)
95
+ batch_size, _, num_samples_pad = noisy.shape
96
+ # print(f"num_samples: {num_samples}, num_samples_pad: {num_samples_pad}")
97
+
98
+ mag_noisy, pha_noisy = self.mag_pha_stft(noisy)
99
+ # shape: (b, f, t)
100
+ # t = (num_samples - win_size) / hop_size + 1
101
+
102
+ mask, h_state = self.forward_chunk(mag_noisy, h_state)
103
+ # mask shape: (b, f, t)
104
+
105
+ stft_denoise = self.do_mask(mag_noisy, pha_noisy, mask)
106
+ denoise = self.istft.forward(stft_denoise)
107
+ # denoise shape: [b, 1, num_samples_pad]
108
+
109
+ denoise = denoise[:, :, :num_samples]
110
+ # denoise shape: [b, 1, num_samples]
111
+ return denoise, mask, h_state
112
+
113
+ def mag_pha_stft(self, noisy: torch.Tensor):
114
+ # noisy shape: [b, num_samples]
115
+ stft_noisy = self.stft.forward(noisy)
116
+ # stft_noisy shape: [b, f, t], torch.complex64
117
+
118
+ real = torch.real(stft_noisy)
119
+ imag = torch.imag(stft_noisy)
120
+ mag_noisy = torch.sqrt(real ** 2 + imag ** 2)
121
+ pha_noisy = torch.atan2(imag, real)
122
+ # shape: (b, f, t)
123
+ return mag_noisy, pha_noisy
124
+
125
+ def forward_chunk(self,
126
+ mag_noisy: torch.Tensor,
127
+ h_state: Tuple[torch.Tensor, torch.Tensor] = None,
128
+ ):
129
+ # mag_noisy shape: (b, f, t)
130
+ x = torch.transpose(mag_noisy, dim0=2, dim1=1)
131
+ # x shape: (b, t, f)
132
+ x, h_state = self.lstm.forward(x, hx=h_state)
133
+ x = self.linear.forward(x)
134
+ mask = self.activation(x)
135
+ # mask shape: (b, t, f)
136
+ mask = torch.transpose(mask, dim0=2, dim1=1)
137
+ # mask shape: (b, f, t)
138
+ return mask, h_state
139
+
140
+ def do_mask(self,
141
+ mag_noisy: torch.Tensor,
142
+ pha_noisy: torch.Tensor,
143
+ mask: torch.Tensor,
144
+ ):
145
+ # (b, f, t)
146
+ mag_denoise = mag_noisy * mask
147
+ stft_denoise = mag_denoise * torch.exp((1j * pha_noisy))
148
+ return stft_denoise
149
+
150
+
151
+ class LstmPretrainedModel(LstmModel):
152
+ def __init__(self,
153
+ config: LstmConfig,
154
+ ):
155
+ super(LstmPretrainedModel, self).__init__(
156
+ nfft=config.nfft,
157
+ win_size=config.win_size,
158
+ hop_size=config.hop_size,
159
+ win_type=config.win_type,
160
+ hidden_size=config.hidden_size,
161
+ num_layers=config.num_layers,
162
+ dropout=config.dropout,
163
+ )
164
+ self.config = config
165
+
166
+ @classmethod
167
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
168
+ config = LstmConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
169
+
170
+ model = cls(config)
171
+
172
+ if os.path.isdir(pretrained_model_name_or_path):
173
+ ckpt_file = os.path.join(pretrained_model_name_or_path, MODEL_FILE)
174
+ else:
175
+ ckpt_file = pretrained_model_name_or_path
176
+
177
+ with open(ckpt_file, "rb") as f:
178
+ state_dict = torch.load(f, map_location="cpu", weights_only=True)
179
+ model.load_state_dict(state_dict, strict=True)
180
+ return model
181
+
182
+ def save_pretrained(self,
183
+ save_directory: Union[str, os.PathLike],
184
+ state_dict: Optional[dict] = None,
185
+ ):
186
+
187
+ model = self
188
+
189
+ if state_dict is None:
190
+ state_dict = model.state_dict()
191
+
192
+ os.makedirs(save_directory, exist_ok=True)
193
+
194
+ # save state dict
195
+ model_file = os.path.join(save_directory, MODEL_FILE)
196
+ torch.save(state_dict, model_file)
197
+
198
+ # save config
199
+ config_file = os.path.join(save_directory, CONFIG_FILE)
200
+ self.config.to_yaml_file(config_file)
201
+ return save_directory
202
+
203
+
204
+ def main():
205
+ config = LstmConfig()
206
+ model = LstmPretrainedModel(config)
207
+ model.eval()
208
+
209
+ noisy = torch.randn(size=(1, 16000), dtype=torch.float32)
210
+ noisy, _ = model.signal_prepare(noisy)
211
+ b, _, num_samples = noisy.shape
212
+ t = (num_samples - config.win_size) / config.hop_size + 1
213
+
214
+ waveform, mask, h_state = model.forward(noisy)
215
+ print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
216
+ print(waveform[:, :, 300: 302])
217
+
218
+ # noisy_pad shape: [b, 1, num_samples_pad]
219
+
220
+ h_state = None
221
+ sub_spec_list = list()
222
+ for i in range(int(t)):
223
+ begin = i * config.hop_size
224
+ end = begin + config.win_size
225
+ sub_noisy = noisy[:, :, begin:end]
226
+ mag_noisy, pha_noisy = model.mag_pha_stft(sub_noisy)
227
+ mask, h_state = model.forward_chunk(mag_noisy, h_state)
228
+ sub_spec = model.do_mask(mag_noisy, pha_noisy, mask)
229
+ sub_spec_list.append(sub_spec)
230
+
231
+ spec = torch.concat(sub_spec_list, dim=2)
232
+
233
+ # 1
234
+ waveform = model.istft.forward(spec)
235
+ waveform = waveform[:, :, :num_samples]
236
+ print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
237
+ print(waveform[:, :, 300: 302])
238
+
239
+ # 2
240
+ waveform_cache = None
241
+ coff_cache = None
242
+ waveform = torch.zeros(size=(b, 1, num_samples), dtype=torch.float32)
243
+ for i in range(int(t)):
244
+ sub_spec = spec[:, :, i:i+1]
245
+ begin = i * config.hop_size
246
+ end = begin + config.win_size - config.hop_size
247
+ sub_waveform, waveform_cache, coff_cache = model.istft.forward_chunk(sub_spec, waveform_cache, coff_cache)
248
+ # end = begin + config.win_size
249
+ # sub_waveform = model.istft.forward(sub_spec)
250
+
251
+ # (b, 1, win_size)
252
+ waveform[:, :, begin:end] = sub_waveform
253
+ print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
254
+ print(waveform[:, :, 300: 302])
255
+
256
+ return
257
+
258
+
259
+ if __name__ == "__main__":
260
+ main()
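A small standalone check of the masking step in `LstmModel.do_mask`: multiplying the masked magnitude by `exp(1j * phase)` is the same complex spectrum as `torch.polar(magnitude, phase)`. This only illustrates the arithmetic, it is not part of the model:

import torch

mag = torch.rand(2, 257, 199)          # magnitude, shape (b, f, t)
pha = torch.rand(2, 257, 199) * 3.14   # phase in radians
mask = torch.rand(2, 257, 199)         # sigmoid mask in [0, 1]

spec_a = (mag * mask) * torch.exp(1j * pha)   # as written in do_mask
spec_b = torch.polar(mag * mask, pha)         # equivalent formulation
print(torch.allclose(spec_a, spec_b, atol=1e-6))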
toolbox/torchaudio/models/lstm/yaml/config.yaml ADDED
@@ -0,0 +1,32 @@
1
+ model_name: "lstm"
2
+
3
+ # spec
4
+ sample_rate: 8000
5
+ segment_size: 32000
6
+ nfft: 320
7
+ win_size: 320
8
+ hop_size: 160
9
+ win_type: hann
10
+
11
+ # data
12
+ max_snr_db: 20
13
+ min_snr_db: -10
14
+
15
+ # model
16
+ hidden_size: 512
17
+ num_layers: 3
18
+ dropout: 0.1
19
+
20
+ # train
21
+ max_epochs: 100
22
+ batch_size: 32
23
+ num_workers: 4
24
+ seed: 1234
25
+
26
+ lr: 0.001
27
+ lr_scheduler: CosineAnnealingLR
28
+ lr_scheduler_kwargs: {}
29
+
30
+ weight_decay: 0.00001
31
+ clip_grad_norm: 10.0
32
+ eval_steps: 25000
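A hedged sanity check on the spec settings above (not part of the commit): with `win_size: 320` and `hop_size: 160`, a 4-second segment at 8 kHz (`segment_size: 32000`) gives (32000 - 320) / 160 + 1 = 199 STFT frames, matching the frame-count formula used in `modeling_lstm.py`:

segment_size, win_size, hop_size = 32000, 320, 160
t = (segment_size - win_size) // hop_size + 1
print(t)  # 199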
toolbox/torchaudio/models/simple_lstm_irm/configuration_simple_lstm_irm.py DELETED
@@ -1,38 +0,0 @@
1
- #!/usr/bin/python3
2
- # -*- coding: utf-8 -*-
3
- from toolbox.torchaudio.configuration_utils import PretrainedConfig
4
-
5
-
6
- class SimpleLstmIRMConfig(PretrainedConfig):
7
- def __init__(self,
8
- sample_rate: int,
9
- n_fft: int,
10
- win_length: int,
11
- hop_length: int,
12
-
13
- num_bins: int,
14
- hidden_size: int,
15
- num_layers: int,
16
- batch_first: bool,
17
- dropout: float,
18
- lookback: int,
19
- lookahead: int,
20
- **kwargs
21
- ):
22
- super(SimpleLstmIRMConfig, self).__init__(**kwargs)
23
- self.sample_rate = sample_rate
24
- self.n_fft = n_fft
25
- self.win_length = win_length
26
- self.hop_length = hop_length
27
-
28
- self.num_bins = num_bins
29
- self.hidden_size = hidden_size
30
- self.num_layers = num_layers
31
- self.batch_first = batch_first
32
- self.dropout = dropout
33
- self.lookback = lookback
34
- self.lookahead = lookahead
35
-
36
-
37
- if __name__ == "__main__":
38
- pass
toolbox/torchaudio/models/simple_lstm_irm/modeling_simple_lstm_irm.py DELETED
@@ -1,133 +0,0 @@
1
- #!/usr/bin/python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- https://github.com/haoxiangsnr/IRM-based-Speech-Enhancement-using-LSTM/blob/master/model/lstm_model.py
5
- """
6
- import os
7
- from typing import Optional, Union
8
-
9
- import torch
10
- import torch.nn as nn
11
- import torchaudio
12
-
13
- from toolbox.torchaudio.models.simple_lstm_irm.configuration_simple_lstm_irm import SimpleLstmIRMConfig
14
- from toolbox.torchaudio.configuration_utils import CONFIG_FILE
15
-
16
-
17
- MODEL_FILE = "model.pt"
18
-
19
-
20
- class Transpose(nn.Module):
21
- def __init__(self, dim0: int, dim1: int):
22
- super(Transpose, self).__init__()
23
- self.dim0 = dim0
24
- self.dim1 = dim1
25
-
26
- def forward(self, inputs: torch.Tensor):
27
- inputs = torch.transpose(inputs, dim0=self.dim0, dim1=self.dim1)
28
- return inputs
29
-
30
-
31
- class SimpleLstmIRM(nn.Module):
32
- """
33
- Ideal ratio mask estimator:
34
-
35
- """
36
-
37
- def __init__(self, num_bins=257, hidden_size=1024,
38
- num_layers: int = 2,
39
- batch_first: bool = True,
40
- dropout: float = 0.4,
41
- ):
42
- super(SimpleLstmIRM, self).__init__()
43
- self.num_bins = num_bins
44
- self.hidden_size = hidden_size
45
-
46
- self.lstm = nn.LSTM(input_size=num_bins,
47
- hidden_size=hidden_size,
48
- num_layers=num_layers,
49
- batch_first=batch_first,
50
- dropout=dropout,
51
- )
52
- self.linear = nn.Linear(in_features=hidden_size, out_features=num_bins)
53
- self.activation = nn.Sigmoid()
54
-
55
- def forward(self, spec: torch.Tensor):
56
- # spec shape: (batch_size, num_bins, time_steps)
57
- spec = torch.transpose(spec, dim0=2, dim1=1)
58
- # frame_spec shape: (batch_size, time_steps, num_bins)
59
- spec, _ = self.lstm(spec)
60
- spec = self.linear(spec)
61
- mask = self.activation(spec)
62
- return mask
63
-
64
-
65
- class SimpleLstmIRMPretrainedModel(SimpleLstmIRM):
66
- def __init__(self,
67
- config: SimpleLstmIRMConfig,
68
- ):
69
- super(SimpleLstmIRMPretrainedModel, self).__init__(
70
- num_bins=config.num_bins,
71
- hidden_size=config.hidden_size,
72
- )
73
- self.config = config
74
-
75
- @classmethod
76
- def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
77
- config = SimpleLstmIRMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
78
-
79
- model = cls(config)
80
-
81
- if os.path.isdir(pretrained_model_name_or_path):
82
- ckpt_file = os.path.join(pretrained_model_name_or_path, MODEL_FILE)
83
- else:
84
- ckpt_file = pretrained_model_name_or_path
85
-
86
- with open(ckpt_file, "rb") as f:
87
- state_dict = torch.load(f, map_location="cpu", weights_only=True)
88
- model.load_state_dict(state_dict, strict=True)
89
- return model
90
-
91
- def save_pretrained(self,
92
- save_directory: Union[str, os.PathLike],
93
- state_dict: Optional[dict] = None,
94
- ):
95
-
96
- model = self
97
-
98
- if state_dict is None:
99
- state_dict = model.state_dict()
100
-
101
- os.makedirs(save_directory, exist_ok=True)
102
-
103
- # save state dict
104
- model_file = os.path.join(save_directory, MODEL_FILE)
105
- torch.save(state_dict, model_file)
106
-
107
- # save config
108
- config_file = os.path.join(save_directory, CONFIG_FILE)
109
- self.config.to_yaml_file(config_file)
110
- return save_directory
111
-
112
-
113
- def main():
114
- transformer = torchaudio.transforms.Spectrogram(
115
- n_fft=512,
116
- win_length=200,
117
- hop_length=80,
118
- window_fn=torch.hamming_window,
119
- )
120
-
121
- model = SimpleLstmIRM()
122
-
123
- inputs = torch.randn(size=(1, 1600), dtype=torch.float32)
124
- spec = transformer.forward(inputs)
125
-
126
- output = model.forward(spec)
127
- print(output.shape)
128
- print(output)
129
- return
130
-
131
-
132
- if __name__ == '__main__':
133
- main()
toolbox/torchaudio/models/simple_lstm_irm/yaml/config.yaml DELETED
@@ -1,14 +0,0 @@
1
- model_name: "simple_lstm_irm"
2
-
3
- # spec
4
- sample_rate: 8000
5
- n_fft: 320
6
- win_length: 320
7
- hop_length: 80
8
-
9
- # model
10
- num_bins: 161
11
- hidden_size: 512
12
- num_layers: 3
13
- batch_first: true
14
- dropout: 0.1
toolbox/torchaudio/modules/conv_stft.py CHANGED
@@ -59,11 +59,11 @@ class ConvSTFT(nn.Module):
59
  self.dim = self.nfft
60
  self.power = power
61
 
62
- def forward(self, inputs: torch.Tensor):
63
- if inputs.dim() == 2:
64
- inputs = torch.unsqueeze(inputs, 1)
65
 
66
- matrix = F.conv1d(inputs, self.weight, stride=self.stride)
67
  dim = self.dim // 2 + 1
68
  real = matrix[:, :dim, :]
69
  imag = matrix[:, dim:, :]
@@ -99,6 +99,8 @@ class ConviSTFT(nn.Module):
99
 
100
  kernel, window = init_kernels(self.nfft, win_size, hop_size, win_type, inverse=True)
101
  self.weight = nn.Parameter(kernel, requires_grad=requires_grad)
 
 
102
 
103
  self.win_size = win_size
104
  self.hop_size = hop_size
@@ -109,41 +111,158 @@ class ConviSTFT(nn.Module):
109
 
110
  self.register_buffer("window", window)
111
  self.register_buffer("enframe", torch.eye(win_size)[:, None, :])
 
 
112
 
113
  def forward(self,
114
- inputs: torch.Tensor):
115
  """
116
- :param inputs: torch.Tensor, shape: [b, f, t]
 
 
 
 
117
  :return:
118
  """
119
- inputs = torch.view_as_real(inputs)
120
- matrix = torch.concat(tensors=[inputs[..., 0], inputs[..., 1]], dim=1)
 
 
121
 
122
  waveform = F.conv_transpose1d(matrix, self.weight, stride=self.stride)
 
123
 
124
  # this is from torch-stft: https://github.com/pseeth/torch-stft
125
  t = self.window.repeat(1, 1, matrix.size(-1))**2
 
126
  coff = F.conv_transpose1d(t, self.enframe, stride=self.stride)
 
127
  waveform = waveform / (coff + 1e-8)
 
128
  return waveform
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
  def main():
132
- stft = ConvSTFT(nfft=512, win_size=512, hop_size=200, power=None)
133
- istft = ConviSTFT(nfft=512, win_size=512, hop_size=200)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- mixture = torch.rand(size=(1, 8000*40), dtype=torch.float32)
 
 
 
 
 
 
 
 
 
 
136
 
137
  spec = stft.forward(mixture)
138
- # shape: [batch_size, freq_bins, time_steps]
 
 
 
139
  print(f"spec.shape: {spec.shape}, spec.dtype: {spec.dtype}")
140
 
141
  waveform = istft.forward(spec)
142
  # shape: [batch_size, channels, num_samples]
143
  print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
  return
146
 
147
 
148
  if __name__ == "__main__":
149
- main()
 
59
  self.dim = self.nfft
60
  self.power = power
61
 
62
+ def forward(self, waveform: torch.Tensor):
63
+ if waveform.dim() == 2:
64
+ waveform = torch.unsqueeze(waveform, 1)
65
 
66
+ matrix = F.conv1d(waveform, self.weight, stride=self.stride)
67
  dim = self.dim // 2 + 1
68
  real = matrix[:, :dim, :]
69
  imag = matrix[:, dim:, :]
 
99
 
100
  kernel, window = init_kernels(self.nfft, win_size, hop_size, win_type, inverse=True)
101
  self.weight = nn.Parameter(kernel, requires_grad=requires_grad)
102
+ # weight shape: [f*2, 1, nfft]
103
+ # f = nfft // 2 + 1
104
 
105
  self.win_size = win_size
106
  self.hop_size = hop_size
 
111
 
112
  self.register_buffer("window", window)
113
  self.register_buffer("enframe", torch.eye(win_size)[:, None, :])
114
+ # window shape: [1, nfft, 1]
115
+ # enframe shape: [nfft, 1, nfft]
116
 
117
  def forward(self,
118
+ spec: torch.Tensor):
119
  """
120
+ self.weight shape: [f*2, 1, win_size]
121
+ self.window shape: [1, win_size, 1]
122
+ self.enframe shape: [win_size, 1, win_size]
123
+
124
+ :param spec: torch.Tensor, complex spectrum of shape [b, f, t], torch.complex64
125
  :return:
126
  """
127
+ spec = torch.view_as_real(spec)
128
+ # spec shape: [b, f, t, 2]
129
+ matrix = torch.concat(tensors=[spec[..., 0], spec[..., 1]], dim=1)
130
+ # matrix shape: [b, f*2, t]
131
 
132
  waveform = F.conv_transpose1d(matrix, self.weight, stride=self.stride)
133
+ # waveform shape: [b, 1, num_samples]
134
 
135
  # this is from torch-stft: https://github.com/pseeth/torch-stft
136
  t = self.window.repeat(1, 1, matrix.size(-1))**2
137
+ # t shape: [1, win_size, t]
138
  coff = F.conv_transpose1d(t, self.enframe, stride=self.stride)
139
+ # coff shape: [1, 1, num_samples]
140
  waveform = waveform / (coff + 1e-8)
141
+ # waveform = waveform / coff
142
  return waveform
143
 
144
+ def forward_chunk(self,
145
+ spec: torch.Tensor,
146
+ waveform_cache: torch.Tensor = None,
147
+ coff_cache: torch.Tensor = None,
148
+ ):
149
+ """
150
+ :param spec: shape: [b, f, t]
151
+ :param waveform_cache: shape: [b, 1, win_size - hop_size]
152
+ :param coff_cache: shape: [b, 1, win_size - hop_size]
153
+ :return:
154
+ """
155
+ spec = torch.view_as_real(spec)
156
+ matrix = torch.concat(tensors=[spec[..., 0], spec[..., 1]], dim=1)
157
+
158
+ waveform_current = F.conv_transpose1d(matrix, self.weight, stride=self.stride)
159
+
160
+ t = self.window.repeat(1, 1, matrix.size(-1))**2
161
+ coff_current = F.conv_transpose1d(t, self.enframe, stride=self.stride)
162
+
163
+ overlap_size = self.win_size - self.hop_size
164
+
165
+ if waveform_cache is not None:
166
+ waveform_overlap = waveform_current[:, :, :overlap_size] + waveform_cache
167
+ waveform_non_overlap = waveform_current[:, :, overlap_size:-self.hop_size]
168
+ waveform_output = torch.cat(tensors=[waveform_overlap, waveform_non_overlap], dim=-1)
169
+ new_waveform_cache = waveform_current[:, :, -self.hop_size:]
170
+ else:
171
+ waveform_output = waveform_current[:, :, :-self.hop_size]
172
+ new_waveform_cache = waveform_current[:, :, -self.hop_size:]
173
+
174
+ if coff_cache is not None:
175
+ coff_overlap = coff_current[:, :, :overlap_size] + coff_cache
176
+ coff_non_overlap = coff_current[:, :, overlap_size:-self.hop_size]
177
+ coff_output = torch.cat(tensors=[coff_overlap, coff_non_overlap], dim=-1)
178
+ new_coff_cache = coff_current[:, :, -self.hop_size:]
179
+ else:
180
+ coff_output = coff_current[:, :, :-self.hop_size]
181
+ new_coff_cache = coff_current[:, :, -self.hop_size:]
182
+
183
+ waveform_output = waveform_output / (coff_output + 1e-8)
184
+ return waveform_output, new_waveform_cache, new_coff_cache
185
+
186
 
187
  def main():
188
+ nfft = 512
189
+ win_size = 512
190
+ hop_size = 256
191
+
192
+ stft = ConvSTFT(nfft=nfft, win_size=win_size, hop_size=hop_size, power=None)
193
+ istft = ConviSTFT(nfft=nfft, win_size=win_size, hop_size=hop_size)
194
+
195
+ mixture = torch.rand(size=(1, 16000), dtype=torch.float32)
196
+ b, num_samples = mixture.shape
197
+ t = (num_samples - win_size) / hop_size + 1
198
+
199
+ spec = stft.forward(mixture)
200
+ b, f, t = spec.shape
201
+
202
+ # If `spec` is produced by the STFT above, the two waveform reconstruction methods below give identical results; otherwise the reconstructed waveforms will differ.
203
+ # spec = spec + 0.01 * torch.randn(size=(1, nfft//2+1, t), dtype=torch.float32)
204
+ print(f"spec.shape: {spec.shape}, spec.dtype: {spec.dtype}")
205
+
206
+ waveform = istft.forward(spec)
207
+ # shape: [batch_size, channels, num_samples]
208
+ print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
209
+ print(waveform[:, :, 300: 302])
210
+
211
+ waveform = torch.zeros(size=(b, 1, num_samples), dtype=torch.float32)
212
+ for i in range(int(t)):
213
+ begin = i * hop_size
214
+ end = begin + win_size
215
+ sub_spec = spec[:, :, i:i+1]
216
+ sub_waveform = istft.forward(sub_spec)
217
+ # (b, 1, win_size)
218
+ waveform[:, :, begin:end] = sub_waveform
219
+ print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
220
+ print(waveform[:, :, 300: 302])
221
+
222
+ return
223
 
224
+
225
+ def main2():
226
+ nfft = 512
227
+ win_size = 512
228
+ hop_size = 256
229
+
230
+ stft = ConvSTFT(nfft=nfft, win_size=win_size, hop_size=hop_size, power=None)
231
+ istft = ConviSTFT(nfft=nfft, win_size=win_size, hop_size=hop_size)
232
+
233
+ mixture = torch.rand(size=(1, 16128), dtype=torch.float32)
234
+ b, num_samples = mixture.shape
235
 
236
  spec = stft.forward(mixture)
237
+ b, f, t = spec.shape
238
+
239
+ # If `spec` is produced by the STFT above, the two waveform reconstruction methods below give identical results; otherwise the reconstructed waveforms will differ.
240
+ spec = spec + 0.01 * torch.randn(size=(1, nfft//2+1, t), dtype=torch.float32)
241
  print(f"spec.shape: {spec.shape}, spec.dtype: {spec.dtype}")
242
 
243
  waveform = istft.forward(spec)
244
  # shape: [batch_size, channels, num_samples]
245
  print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
246
+ print(waveform[:, :, 300: 302])
247
+
248
+ waveform_cache = None
249
+ coff_cache = None
250
+ waveform = torch.zeros(size=(b, 1, num_samples), dtype=torch.float32)
251
+ for i in range(int(t)):
252
+ sub_spec = spec[:, :, i:i+1]
253
+ begin = i * hop_size
254
+
255
+ end = begin + win_size - hop_size
256
+ sub_waveform, waveform_cache, coff_cache = istft.forward_chunk(sub_spec, waveform_cache, coff_cache)
257
+ # end = begin + win_size
258
+ # sub_waveform = istft.forward(sub_spec)
259
+
260
+ waveform[:, :, begin:end] = sub_waveform
261
+ print(f"waveform.shape: {waveform.shape}, waveform.dtype: {waveform.dtype}")
262
+ print(waveform[:, :, 300: 302])
263
 
264
  return
265
 
266
 
267
  if __name__ == "__main__":
268
+ main2()
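A standalone sketch of why `ConviSTFT` divides by `coff`: the overlap-added sum of squared analysis windows is not constant for a Hann window at 50% overlap, so the reconstruction has to be normalized frame by frame. This only illustrates the normalization term, not the exact kernels above:

import torch

win_size, hop_size, num_frames = 512, 256, 8
win_sq = torch.hann_window(win_size, periodic=True) ** 2

acc = torch.zeros(win_size + (num_frames - 1) * hop_size)
for i in range(num_frames):
    acc[i * hop_size: i * hop_size + win_size] += win_sq

# away from the edges the sum oscillates instead of being flat,
# which is what the division by (coff + 1e-8) compensates for
print(acc[win_size: -win_size].min().item(), acc[win_size: -win_size].max().item())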
toolbox/torchaudio/modules/freq_bands/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == "__main__":
6
+ pass
toolbox/torchaudio/modules/freq_bands/erb_bands.py ADDED
@@ -0,0 +1,173 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import math
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+
10
+ class ErbBandsNumpy(object):
11
+
12
+ @staticmethod
13
+ def freq2erb(freq_hz: float) -> float:
14
+ """
15
+ https://www.cnblogs.com/LXP-Never/p/16011229.html
16
+ 1 / (24.7 * 9.265) = 0.00436976
17
+ """
18
+ return 9.265 * math.log(freq_hz / (24.7 * 9.265) + 1)
19
+
20
+ @staticmethod
21
+ def erb2freq(n_erb: float) -> float:
22
+ return 24.7 * 9.265 * (math.exp(n_erb / 9.265) - 1)
23
+
24
+ @classmethod
25
+ def get_erb_widths(cls, sample_rate: int, nfft: int, erb_bins: int, min_freq_bins_for_erb: int) -> np.ndarray:
26
+ """
27
+ https://github.com/Rikorose/DeepFilterNet/blob/main/libDF/src/lib.rs
28
+ :param sample_rate:
29
+ :param nfft:
30
+ :param erb_bins: number of ERB (Equivalent Rectangular Bandwidth) bands.
31
+ :param min_freq_bins_for_erb: Minimum number of frequency bands per erb band
32
+ :return:
33
+ """
34
+ nyq_freq = sample_rate / 2.
35
+ freq_width: float = sample_rate / nfft
36
+
37
+ min_erb: float = cls.freq2erb(0.)
38
+ max_erb: float = cls.freq2erb(nyq_freq)
39
+
40
+ erb = [0] * erb_bins
41
+ step = (max_erb - min_erb) / erb_bins
42
+
43
+ prev_freq_bin = 0
44
+ freq_over = 0
45
+ for i in range(1, erb_bins + 1):
46
+ f = cls.erb2freq(min_erb + i * step)
47
+ freq_bin = int(round(f / freq_width))
48
+ freq_bins = freq_bin - prev_freq_bin - freq_over
49
+
50
+ if freq_bins < min_freq_bins_for_erb:
51
+ freq_over = min_freq_bins_for_erb - freq_bins
52
+ freq_bins = min_freq_bins_for_erb
53
+ else:
54
+ freq_over = 0
55
+ erb[i - 1] = freq_bins
56
+ prev_freq_bin = freq_bin
57
+
58
+ erb[erb_bins - 1] += 1
59
+ too_large = sum(erb) - (nfft / 2 + 1)
60
+ if too_large > 0:
61
+ erb[erb_bins - 1] -= too_large
62
+ return np.array(erb, dtype=np.uint64)
63
+
64
+ @staticmethod
65
+ def get_erb_filter_bank(erb_widths: np.ndarray,
66
+ normalized: bool = True,
67
+ inverse: bool = False,
68
+ ):
69
+ num_freq_bins = int(np.sum(erb_widths))
70
+ num_erb_bins = len(erb_widths)
71
+
72
+ fb: np.ndarray = np.zeros(shape=(num_freq_bins, num_erb_bins))
73
+
74
+ points = np.cumsum([0] + erb_widths.tolist()).astype(int)[:-1]
75
+ for i, (b, w) in enumerate(zip(points.tolist(), erb_widths.tolist())):
76
+ fb[b: b + w, i] = 1
77
+
78
+ if inverse:
79
+ fb = fb.T
80
+ if not normalized:
81
+ fb /= np.sum(fb, axis=1, keepdims=True)
82
+ else:
83
+ if normalized:
84
+ fb /= np.sum(fb, axis=0)
85
+ return fb
86
+
87
+ @staticmethod
88
+ def spec2erb(spec: np.ndarray, erb_fb: np.ndarray, db: bool = True):
89
+ """
90
+ ERB filterbank and transform to decibel scale.
91
+
92
+ :param spec: Spectrum of shape [B, C, T, F].
93
+ :param erb_fb: ERB filterbank array of shape [B] containing the ERB widths,
94
+ where B are the number of ERB bins.
95
+ :param db: Whether to transform the output into decibel scale. Defaults to `True`.
96
+ :return:
97
+ """
98
+ # complex spec to power spec. (real * real + image * image)
99
+ spec_ = np.abs(spec) ** 2
100
+
101
+ # spec to erb feature.
102
+ erb_feat = np.matmul(spec_, erb_fb)
103
+
104
+ if db:
105
+ erb_feat = 10 * np.log10(erb_feat + 1e-10)
106
+
107
+ erb_feat = np.array(erb_feat, dtype=np.float32)
108
+ return erb_feat
109
+
110
+
111
+ class ErbBands(nn.Module):
112
+ def __init__(self,
113
+ sample_rate: int = 8000,
114
+ nfft: int = 512,
115
+ erb_bins: int = 32,
116
+ min_freq_bins_for_erb: int = 2,
117
+ ):
118
+ super().__init__()
119
+ self.sample_rate = sample_rate
120
+ self.nfft = nfft
121
+ self.erb_bins = erb_bins
122
+ self.min_freq_bins_for_erb = min_freq_bins_for_erb
123
+
124
+ erb_fb, erb_fb_inv = self.init_erb_fb()
125
+ self.erb_fb = torch.tensor(erb_fb, dtype=torch.float32, requires_grad=False)
126
+ self.erb_fb_inv = torch.tensor(erb_fb_inv, dtype=torch.float32, requires_grad=False)
127
+
128
+ def init_erb_fb(self):
129
+ erb_widths = ErbBandsNumpy.get_erb_widths(
130
+ sample_rate=self.sample_rate,
131
+ nfft=self.nfft,
132
+ erb_bins=self.erb_bins,
133
+ min_freq_bins_for_erb=self.min_freq_bins_for_erb,
134
+ )
135
+ erb_fb = ErbBandsNumpy.get_erb_filter_bank(
136
+ erb_widths=erb_widths,
137
+ normalized=True,
138
+ inverse=False,
139
+ )
140
+ erb_fb_inv = ErbBandsNumpy.get_erb_filter_bank(
141
+ erb_widths=erb_widths,
142
+ normalized=True,
143
+ inverse=True,
144
+ )
145
+ return erb_fb, erb_fb_inv
146
+
147
+ def erb_scale(self, spec: torch.Tensor, db: bool = True):
148
+ spec_erb = torch.matmul(spec, self.erb_fb)
149
+ if db:
150
+ spec_erb = 10 * torch.log10(spec_erb + 1e-10)
151
+ return spec_erb
152
+
153
+ def erb_scale_inv(self, spec_erb: torch.Tensor):
154
+ spec = torch.matmul(spec_erb, self.erb_fb_inv)
155
+ return spec
156
+
157
+
158
+ def main():
159
+
160
+ erb_bands = ErbBands()
161
+
162
+ spec = torch.rand(size=(2, 199, 257), dtype=torch.float32)  # non-negative, since erb_scale applies log10 when db=True
163
+ spec_erb = erb_bands.erb_scale(spec)
164
+ print(spec_erb.shape)
165
+
166
+ spec = erb_bands.erb_scale_inv(spec_erb)
167
+ print(spec.shape)
168
+
169
+ return
170
+
171
+
172
+ if __name__ == "__main__":
173
+ main()
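A quick usage sketch for the ERB scale helpers above: `erb2freq` is the exact inverse of `freq2erb`, so a frequency survives the round trip:

from toolbox.torchaudio.modules.freq_bands.erb_bands import ErbBandsNumpy

f_hz = 1000.0
n_erb = ErbBandsNumpy.freq2erb(f_hz)
f_back = ErbBandsNumpy.erb2freq(n_erb)
print(round(n_erb, 2), round(f_back, 2))  # ~15.57 ERB; f_back recovers 1000.0 Hz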