Spaces:

DeepLearning101
/

Speech-Separation

Running

File size: 2,923 Bytes

64ceedd
 
 
 
cf73d23
 
38d7181
 
 
64ceedd
d8be50a
64ceedd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38d7181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64ceedd
38d7181
 
cf73d23
38d7181
64ceedd
cf73d23
d8be50a
64ceedd
 
38d7181
64ceedd
 
38d7181
64ceedd
 
 
 
 
 
 
cf73d23
64ceedd
cf73d23
 
64ceedd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38d7181
64ceedd
 
b6c45cb

import os
import torch
import numpy as np
import torchaudio
import yaml
from . import asteroid_test
from huggingface_hub import hf_hub_download

# Select the sox_io backend only where torchaudio still exposes the switch.
# set_audio_backend is deprecated in torchaudio 2.x and absent from newer
# releases, where an unguarded call would raise AttributeError and prevent this
# module from being imported at all.
if hasattr(torchaudio, "set_audio_backend"):
    torchaudio.set_audio_backend("sox_io")


def get_conf():
    """Build the two keyword-argument dictionaries used to construct the model.

    Returns:
        tuple: ``(conf_filterbank, conf_masknet)``. Both are freshly created
        dicts on every call, so callers may mutate them freely.
    """
    # Settings for the filterbank part of the model.
    filterbank_kwargs = dict(n_filters=64, kernel_size=16, stride=8)

    # Settings for the mask-estimation network.
    masknet_kwargs = dict(
        in_chan=64,
        n_src=2,
        out_chan=64,
        ff_hid=256,
        ff_activation="relu",
        norm_type="gLN",
        chunk_size=100,
        hop_size=50,
        n_repeats=2,
        mask_act="sigmoid",
        bidirectional=True,
        dropout=0,
    )

    return filterbank_kwargs, masknet_kwargs


def load_dpt_model():
    """Download the quantized checkpoint and return the model ready for inference.

    The Hugging Face access token is read from the ``SpeechSeparation``
    environment variable and used to fetch the weights file from the Hub.
    The network is built from ``get_conf()``, dynamically quantized (LSTM and
    Linear layers, qint8), loaded with the downloaded state dict on CPU and
    switched to eval mode.

    Returns:
        The quantized model with weights loaded, in eval mode.

    Raises:
        EnvironmentError: If ``SpeechSeparation`` is unset or empty.
    """
    print("Load Separation Model...")

    # Read the Hugging Face token from the environment.
    from huggingface_hub import hf_hub_download

    hf_token = os.getenv("SpeechSeparation")
    if not hf_token:
        raise EnvironmentError("環境變數 SpeechSeparation 未設定！")

    # Fetch the model weights from the Hugging Face Hub.
    checkpoint_path = hf_hub_download(
        repo_id="DeepLearning101/speech-separation",  # replace with your own repo name
        filename="train_dptnet_aishell_partOverlap_B2_300epoch_quan-int8.p",
        token=hf_token,
    )

    # Build the float network from the two configuration dictionaries.
    filterbank_kwargs, masknet_kwargs = get_conf()
    separator = asteroid_test.DPTNet(**filterbank_kwargs, **masknet_kwargs)

    # Quantize before loading; presumably the checkpoint was saved from a model
    # quantized the same way, so the state-dict keys only match afterwards.
    quantized_types = {torch.nn.LSTM, torch.nn.Linear}
    separator = torch.quantization.quantize_dynamic(
        separator, quantized_types, dtype=torch.qint8
    )

    # NOTE(review): torch.load unpickles the file; acceptable only as long as
    # the checkpoint repository is trusted.
    weights = torch.load(checkpoint_path, map_location="cpu")
    separator.load_state_dict(weights)
    separator.eval()
    return separator


def _suffixed_path(path, tag):
    """Return *path* with *tag* inserted immediately before its extension."""
    root, ext = os.path.splitext(os.fspath(path))
    # A path without an extension gets ".wav" so the output never collides
    # with the input and the writer has a format to infer.
    return root + tag + (ext or ".wav")


def _rescale_to_peak(source, target_peak):
    """Scale *source* so that its absolute peak equals *target_peak*."""
    peak = source.abs().max().item()
    if peak == 0:
        # An all-zero estimate has nothing to scale; dividing by its peak
        # would fill the tensor with NaN.
        return source
    return source * target_peak / peak


def dpt_sep_process(wav_path, model=None, outfilename=None):
    """Separate a mixture recording into two sources and write them to disk.

    The two estimated sources are rescaled to the absolute peak of the first
    input channel and saved next to ``outfilename`` (or next to ``wav_path``
    when ``outfilename`` is None) as ``<name>_sep1<ext>`` and
    ``<name>_sep2<ext>``. When ``outfilename`` is given, the loaded mixture is
    additionally saved as ``<name>_mix<ext>``.

    Args:
        wav_path: Path of the audio file to separate.
        model: Separation model to call on the loaded waveform. When None, a
            model is obtained from ``load_dpt_model()``.
        outfilename: Optional path from which the output names are derived.

    Returns:
        None. The results are written to disk.
    """
    if model is None:
        model = load_dpt_model()

    x, sr = torchaudio.load(wav_path)
    x = x.cpu()

    # NOTE(review): the waveform is passed to the model as loaded; the
    # unpacking below assumes a single-channel input so that the output is
    # (1, 2, T) -- confirm behaviour for multi-channel files.
    with torch.no_grad():
        est_sources = model(x)

    # Drop the leading singleton dimension, then split into the two sources.
    sep_1, sep_2 = est_sources.squeeze(0)

    # Match each estimate's peak level to that of the first input channel.
    max_abs = x[0].abs().max().item()
    # Add a channel dimension so each source is saved as (1, T).
    sep_1 = _rescale_to_peak(sep_1, max_abs).unsqueeze(0)
    sep_2 = _rescale_to_peak(sep_2, max_abs).unsqueeze(0)

    # Derive names from the extension rather than str.replace('.wav', ...):
    # replace() leaves the name unchanged when there is no literal ".wav"
    # (overwriting the input and the other outputs) and also rewrites ".wav"
    # inside directory names.
    base_path = wav_path if outfilename is None else outfilename
    torchaudio.save(_suffixed_path(base_path, '_sep1'), sep_1, sr)
    torchaudio.save(_suffixed_path(base_path, '_sep2'), sep_2, sr)
    if outfilename is not None:
        # The mixture is only re-saved when a separate output name is given.
        torchaudio.save(_suffixed_path(outfilename, '_mix'), x, sr)


if __name__ == "__main__":
    # Running this file directly only prints a usage hint.
    usage_hint = "This module should be used via Flask or Gradio."
    print(usage_hint)