Spaces:

DeepLearning101
/

Speech-Separation

Running

File size: 3,469 Bytes

64ceedd
 
 
 
cf73d23
 
38d7181
 
 
64ceedd
d8be50a
64ceedd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38d7181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64ceedd
38d7181
 
cf73d23
38d7181
64ceedd
cf73d23
d8be50a
64ceedd
 
38d7181
b75ae28
 
64ceedd
b75ae28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38d7181
64ceedd
 
b6c45cb

import os
import torch
import numpy as np
import torchaudio
import yaml
from . import asteroid_test
from huggingface_hub import hf_hub_download

# Select the sox_io backend only when this torchaudio build still exposes the
# global backend switch: the function was deprecated in torchaudio 2.x and is
# absent from later releases, where an unconditional call would raise
# AttributeError at import time and make the whole module unusable.
if hasattr(torchaudio, "set_audio_backend"):
    torchaudio.set_audio_backend("sox_io")


def get_conf():
    """Return the hard-coded model hyper-parameters.

    Returns:
        A ``(conf_filterbank, conf_masknet)`` tuple of freshly built dicts.
        The first holds the filterbank settings (``n_filters``,
        ``kernel_size``, ``stride``); the second holds the mask-network
        settings. Both are meant to be splatted as keyword arguments into the
        model constructor.
    """
    filterbank_settings = dict(n_filters=64, kernel_size=16, stride=8)

    masknet_settings = dict(
        in_chan=64,
        n_src=2,
        out_chan=64,
        ff_hid=256,
        ff_activation="relu",
        norm_type="gLN",
        chunk_size=100,
        hop_size=50,
        n_repeats=2,
        mask_act="sigmoid",
        bidirectional=True,
        dropout=0,
    )

    return filterbank_settings, masknet_settings


def load_dpt_model():
    """Build the dynamically quantized DPTNet and load its weights from the Hub.

    The Hugging Face access token is read from the ``SpeechSeparation``
    environment variable and used to download the checkpoint file from the
    ``DeepLearning101/speech-separation`` repository. ``DPTNet`` from
    ``asteroid_test`` is then constructed with the settings returned by
    ``get_conf()``, its LSTM and Linear layers are dynamically quantized to
    int8, the downloaded state dict is loaded on CPU, and the model is put
    into eval mode.

    Returns:
        The quantized model in eval mode.

    Raises:
        EnvironmentError: If ``SpeechSeparation`` is unset or empty.
    """
    print('Load Separation Model...')

    # Read the Hugging Face token from the environment.
    from huggingface_hub import hf_hub_download

    hub_token = os.getenv("SpeechSeparation")
    if not hub_token:
        raise EnvironmentError("環境變數 SpeechSeparation 未設定！")

    # Download the model weights from the Hugging Face Hub
    # (replace repo_id with your own repository name if needed).
    checkpoint_file = hf_hub_download(
        repo_id="DeepLearning101/speech-separation",
        filename="train_dptnet_aishell_partOverlap_B2_300epoch_quan-int8.p",
        token=hub_token,
    )

    filterbank_kwargs, masknet_kwargs = get_conf()
    separator = asteroid_test.DPTNet(**filterbank_kwargs, **masknet_kwargs)

    # Quantize before loading: the checkpoint name suggests it stores int8
    # weights, so presumably its keys only match the quantized module layout
    # -- TODO confirm.
    layers_to_quantize = {torch.nn.LSTM, torch.nn.Linear}
    separator = torch.quantization.quantize_dynamic(
        separator, layers_to_quantize, dtype=torch.qint8
    )

    separator.load_state_dict(torch.load(checkpoint_file, map_location="cpu"))
    separator.eval()
    return separator


import torchaudio
import tempfile

def dpt_sep_process(wav_path, model=None, outfilename=None):
    """Separate a two-speaker mixture and write each estimated source as WAV.

    The audio is loaded with torchaudio, averaged over its first dimension
    down to one channel, resampled to 16 kHz when the file uses another rate,
    and passed through the model. The first two estimated sources are rescaled
    to 90% of the input's peak amplitude and written next to each other.

    Args:
        wav_path: Path of the mixture to separate.
        model: Separation model called on the waveform. When ``None`` a model
            is created with ``load_dpt_model()``.
        outfilename: Base output path. ``foo.wav`` yields ``foo_sep1.wav`` and
            ``foo_sep2.wav``; a name without a ``.wav`` suffix gets
            ``_sep1.wav`` / ``_sep2.wav`` appended. When ``None`` the base is
            ``wav_path`` with its extension dropped.

    Returns:
        Tuple ``(final_sep1, final_sep2)`` with the paths of the written files.

    Raises:
        RuntimeError: On any failure; the original exception is chained as
            the cause.
    """
    try:
        if model is None:
            model = load_dpt_model()

        # Give torchaudio a format hint only when the file name really has an
        # extension; otherwise let it detect the format itself.
        ext = os.path.splitext(wav_path)[1].lstrip('.').lower()
        x, sr = torchaudio.load(wav_path, format=ext or None)
        x = x.mean(dim=0, keepdim=True)  # force mono

        # Resample automatically to the 16 kHz rate used for inference.
        if sr != 16000:
            resampler = torchaudio.transforms.Resample(sr, 16000)
            x = resampler(x)
            sr = 16000

        with torch.no_grad():
            est_sources = model(x)

        # assumes the model returns (1, n_src, time) -- TODO confirm
        est_sources = est_sources.squeeze(0)
        sep_1, sep_2 = est_sources[0], est_sources[1]

        # Rescale each source to 90% of the input peak. The divisor is clamped
        # so an all-zero (silent) estimate stays silent instead of becoming NaN.
        eps = 1e-8
        peak = 0.9 * torch.max(torch.abs(x))
        sep_1 = peak * sep_1 / torch.max(torch.abs(sep_1)).clamp_min(eps)
        sep_2 = peak * sep_2 / torch.max(torch.abs(sep_2)).clamp_min(eps)

        # Build two distinct output paths by inserting the tag before the
        # final ".wav" suffix only.
        out_base = os.fspath(wav_path if outfilename is None else outfilename)
        stem, suffix = os.path.splitext(out_base)
        if outfilename is None:
            # Derived from the input file: the result is always WAV data.
            suffix = '.wav'
        elif suffix.lower() != '.wav':
            # Not a .wav name: keep it whole and append the tag plus ".wav".
            stem, suffix = out_base, '.wav'
        final_sep1 = f"{stem}_sep1{suffix}"
        final_sep2 = f"{stem}_sep2{suffix}"

        # Stage both files in a temporary directory created beside the
        # destination, so os.replace never has to cross a filesystem boundary
        # (it fails when source and target are on different devices).
        target_dir = os.path.dirname(final_sep1) or '.'
        with tempfile.TemporaryDirectory(dir=target_dir) as tmp_dir:
            sep1_path = os.path.join(tmp_dir, "sep1.wav")
            sep2_path = os.path.join(tmp_dir, "sep2.wav")

            torchaudio.save(sep1_path, sep_1.unsqueeze(0), sr)
            torchaudio.save(sep2_path, sep_2.unsqueeze(0), sr)

            # Move the files to their final location only after both saved.
            os.replace(sep1_path, final_sep1)
            os.replace(sep2_path, final_sep2)

        return final_sep1, final_sep2

    except Exception as e:
        raise RuntimeError(f"分離過程錯誤: {str(e)}") from e


if __name__ == "__main__":
    # Running the file directly does no work: it only prints a usage notice.
    print("This module should be used via Flask or Gradio.")