# Spaces: DeepLearning101 / Speech-Separation (Running)
# File size: 5,884 Bytes

import gradio as gr
import torch
import os
import soundfile as sf
import numpy as np
import librosa
import warnings
import tempfile
from DPTNet_eval.DPTNet_quant_sep import load_dpt_model, dpt_sep_process

# Suppress UserWarning and FutureWarning messages
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the model once at import time (module-level global read by separate_audio)
model = load_dpt_model()

def separate_audio(input_wav):
    """Separate a mixed recording into two speaker tracks.

    The input is loaded as mono, resampled to 16 kHz when necessary, written
    to a temporary 16-bit PCM WAV file and passed to ``dpt_sep_process``
    together with the module-level ``model``.

    Args:
        input_wav: Path of the audio file to separate (the Gradio input
            component is configured with ``type="filepath"``).

    Returns:
        A two-element list with the paths of the separated tracks
        (``..._sep1.wav`` and ``..._sep2.wav``).

    Raises:
        gr.Error: If no file was supplied, any processing step fails, or the
            expected output files are missing after separation.
    """
    temp_wav = None
    try:
        # Guard against an empty submission so the user gets a clear message
        # instead of an obscure loader error.
        if not input_wav:
            raise ValueError("未提供音訊檔案，請先上傳音檔")

        # Step 1: read the audio as mono at its native sample rate
        data, sr = librosa.load(input_wav, sr=None, mono=True)

        # Step 2: force the sample rate to 16 kHz
        if sr != 16000:
            data = librosa.resample(data, orig_sr=sr, target_sr=16000)
            sr = 16000

        # Step 3: reserve a unique temporary path. The handle is closed before
        # soundfile reopens the path by name, which is required on platforms
        # that forbid opening a file twice (e.g. Windows).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            temp_wav = tmp_file.name
        sf.write(temp_wav, data, sr, subtype='PCM_16')

        # Step 4: run the separation. Each request gets its own output
        # directory so concurrent requests cannot overwrite each other's
        # results. The directory lives under the system temp dir and is left
        # in place because the caller still needs the files after we return.
        out_dir = tempfile.mkdtemp(prefix="speech_sep_")
        outfilename = os.path.join(out_dir, "output.wav")
        dpt_sep_process(temp_wav, model=model, outfilename=outfilename)

        # Step 5: verify the output files exist.
        # NOTE(review): assumes dpt_sep_process derives its output names by
        # replacing '.wav' with '_sep1.wav' / '_sep2.wav' and accepts a path
        # that includes a directory - confirm against DPTNet_quant_sep.
        output_files = [
            outfilename.replace('.wav', '_sep1.wav'),
            outfilename.replace('.wav', '_sep2.wav')
        ]
        if not all(os.path.exists(f) for f in output_files):
            raise gr.Error("分離過程中發生錯誤，請檢查輸入檔案格式！")

        return output_files

    except Exception as e:
        # Surface every failure to the UI as a Gradio error
        error_msg = f"處理失敗：{str(e)}"
        raise gr.Error(error_msg) from e

    finally:
        # Step 6: always remove the temporary input file, even on failure
        if temp_wav is not None:
            try:
                os.remove(temp_wav)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not mask the
                # real result or error.
                pass

# Page description (HTML mixed with Markdown) passed to gr.Interface as `description`
description_html = """
<h1 align='center'><a href='https://www.twman.org/AI/ASR/SpeechSeparation' target='_blank'>中文語者分離(分割)</a></h1>
<p align='center'><b>上傳一段混音音檔，自動分離出兩個人的聲音</b></p>

<div align='center'>
  <a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D.</a> |
  <a href='https://blog.twman.org/p/deeplearning101.html' target='_blank'>手把手帶你一起踩AI坑</a> |
  <a href='https://github.com/Deep-Learning-101' target='_blank'>GitHub</a> |
  <a href='http://deeplearning101.twman.org' target='_blank'>Deep Learning 101</a>
</div>

<br>

### 🔍 使用方式：
- 上傳一段包含兩人對話的混音音檔（支援 `.mp3`, `.wav`）
- 點擊「Separate」按鈕
- 分離出兩個說話人的音軌

<br>

### 📘 相關技術文章：
<ul>
  <li><a href='https://blog.twman.org/2025/03/AIAgent.html' target='_blank'>避開 AI Agent 開發陷阱：常見問題、挑戰與解決方案 (那些 AI Agent 實戰踩過的坑)</a>：探討多種 AI Agent 工具的應用經驗與挑戰</li>
  <li><a href='https://blog.twman.org/2024/08/LLM.html' target='_blank'>白話文手把手帶你科普 GenAI</a>：淺顯介紹生成式人工智慧核心概念</li>
  <li><a href='https://blog.twman.org/2024/09/LLM.html' target='_blank'>大型語言模型直接就打完收工？</a>：回顧 LLM 領域探索歷程</li>
  <li><a href='https://blog.twman.org/2024/07/RAG.html' target='_blank'>檢索增強生成 (Retrieval-Augmented Generation, RAG) 不是萬靈丹之優化挑戰技巧</a>：探討 RAG 技術應用與挑戰</li>
  <li><a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>大型語言模型 (LLM) 入門完整指南：原理、應用與未來</a>：探討多種 LLM 工具的應用與挑戰</li>
  <li><a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>什麼是大語言模型，它是什麼？想要嗎？(Large Language Model，LLM)</a>：探討 LLM 的發展與應用</li>
  <li><a href='https://blog.twman.org/2024/11/diffusion.html' target='_blank'>ComfyUI + Stable Diffusion</a>：深入探討影像生成與分割技術的應用</li>
  <li><a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>ASR/TTS 開發避坑指南：語音辨識與合成的常見挑戰與對策</a>：探討 ASR 和 TTS 技術應用中的問題</li>
  <li><a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (NLP) 踩的坑</a>：分享 NLP 領域的實踐經驗</li>
  <li><a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a>：分享語音處理領域的實務經驗</li>
  <li><a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PPOCRLabel來幫PaddleOCR做OCR的微調和標註</a></li>
  <li><a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a></li>
  <li><a href='https://github.com/shibing624/pycorrector' target='_blank'>Masked Language Model (MLM) as correction BERT</a></li>
</ul>

<br><br>

📢 *本模型基於 PyTorch + Hugging Face Hub 私有模型部署*
"""

if __name__ == "__main__":
    # Input widget: the uploaded mixture is handed to the callback as a file path
    mixture_input = gr.Audio(
        type="filepath",
        label="請上傳混音音檔 (支援格式：mp3/wav/ogg)",
        max_length=300,  # cap uploads at 5 minutes
    )

    # Output widgets: one player per separated track
    track_outputs = [
        gr.Audio(label="語音軌道 1"),
        gr.Audio(label="語音軌道 2"),
    ]

    # Bundled example inputs, one per row
    example_rows = [
        [os.path.join("examples", sample)]
        for sample in ("sample1.wav", "sample2.mp3")
    ]

    # Assemble the Gradio interface
    interface = gr.Interface(
        fn=separate_audio,
        inputs=mixture_input,
        outputs=track_outputs,
        title="🎙️ 語音分離 Demo - Deep Learning 101",
        description=description_html,
        allow_flagging="never",
        examples=example_rows,
    )

    # Start the server: all interfaces, port 7860, no share link, debug off
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=False,
    )
    interface.launch(**launch_options)