Spaces:

DeepLearning101
/

Speech-Separation

Running

File size: 7,746 Bytes

40390b1
 
bb38b9e
 
 
c6c5dd9
e56b358
c6c5dd9
 
e56b358
40390b1
c6c5dd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40390b1
 
c6c5dd9
 
 
 
e56b358
c6c5dd9
 
 
 
 
 
 
 
 
 
e56b358
 
c6c5dd9
bb38b9e
c6c5dd9
bb38b9e
e56b358
c6c5dd9
 
e56b358
 
c6c5dd9
e56b358
 
c6c5dd9
 
 
 
 
e56b358
 
c6c5dd9
e56b358
 
 
 
c6c5dd9
 
 
e56b358
c6c5dd9
 
e56b358
c6c5dd9
e56b358
 
 
c6c5dd9
 
 
 
 
 
 
 
 
 
 
 
40390b1
 
 
 
fd0c313
40390b1
 
fd0c313
 
40390b1
 
fd0c313
 
40390b1
 
bb38b9e
40390b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd0c313
40390b1
 
 
013222f
40390b1
013222f
 
 
 
 
fbc7d22
013222f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40390b1
013222f
 
 
f631c4e
 
 
013222f
f631c4e
013222f
 
 
 
f631c4e
013222f
f631c4e

import gradio as gr
import torch
import os
import soundfile as sf
import librosa
import logging
import tempfile
import traceback
from datetime import datetime
from DPTNet_eval.DPTNet_quant_sep import load_dpt_model, dpt_sep_process

# Logging setup: ask the root logger to record INFO and above to app.log,
# one line per record with timestamp, level and message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='app.log',
)
# Module-level logger used by the rest of this file.
logger = logging.getLogger(__name__)

# Load the separation model once at import time so that requests reuse the
# same instance instead of reloading it. A failure is logged and re-raised as
# RuntimeError (chained to the original exception), which aborts start-up.
try:
    logger.info("開始加載語音分離模型...")
    model = load_dpt_model()
    logger.info("模型加載成功")
except Exception as load_err:
    failure_note = f"模型加載失敗: {load_err!s}"
    logger.error(failure_note)
    raise RuntimeError("模型初始化失敗") from load_err

def separate_audio(input_wav):
    """Separate a mixed recording into two audio files.

    The input file is loaded as mono, resampled to 16 kHz when its native
    rate differs, written to a temporary 16-bit PCM WAV file and passed to
    ``dpt_sep_process`` together with the module-level ``model``. The two
    expected result files (``output_sep1.wav`` / ``output_sep2.wav`` in a
    fresh temporary directory) are then checked for existence.

    Args:
        input_wav: Path of the audio file to separate. ``None`` or an empty
            value is rejected with a user-facing error.

    Returns:
        A two-element list with the paths of the separated WAV files.

    Raises:
        gr.Error: If the input is missing, larger than 50 MB, the expected
            output files were not produced, or any other step fails (the
            original exception is chained as the cause).
    """
    max_bytes = 50 * 1024 * 1024  # upload size limit (50 MB)
    target_sr = 16000  # sample rate the audio is converted to before separation
    process_id = datetime.now().strftime("%Y%m%d%H%M%S%f")
    temp_wav = None

    try:
        logger.info(f"[{process_id}] 開始處理檔案: {input_wav}")

        # 1. Validate the input file. The emptiness check comes first because
        #    os.path.exists(None) raises TypeError rather than returning False.
        if not input_wav or not os.path.exists(input_wav):
            raise gr.Error("檔案不存在，請重新上傳")
        if os.path.getsize(input_wav) > max_bytes:
            raise gr.Error("檔案大小超過50MB限制")

        # 2. Load the audio at its native sample rate, down-mixed to mono.
        logger.info(f"[{process_id}] 讀取音訊檔案...")
        data, sr = librosa.load(input_wav, sr=None, mono=True)

        # 3. Resample only when the native rate differs from the target.
        if sr != target_sr:
            logger.info(f"[{process_id}] 重採樣從 {sr}Hz 到 {target_sr}Hz...")
            data = librosa.resample(data, orig_sr=sr, target_sr=target_sr)
            sr = target_sr

        # 4. Reserve a temporary file name, then write to it only after the
        #    handle is closed: reopening a still-open NamedTemporaryFile by
        #    name is not portable (it fails on Windows).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            temp_wav = tmp_file.name
        logger.info(f"[{process_id}] 寫入臨時檔案: {temp_wav}")
        sf.write(temp_wav, data, sr, subtype='PCM_16')

        # 5. Run the separation. The output directory is not removed here
        #    because the returned paths must still exist for the caller.
        logger.info(f"[{process_id}] 開始語音分離...")
        out_dir = tempfile.mkdtemp()
        out_base = os.path.join(out_dir, "output")
        outfilename = out_base + ".wav"

        dpt_sep_process(temp_wav, model=model, outfilename=outfilename)

        # 6. Expected result files. They are built from the base name so that
        #    a ".wav" occurring elsewhere in the path cannot be rewritten.
        output_files = [out_base + "_sep1.wav", out_base + "_sep2.wav"]
        logger.info(f"[{process_id}] 預期輸出檔案: {output_files}")

        # 7. Verify that both files were actually produced.
        missing = [f for f in output_files if not os.path.exists(f)]
        if missing:
            raise gr.Error(f"分離失敗，缺失檔案: {missing}")

        logger.info(f"[{process_id}] 處理完成")
        return output_files

    except gr.Error as user_err:
        # Already a user-facing error: log it and pass it through unchanged
        # instead of wrapping it in a second "處理失敗" message.
        logger.error(f"[{process_id}] 處理錯誤: {user_err}")
        raise

    except Exception as e:
        error_msg = f"[{process_id}] 處理錯誤: {str(e)}\n{traceback.format_exc()}"
        logger.error(error_msg)
        raise gr.Error(f"處理失敗: {str(e)}") from e

    finally:
        # Best-effort removal of the intermediate WAV; a failure here must
        # not mask the result or the original error.
        if temp_wav and os.path.exists(temp_wav):
            try:
                os.remove(temp_wav)
                logger.info(f"[{process_id}] 已清理臨時檔案")
            except OSError as clean_err:
                logger.warning(f"[{process_id}] 清理失敗: {str(clean_err)}")

# Description text passed to the Gradio interface: HTML markup (title link,
# author/navigation links, list of related articles) plus one Markdown
# heading line. Everything inside the literal is user-facing runtime text.
description_html = """
<h1 align='center'><a href='https://www.twman.org/AI/ASR/SpeechSeparation' target='_blank'>中文語者分離(分割)</a></h1>
<p align='center'><b>上傳一段混音音檔 （支援 `.mp3`, `.wav`），自動分離出兩個人的聲音</b></p>

<div align='center'>
  <a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D.</a> | 
  <a href='https://www.twman.org/AI' target='_blank'> AI </a> |
  <a href='https://blog.twman.org/p/deeplearning101.html' target='_blank'>手把手帶你一起踩AI坑</a> |
  <a href='https://github.com/Deep-Learning-101' target='_blank'>GitHub</a> |
  <a href='http://deeplearning101.twman.org' target='_blank'>Deep Learning 101</a> |
  <a href='https://www.youtube.com/c/DeepLearning101' target='_blank'>YouTube</a>  
</div>

<br>

### 📘 相關技術文章：
<ul>
  <li><a href='https://blog.twman.org/2025/03/AIAgent.html' target='_blank'>避開 AI Agent 開發陷阱：常見問題、挑戰與解決方案 (那些 AI Agent 實戰踩過的坑)</a>：探討多種 AI Agent 工具的應用經驗與挑戰</li>
  <li><a href='https://blog.twman.org/2024/08/LLM.html' target='_blank'>白話文手把手帶你科普 GenAI</a>：淺顯介紹生成式人工智慧核心概念</li>
  <li><a href='https://blog.twman.org/2024/09/LLM.html' target='_blank'>大型語言模型直接就打完收工？</a>：回顧 LLM 領域探索歷程</li>
  <li><a href='https://blog.twman.org/2024/07/RAG.html' target='_blank'>檢索增強生成 (Retrieval-Augmented Generation, RAG) 不是萬靈丹之優化挑戰技巧</a>：探討 RAG 技術應用與挑戰</li>
  <li><a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>大型語言模型 (LLM) 入門完整指南：原理、應用與未來</a>：探討多種 LLM 工具的應用與挑戰</li>
  <li><a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>什麼是大語言模型，它是什麼？想要嗎？(Large Language Model，LLM)</a>：探討 LLM 的發展與應用</li>
  <li><a href='https://blog.twman.org/2024/11/diffusion.html' target='_blank'>ComfyUI + Stable Diffuision</a>：深入探討影像生成與分割技術的應用</li>
  <li><a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>ASR/TTS 開發避坑指南：語音辨識與合成的常見挑戰與對策</a>：探討 ASR 和 TTS 技術應用中的問題</li>
  <li><a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (NLP) 踩的坑</a>：分享 NLP 領域的實踐經驗</li>
  <li><a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a>：分享語音處理領域的實務經驗</li>
  <li><a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PPOCRLabel來幫PaddleOCR做OCR的微調和標註</a></li>
  <li><a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a></li>
  <li><a href='https://github.com/shibing624/pycorrector' target='_blank'>Masked Language Model (MLM) as correction BERT</a></li>
</ul>

<br>
"""

if __name__ == "__main__":
    # Build the Gradio UI: one audio input (upload or microphone, passed to
    # separate_audio as a file path) and two audio outputs, one per track.
    #
    # NOTE(review): `allow_flagging` and `allow_screenshot` were dropped.
    # `flagging_mode` already disables flagging and `allow_flagging` is its
    # older alias; `allow_screenshot` looks like a long-removed option that
    # current Gradio releases do not accept — confirm against the pinned
    # Gradio version.
    interface = gr.Interface(
        fn=separate_audio,
        inputs=gr.Audio(
            type="filepath",
            label="請上傳混音音檔 (支援格式: mp3/wav/ogg)",
            sources=["upload", "microphone"],
            max_length=180
        ),
        outputs=[
            gr.Audio(label="語音軌道 1", format="wav"),
            gr.Audio(label="語音軌道 2", format="wav")
        ],
        title="🎙️ 語音分離 Demo - Deep Learning 101",
        description=description_html,  # HTML/Markdown description text
        flagging_mode="never",
        live=True,
        examples=[
            ["examples/sample1.wav"],
            ["examples/sample2.mp3"]
        ],
        theme="default"
    )

    # Allow at most two requests to be processed at the same time.
    # `concurrency_count` is no longer accepted by `queue()` in the Gradio
    # generation this file targets (it already uses `sources=` and
    # `flagging_mode=`); `default_concurrency_limit` is the replacement.
    interface.queue(default_concurrency_limit=2)

    launch_kwargs = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": False,
        "auth": None,
        "inbrowser": True,
        "quiet": False,
        # Must be False when run as a script: with True, launch() returns
        # immediately, the script reaches its end and the server goes down
        # with the process.
        "prevent_thread_lock": False
    }

    interface.launch(**launch_kwargs)