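"""Gradio demo for two-speaker Chinese speech separation.

Loads a quantized DPTNet model, converts uploaded audio to 16 kHz mono
16-bit WAV, runs separation, and returns the two separated speech tracks.
"""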

import gradio as gr
import torch
import torchaudio
import os
import tempfile
import logging
import traceback
from datetime import datetime

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Warn early if the Space secret is missing rather than failing mid-request.
if not os.getenv("SpeechSeparation"):
    logger.warning("⚠️ Environment variable SpeechSeparation is not set! Please set the SpeechSeparation secret (HF token) in the Hugging Face Space Secrets")
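# Note: this file only checks that the secret exists; the model loader is
# assumed to read it from the environment when it needs to fetch weights.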

# Import the separation entry points from the bundled DPTNet module and fail
# fast with a clear error if the repository layout is wrong.
try:
    logger.info("🔧 Loading the speech separation model module...")
    from DPTNet_eval.DPTNet_quant_sep import load_dpt_model, dpt_sep_process
    logger.info("✅ Model module imported successfully")
except ImportError as e:
    logger.error(f"❌ Module import failed: {str(e)}")
    raise RuntimeError("Local module path is misconfigured") from e

# Load the model once at startup so every request reuses the same instance.
try:
    logger.info("🔄 Initializing model...")
    model = load_dpt_model()
    logger.info(f"🧠 Model loaded, device: {'GPU' if torch.cuda.is_available() else 'CPU'}")
except Exception as e:
    logger.error(f"💣 Model initialization failed: {str(e)}")
    raise RuntimeError("Model loading aborted") from e


def validate_audio(path):
    """Validate the audio file's format and basic properties."""
    try:
        info = torchaudio.info(path)
        logger.info(f"🔊 Audio info: sample rate={info.sample_rate}Hz, channels={info.num_channels}")

        if info.num_channels not in [1, 2]:
            raise gr.Error("❌ Unsupported channel count (only mono or stereo is supported)")

        if info.sample_rate < 8000 or info.sample_rate > 48000:
            raise gr.Error("❌ Unsupported sample rate (must be between 8 kHz and 48 kHz)")

        return info.sample_rate
    except gr.Error:
        # Let specific validation errors through instead of masking them with
        # the generic message below.
        raise
    except Exception as e:
        logger.error(f"⚠️ Audio validation failed: {str(e)}")
        raise gr.Error("❌ Invalid audio file format")


def convert_to_wav(input_path):
    """Convert any supported input to 16 kHz mono 16-bit WAV."""
    try:
        waveform, sample_rate = torchaudio.load(input_path)

        # Downmix stereo to mono by averaging the channels.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample to the 16 kHz rate the model expects.
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
            torchaudio.save(tmpfile.name, waveform, 16000, bits_per_sample=16)
            logger.info(f"📝 Standardized WAV file written: {tmpfile.name}")
            return tmpfile.name

    except Exception as e:
        logger.error(f"⚠️ Audio conversion failed: {str(e)}")
        raise gr.Error("❌ Audio format conversion failed")
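
# Quick local sanity check for the two helpers above (paths are illustrative):
#   wav_path = convert_to_wav("examples/sample1.wav")
#   assert validate_audio(wav_path) == 16000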


def separate_audio(input_audio):
    """Split a mixed recording into two speaker tracks and return their paths."""
    process_id = datetime.now().strftime("%Y%m%d%H%M%S%f")
    temp_wav = None

    try:
        logger.info(f"[{process_id}] 🚀 New request received: {input_audio}")

        # Reject oversized uploads before doing any work.
        if os.path.getsize(input_audio) > 50 * 1024 * 1024:
            raise gr.Error("❌ File exceeds the 50MB limit")

        # Normalize the input to 16 kHz mono WAV, then validate the result.
        logger.info(f"[{process_id}] 🔁 Converting to a standardized WAV...")
        temp_wav = convert_to_wav(input_audio)
        validate_audio(temp_wav)

        # Give each request its own output directory keyed by the process id.
        output_dir = os.path.join("/tmp/gradio_outputs", process_id)
        os.makedirs(output_dir, exist_ok=True)
        outfilename = os.path.join(output_dir, "output.wav")

        logger.info(f"[{process_id}] 🧠 Starting separation...")
        sep_files = dpt_sep_process(temp_wav, model=model, outfilename=outfilename)

        # Every expected output track must exist and be valid audio.
        for f in sep_files:
            if not os.path.exists(f):
                raise gr.Error(f"❌ Missing output file: {f}")
            validate_audio(f)

        logger.info(f"[{process_id}] ✅ Processing complete")
        return sep_files

    except gr.Error:
        # Propagate user-facing errors unchanged.
        raise
    except RuntimeError as e:
        if "CUDA out of memory" in str(e):
            logger.error(f"[{process_id}] 💥 GPU out of memory")
            raise gr.Error("⚠️ Please use a shorter audio clip") from e
        else:
            raise
    except Exception as e:
        logger.error(f"[{process_id}] ❌ Processing failed: {str(e)}\n{traceback.format_exc()}")
        raise gr.Error(f"⚠️ Processing failed: {str(e)}") from e
    finally:
        # Remove the intermediate WAV whether or not separation succeeded.
        if temp_wav and os.path.exists(temp_wav):
            os.unlink(temp_wav)
            logger.info(f"[{process_id}] 🧹 Temporary file cleaned up")


description_html = """
<h1 align='center'><a href='https://www.twman.org/AI/ASR/SpeechSeparation' target='_blank'>Chinese Speaker Separation (Segmentation)</a></h1>
<div align='center'>
<a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D.</a> |
<a href='https://www.twman.org/AI' target='_blank'> AI </a> |
<a href='https://blog.twman.org/p/deeplearning101.html' target='_blank'>A Hands-On Guide Through the Pitfalls of AI</a> |
<a href='https://github.com/Deep-Learning-101' target='_blank'>GitHub</a> |
<a href='http://deeplearning101.twman.org' target='_blank'>Deep Learning 101</a> |
<a href='https://www.youtube.com/c/DeepLearning101' target='_blank'>YouTube</a>
</div>
<br>
<ul>
<li><a href='https://blog.twman.org/2025/03/AIAgent.html' target='_blank'>Avoiding AI Agent Development Pitfalls: Common Problems, Challenges, and Solutions (Lessons from Real-World AI Agent Work)</a>: experiences and challenges in applying various AI Agent tools</li>
<li><a href='https://blog.twman.org/2024/08/LLM.html' target='_blank'>A Plain-Language, Hands-On Introduction to GenAI</a>: an accessible introduction to the core concepts of generative AI</li>
<li><a href='https://blog.twman.org/2024/09/LLM.html' target='_blank'>Are Large Language Models Simply a Done Deal?</a>: a retrospective on explorations in the LLM field</li>
<li><a href='https://blog.twman.org/2024/07/RAG.html' target='_blank'>Retrieval-Augmented Generation (RAG) Is No Silver Bullet: Optimization Challenges and Techniques</a>: applications and challenges of RAG</li>
<li><a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>A Complete Beginner's Guide to Large Language Models (LLMs): Principles, Applications, and the Future</a>: applications and challenges of various LLM tools</li>
<li><a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>What Is a Large Language Model (LLM), and Do You Want One?</a>: the development and applications of LLMs</li>
<li><a href='https://blog.twman.org/2024/11/diffusion.html' target='_blank'>ComfyUI + Stable Diffusion</a>: a deep dive into applying image generation and segmentation techniques</li>
<li><a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>A Guide to Avoiding Pitfalls in ASR/TTS Development: Common Challenges and Countermeasures in Speech Recognition and Synthesis</a>: issues in applying ASR and TTS technologies</li>
<li><a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>Pitfalls in Natural Language Processing (NLP)</a>: practical experience from the NLP field</li>
<li><a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>Pitfalls in Speech Processing</a>: hands-on experience from the speech processing field</li>
<li><a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>Using PPOCRLabel to Annotate and Fine-Tune OCR with PaddleOCR</a></li>
<li><a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>Extracting Medical-Order Information from Diagnosis Certificates with a Unified Information Extraction Framework Based on Machine Reading Comprehension and Instruction Fine-Tuning</a></li>
</ul>
<br>
"""

EXAMPLES = [
    ["examples/sample1.wav"],
    ["examples/sample2.wav"]
]

AUDIO_INPUT = gr.Audio(
    label="🔊 Upload mixed audio",
    type="filepath",
    sources=["upload", "microphone"],
    show_label=True,
    max_length=180  # cap input length at 180 seconds
)

AUDIO_OUTPUTS = [
    gr.Audio(label="🗣️ Speech track 1", type="filepath", format="wav"),
    gr.Audio(label="🗣️ Speech track 2", type="filepath", format="wav")
]

interface = gr.Interface(
    fn=separate_audio,
    inputs=AUDIO_INPUT,
    outputs=AUDIO_OUTPUTS,
    title="🎙️ Speech separation: upload a mixed recording (.mp3 or .wav) and the two speakers' voices are separated automatically; Deep Learning 101",
    description=description_html,
    examples=EXAMPLES,
    allow_flagging="never",
    cache_examples=False,
    theme="default"
)

LAUNCH_CONFIG = {
    "server_name": "0.0.0.0",  # bind to all interfaces so the platform can route traffic
    "server_port": int(os.environ.get("PORT", 7860)),  # honor a platform-assigned port
    "share": False,
    "debug": True,
    "auth": None,
    "inbrowser": True,
    "quiet": False
}

if __name__ == "__main__":
    logger.info("🚀 Starting Gradio service...")
    interface.launch(**LAUNCH_CONFIG)
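
# A minimal client-side sketch (hypothetical Space id; gradio_client is a
# separate install, and file-input handling differs across its versions):
#
#   from gradio_client import Client, handle_file
#   client = Client("user/SpeechSeparation")  # placeholder Space id
#   track1, track2 = client.predict(handle_file("mixed.wav"), api_name="/predict")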