|
import gradio as gr |
|
import torch |
|
import os |
|
import soundfile as sf |
|
import librosa |
|
import logging |
|
import tempfile |
|
import traceback |
|
from datetime import datetime |
|
from DPTNet_eval.DPTNet_quant_sep import load_dpt_model, dpt_sep_process |
|
|
|
|
|
# Send INFO-and-above records to app.log with a timestamp/level prefix.
# The file handler is built explicitly so the log is always written as UTF-8:
# the messages in this module are Chinese, and basicConfig(filename=...) would
# otherwise use the locale's default encoding, which may not be able to
# represent them.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('app.log', encoding='utf-8')],
)
logger = logging.getLogger(__name__)
|
|
|
|
|
# Load the separation model once at import time; separate_audio() reuses this
# module-level instance for every request. A load failure is fatal: it is
# logged together with its traceback and re-raised as RuntimeError so the app
# does not start without a model.
try:
    logger.info("開始加載語音分離模型...")
    model = load_dpt_model()
    logger.info("模型加載成功")
except Exception as e:
    # logger.exception records the stack trace alongside the message, matching
    # how request failures are logged in separate_audio().
    logger.exception(f"模型加載失敗: {str(e)}")
    raise RuntimeError("模型初始化失敗") from e
|
|
|
_MAX_UPLOAD_BYTES = 50 * 1024 * 1024  # upload size cap checked before decoding
_TARGET_SAMPLE_RATE = 16000  # audio is resampled to this rate before separation


def separate_audio(input_wav):
    """Separate an uploaded mixture into two WAV files.

    The input is decoded to mono, resampled to 16 kHz when needed, written to
    a temporary 16-bit PCM WAV, and handed to ``dpt_sep_process`` together
    with the module-level ``model``.

    Args:
        input_wav: Filesystem path of the uploaded audio file. ``None`` or an
            empty value (nothing uploaded) is reported as a missing file.

    Returns:
        A list with the paths of the two separated tracks
        (``..._sep1.wav`` and ``..._sep2.wav``) inside a fresh temp directory.

    Raises:
        gr.Error: If the file is missing, larger than 50 MB, the expected
            output files were not produced, or any other step fails. Errors
            raised deliberately in this function keep their own message;
            unexpected exceptions are wrapped with a "處理失敗" prefix.
    """
    import shutil  # local import: only needed to clean up after a failure

    process_id = datetime.now().strftime("%Y%m%d%H%M%S%f")
    temp_wav = None
    out_dir = None
    succeeded = False

    try:
        logger.info(f"[{process_id}] 開始處理檔案: {input_wav}")

        # Validate the upload. A falsy path is rejected here so it never
        # reaches os.path.exists() and fails with an unhelpful TypeError.
        if not input_wav or not os.path.exists(input_wav):
            raise gr.Error("檔案不存在,請重新上傳")
        if os.path.getsize(input_wav) > _MAX_UPLOAD_BYTES:
            raise gr.Error("檔案大小超過50MB限制")

        # Decode at the file's native rate as a single channel.
        logger.info(f"[{process_id}] 讀取音訊檔案...")
        data, sr = librosa.load(input_wav, sr=None, mono=True)

        # Bring the signal to the fixed 16 kHz rate used for separation.
        if sr != _TARGET_SAMPLE_RATE:
            logger.info(f"[{process_id}] 重採樣從 {sr}Hz 到 16000Hz...")
            data = librosa.resample(
                data, orig_sr=sr, target_sr=_TARGET_SAMPLE_RATE
            )
            sr = _TARGET_SAMPLE_RATE

        # Reserve a temp file name, then close the handle before writing to it
        # by name so the write does not go through a second open handle.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            temp_wav = tmp_file.name
        logger.info(f"[{process_id}] 寫入臨時檔案: {temp_wav}")
        sf.write(temp_wav, data, sr, subtype='PCM_16')

        logger.info(f"[{process_id}] 開始語音分離...")
        out_dir = tempfile.mkdtemp()
        outfilename = os.path.join(out_dir, "output.wav")

        dpt_sep_process(temp_wav, model=model, outfilename=outfilename)

        # NOTE(review): assumes dpt_sep_process derives its two output names
        # from outfilename with this same '.wav' replacement - confirm.
        output_files = [
            outfilename.replace('.wav', '_sep1.wav'),
            outfilename.replace('.wav', '_sep2.wav'),
        ]
        logger.info(f"[{process_id}] 預期輸出檔案: {output_files}")

        missing = [f for f in output_files if not os.path.exists(f)]
        if missing:
            raise gr.Error(f"分離失敗,缺失檔案: {missing}")

        logger.info(f"[{process_id}] 處理完成")
        succeeded = True
        return output_files

    except gr.Error as err:
        # Already a user-facing error raised above: log it and let it through
        # unchanged instead of wrapping it in a second gr.Error.
        logger.error(f"[{process_id}] 處理錯誤: {err}")
        raise

    except Exception as e:
        error_msg = f"[{process_id}] 處理錯誤: {str(e)}\n{traceback.format_exc()}"
        logger.error(error_msg)
        raise gr.Error(f"處理失敗: {str(e)}") from e

    finally:
        # The intermediate 16 kHz WAV is never needed after separation.
        if temp_wav and os.path.exists(temp_wav):
            try:
                os.remove(temp_wav)
                logger.info(f"[{process_id}] 已清理臨時檔案")
            except OSError as clean_err:
                logger.warning(f"[{process_id}] 清理失敗: {str(clean_err)}")

        # On success the output directory holds the returned files and must
        # stay; on failure nothing references it, so remove it to avoid
        # leaking a temp directory per failed request.
        if out_dir and not succeeded:
            shutil.rmtree(out_dir, ignore_errors=True)
|
|
|
|
|
# Page description shown in the UI: title link, a usage hint, author links,
# and a list of related articles. Mixed HTML/Markdown; the blank lines are
# kept as they separate the HTML sections from the Markdown heading.
description_html = """
<h1 align='center'><a href='https://www.twman.org/AI/ASR/SpeechSeparation' target='_blank'>中文語者分離(分割)</a></h1>
<p align='center'><b>上傳一段混音音檔 (支援 `.mp3`, `.wav`),自動分離出兩個人的聲音</b></p>

<div align='center'>
<a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D.</a> |
<a href='https://www.twman.org/AI' target='_blank'> AI </a> |
<a href='https://blog.twman.org/p/deeplearning101.html' target='_blank'>手把手帶你一起踩AI坑</a> |
<a href='https://github.com/Deep-Learning-101' target='_blank'>GitHub</a> |
<a href='http://deeplearning101.twman.org' target='_blank'>Deep Learning 101</a> |
<a href='https://www.youtube.com/c/DeepLearning101' target='_blank'>YouTube</a>
</div>

<br>

### 📘 相關技術文章:
<ul>
<li><a href='https://blog.twman.org/2025/03/AIAgent.html' target='_blank'>避開 AI Agent 開發陷阱:常見問題、挑戰與解決方案 (那些 AI Agent 實戰踩過的坑)</a>:探討多種 AI Agent 工具的應用經驗與挑戰</li>
<li><a href='https://blog.twman.org/2024/08/LLM.html' target='_blank'>白話文手把手帶你科普 GenAI</a>:淺顯介紹生成式人工智慧核心概念</li>
<li><a href='https://blog.twman.org/2024/09/LLM.html' target='_blank'>大型語言模型直接就打完收工?</a>:回顧 LLM 領域探索歷程</li>
<li><a href='https://blog.twman.org/2024/07/RAG.html' target='_blank'>檢索增強生成 (Retrieval-Augmented Generation, RAG) 不是萬靈丹之優化挑戰技巧</a>:探討 RAG 技術應用與挑戰</li>
<li><a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>大型語言模型 (LLM) 入門完整指南:原理、應用與未來</a>:探討多種 LLM 工具的應用與挑戰</li>
<li><a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>什麼是大語言模型,它是什麼?想要嗎?(Large Language Model,LLM)</a>:探討 LLM 的發展與應用</li>
<li><a href='https://blog.twman.org/2024/11/diffusion.html' target='_blank'>ComfyUI + Stable Diffusion</a>:深入探討影像生成與分割技術的應用</li>
<li><a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>ASR/TTS 開發避坑指南:語音辨識與合成的常見挑戰與對策</a>:探討 ASR 和 TTS 技術應用中的問題</li>
<li><a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (NLP) 踩的坑</a>:分享 NLP 領域的實踐經驗</li>
<li><a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a>:分享語音處理領域的實務經驗</li>
<li><a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PPOCRLabel來幫PaddleOCR做OCR的微調和標註</a></li>
<li><a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a></li>
<li><a href='https://github.com/shibing624/pycorrector' target='_blank'>Masked Language Model (MLM) as correction BERT</a></li>
</ul>

<br>
"""
|
|
|
if __name__ == "__main__":
    # Wire the separation function into the UI. gr.Interface requires fn,
    # inputs and outputs; calling it with no arguments fails at startup.
    # The upload is passed to separate_audio() as a file path, and the two
    # paths it returns fill the two audio outputs.
    interface = gr.Interface(
        fn=separate_audio,
        inputs=gr.Audio(type="filepath"),
        outputs=[gr.Audio(type="filepath"), gr.Audio(type="filepath")],
        description=description_html,
    )

    # Enable request queuing before launch.
    interface.queue()

    launch_kwargs = {
        "server_name": "0.0.0.0",  # listen on all network interfaces
        "server_port": 7860,
        "share": False,
        "debug": False,
        "max_threads": 2,
    }

    interface.launch(**launch_kwargs)