import logging
import os
import shutil
import tempfile
import traceback
from datetime import datetime

import gradio as gr
import torch
import soundfile as sf
import librosa

from DPTNet_eval.DPTNet_quant_sep import load_dpt_model, dpt_sep_process

# Upload size ceiling enforced before any decoding work is done.
MAX_UPLOAD_BYTES = 50 * 1024 * 1024
# Sample rate every input is converted to before it is handed to the model.
TARGET_SAMPLE_RATE = 16000

# Configure file logging. An explicit FileHandler is used so the log file is
# always written as UTF-8: every message below contains CJK text, which the
# platform default encoding is not guaranteed to be able to represent.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('app.log', encoding='utf-8')],
)
logger = logging.getLogger(__name__)

# Load the model once at import time so requests do not reload it.
try:
    logger.info("開始加載語音分離模型...")
    model = load_dpt_model()
    logger.info("模型加載成功")
except Exception as e:
    logger.exception("模型加載失敗: %s", e)
    raise RuntimeError("模型初始化失敗") from e


def separate_audio(input_wav):
    """Separate a mixed recording into two tracks.

    The input is decoded to mono, resampled to ``TARGET_SAMPLE_RATE`` when
    needed, written to a temporary 16-bit PCM wav and passed to
    ``dpt_sep_process`` together with the module-level ``model``.

    Args:
        input_wav: Filesystem path of the uploaded/recorded audio, or ``None``
            when no audio has been provided.

    Returns:
        A list with the two output paths, ``<out_dir>/output_sep1.wav`` and
        ``<out_dir>/output_sep2.wav``. The files live in a fresh temporary
        directory that is intentionally left in place on success, because the
        caller still has to read them.

    Raises:
        gr.Error: If the input is missing, larger than ``MAX_UPLOAD_BYTES``,
            the expected output files were not produced, or any other
            processing step fails (the original exception is chained).
    """
    process_id = datetime.now().strftime("%Y%m%d%H%M%S%f")
    temp_wav = None
    out_dir = None
    succeeded = False

    try:
        logger.info(f"[{process_id}] 開始處理檔案: {input_wav}")

        # 1. Validate the input file. A live interface calls this function
        #    with None when the audio component is cleared.
        if not input_wav:
            raise gr.Error("請先上傳音訊檔案")
        if not os.path.exists(input_wav):
            raise gr.Error("檔案不存在，請重新上傳")
        if os.path.getsize(input_wav) > MAX_UPLOAD_BYTES:
            raise gr.Error("檔案大小超過50MB限制")

        # 2. Decode to mono at the file's native sample rate.
        logger.info(f"[{process_id}] 讀取音訊檔案...")
        data, sr = librosa.load(input_wav, sr=None, mono=True)

        # 3. Resample only when the native rate differs from the target.
        if sr != TARGET_SAMPLE_RATE:
            logger.info(f"[{process_id}] 重採樣從 {sr}Hz 到 16000Hz...")
            data = librosa.resample(
                data, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE
            )
            sr = TARGET_SAMPLE_RATE

        # 4. Reserve a temporary path. The handle is closed before writing:
        #    only the name is needed, and soundfile opens the path itself.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            temp_wav = tmp_file.name
        logger.info(f"[{process_id}] 寫入臨時檔案: {temp_wav}")
        sf.write(temp_wav, data, sr, subtype='PCM_16')

        # 5. Run the separation into a dedicated temporary directory.
        logger.info(f"[{process_id}] 開始語音分離...")
        out_dir = tempfile.mkdtemp()
        outfilename = os.path.join(out_dir, "output.wav")
        dpt_sep_process(temp_wav, model=model, outfilename=outfilename)

        # 6. Paths the separation step is expected to have written.
        #    NOTE(review): the "_sep1"/"_sep2" suffix convention is assumed to
        #    match dpt_sep_process - confirm against that helper.
        output_files = [
            outfilename.replace('.wav', '_sep1.wav'),
            outfilename.replace('.wav', '_sep2.wav')
        ]
        logger.info(f"[{process_id}] 預期輸出檔案: {output_files}")

        # 7. Fail loudly if either track is missing.
        missing = [f for f in output_files if not os.path.exists(f)]
        if missing:
            raise gr.Error(f"分離失敗，缺失檔案: {missing}")

        logger.info(f"[{process_id}] 處理完成")
        succeeded = True
        return output_files

    except gr.Error as e:
        # Already a user-facing error: log it and pass it through unchanged
        # instead of wrapping it a second time.
        logger.warning(f"[{process_id}] 處理錯誤: {str(e)}")
        raise
    except Exception as e:
        # logger.exception appends the active traceback to the record.
        logger.exception(f"[{process_id}] 處理錯誤: {str(e)}")
        raise gr.Error(f"處理失敗: {str(e)}") from e

    finally:
        # The intermediate wav is never needed after separation.
        if temp_wav and os.path.exists(temp_wav):
            try:
                os.remove(temp_wav)
                logger.info(f"[{process_id}] 已清理臨時檔案")
            except OSError as clean_err:
                logger.warning(f"[{process_id}] 清理失敗: {str(clean_err)}")
        # On failure nothing will ever read the output directory, so remove
        # it; on success it must survive because the returned paths point
        # into it.
        if out_dir and not succeeded:
            shutil.rmtree(out_dir, ignore_errors=True)


# Description text passed to the interface below.
description_html = """

中文語者分離(分割)

上傳一段混音音檔（支援 `.mp3`, `.wav`），自動分離出兩個人的聲音

### 📘 相關技術文章：

避開 AI Agent 開發陷阱：常見問題、挑戰與解決方案 (那些 AI Agent 實戰踩過的坑)：探討多種 AI Agent 工具的應用經驗與挑戰
白話文手把手帶你科普 GenAI：淺顯介紹生成式人工智慧核心概念
大型語言模型直接就打完收工？：回顧 LLM 領域探索歷程
檢索增強生成 (Retrieval-Augmented Generation, RAG) 不是萬靈丹之優化挑戰技巧：探討 RAG 技術應用與挑戰
大型語言模型 (LLM) 入門完整指南：原理、應用與未來：探討多種 LLM 工具的應用與挑戰
什麼是大語言模型，它是什麼？想要嗎？(Large Language Model，LLM)：探討 LLM 的發展與應用
ComfyUI + Stable Diffuision：深入探討影像生成與分割技術的應用
ASR/TTS 開發避坑指南：語音辨識與合成的常見挑戰與對策：探討 ASR 和 TTS 技術應用中的問題
那些自然語言處理 (NLP) 踩的坑：分享 NLP 領域的實踐經驗
那些語音處理 (Speech Processing) 踩的坑：分享語音處理領域的實務經驗
用PPOCRLabel來幫PaddleOCR做OCR的微調和標註
基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析
Masked Language Model (MLM) as correction BERT

"""

if __name__ == "__main__":
    # Build the Gradio interface.
    interface = gr.Interface(
        fn=separate_audio,
        inputs=gr.Audio(
            type="filepath",
            label="請上傳混音音檔 (支援格式: mp3/wav/ogg)",
            sources=["upload", "microphone"],
            max_length=180
        ),
        outputs=[
            gr.Audio(label="語音軌道 1", format="wav"),
            gr.Audio(label="語音軌道 2", format="wav")
        ],
        title="🎙️ 語音分離 Demo - Deep Learning 101",
        description=description_html,
        # `flagging_mode` is the current spelling; the deprecated duplicate
        # `allow_flagging` and the legacy `allow_screenshot` option are no
        # longer passed.
        flagging_mode="never",
        live=True,
        examples=[
            ["examples/sample1.wav"],
            ["examples/sample2.mp3"]
        ],
        theme="default"
    )

    # `concurrency_count` belongs to the pre-4.0 queue API; the per-event
    # default limit is the replacement.
    interface.queue(default_concurrency_limit=2)

    launch_kwargs = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": False,
        "auth": None,
        "inbrowser": True,
        "quiet": False,
        # Must block here: this is the last statement of the script, and a
        # non-blocking launch lets the process (and the server) exit at once.
        "prevent_thread_lock": False
    }
    interface.launch(**launch_kwargs)