Spaces:

DeepLearning101
/

Speech-Quality-Inspection_Meta-Denoiser

Running

App Files Files Community

DeepLearning101 committed 20 days ago

Commit

6c96ec5

verified ·

1 Parent(s): f162d25

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -54

app.py CHANGED Viewed

@@ -1,47 +1,114 @@
 import os
-import time
-import json
 import gradio as gr
 import torch
 import torchaudio
 import numpy as np
 from denoiser.demucs import Demucs
 from pydub import AudioSegment
 modelpath = './denoiser/master64.th'
 def transcribe(file_upload, microphone):
     # Old-side version of the denoising entry point: loads a Demucs model,
     # runs it on one audio file and returns the path of the written result
     # ("enhanced.wav"). Raises ValueError when the clip is longer than
     # MAX_AUDIO_SECONDS.
     # The microphone path wins when it is not None; otherwise the upload is used.
     file = microphone if microphone is not None else file_upload
     # Load the model
     # NOTE(review): the network is rebuilt and its weights are re-read from
     # `modelpath` on every call.
     model = Demucs(hidden=64)
     state_dict = torch.load(modelpath, map_location='cpu')
     model.load_state_dict(state_dict)
     # Load the audio and force it to mono
-    x, sr = torchaudio.load(file, channels_first=True)  # Load the audio
-    # Added: audio length check (inserted here)
-    MAX_AUDIO_SECONDS = 600  # 10-minute limit
-    if x.shape[1] / sr > MAX_AUDIO_SECONDS:
-        raise ValueError(f"音訊長度不可超過 {MAX_AUDIO_SECONDS} 秒，當前音訊長度：{x.shape[1]/sr:.1f} 秒")
-    # Mono conversion
     # Average over dim 0 only when it holds more than one entry.
     if x.shape[0] > 1:
         x = torch.mean(x, dim=0, keepdim=True)
     # Run denoising
     # `x[None]` adds a leading axis (presumably the batch axis the model
     # expects - TODO confirm); `[0]` removes it again from the output.
-    out = model(x[None])[0]
     # Post-processing
     # Divide by the peak only when the peak exceeds 1, so quieter output is
     # left unchanged and louder output is scaled back into [-1, 1].
     out = out / max(out.abs().max().item(), 1)
     torchaudio.save('enhanced.wav', out, sr)
-    # Lower the bitrate (for speech-recognition use only)
-    enhanced = AudioSegment.from_wav('enhanced.wav')
-    enhanced.export('enhanced.wav', format="wav", bitrate="256k")
-    return "enhanced.wav"
 # import os
 # import time
@@ -113,38 +180,3 @@ def transcribe(file_upload, microphone):
 #     "microphone": str,
 #     "return": str
 # }
-demo = gr.Interface(
     # Old-side UI definition: wires `transcribe` to one audio input and one
     # audio output, then launches the app.
     # NOTE(review): `transcribe` declares two positional parameters
     # (file_upload, microphone) but only one input component is listed
     # below - confirm how Gradio invokes `fn` in this configuration.
-    fn=transcribe,
-    inputs=[
-        gr.Audio(type="filepath", label="語音質檢原始音檔", sources=["upload", "microphone"])  # explicitly specify the sources
-    ],
-    outputs=[
-        gr.Audio(type="filepath", label="Output")  # keep the list form
-    ],
-    live=True,
-    allow_flagging="never",
-    title="<h1>語音質檢/噪音去除 (語音增強)</h1>",
-    description="""<h2><a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D.</a> | <a href='https://blog.twman.org/p/deeplearning101.html' target='_blank'>手把手帶你一起踩AI坑</a><br></h2><br>
-                為了提升語音識別的效果，可以在識別前先進行噪音去除<br>
-                    <a href='https://github.com/Deep-Learning-101' target='_blank'>Deep Learning 101 Github</a> | <a href='http://deeplearning101.twman.org' target='_blank'>Deep Learning 101</a> | <a href='https://www.facebook.com/groups/525579498272187/' target='_blank'>台灣人工智慧社團 FB</a> | <a href='https://www.youtube.com/c/DeepLearning101' target='_blank'>YouTube</a><br>
-                    <a href='https://blog.twman.org/2025/03/AIAgent.html' target='_blank'>那些 AI Agent 要踩的坑</a>：探討多種 AI 代理人工具的應用經驗與挑戰，分享實用經驗與工具推薦。<br>
-                    <a href='https://blog.twman.org/2024/08/LLM.html' target='_blank'>白話文手把手帶你科普 GenAI</a>：淺顯介紹生成式人工智慧核心概念，強調硬體資源和數據的重要性。<br>
-                    <a href='https://blog.twman.org/2024/09/LLM.html' target='_blank'>大型語言模型直接就打完收工？</a>：回顧 LLM 領域探索歷程，討論硬體升級對 AI 開發的重要性。<br>
-                    <a href='https://blog.twman.org/2024/07/RAG.html' target='_blank'>那些檢索增強生成要踩的坑</a>：探討 RAG 技術應用與挑戰，提供實用經驗分享和工具建議。<br>
-                    <a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>那些大型語言模型要踩的坑</a>：探討多種 LLM 工具的應用與挑戰，強調硬體資源的重要性。<br>
-                    <a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>Large Language Model，LLM</a>：探討 LLM 的發展與應用，強調硬體資源在開發中的關鍵作用。。<br>
-                    <a href='https://blog.twman.org/2024/11/diffusion.html' target='_blank'>ComfyUI + Stable Diffuision</a>：深入探討影像生成與分割技術的應用，強調硬體資源的重要性。<br>
-                    <a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>那些ASR和TTS可能會踩的坑</a>：探討 ASR 和 TTS 技術應用中的問題，強調數據質量的重要性。<br>
-                    <a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (Natural Language Processing, NLP) 踩的坑</a>：分享 NLP 領域的實踐經驗，強調數據質量對模型效果的影響。<br>
-                    <a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a>：分享語音處理領域的實務經驗，強調資料品質對模型效果的影響。<br>
-                    <a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PPOCRLabel來幫PaddleOCR做OCR的微調和標註</a><br>
-                    <a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a><br>
-                <a href='https://github.com/facebookresearch/denoiser' target='_blank'> Real Time Speech Enhancement in the Waveform Domain (Interspeech 2020)</a>""",
-    # examples=[
-    #     ["exampleAudio/15s_2020-03-27_sep1.wav"],
-    #     ["exampleAudio/13s_2020-03-27_sep2.wav"],
-    # ],
-)
 # Runs at import time: there is no `if __name__ == "__main__":` guard.
-demo.launch(debug=True, share=True)

 import os
 import gradio as gr
 import torch
 import torchaudio
 import numpy as np
 from denoiser.demucs import Demucs
 from pydub import AudioSegment
+import soundfile as sf
+import librosa
 modelpath = './denoiser/master64.th'
 def transcribe(file_upload, microphone):
     # New-side version of the denoising entry point: optionally converts an
     # MP3 input to WAV, runs the Demucs model on the audio and returns the
     # path of the MP3 result ("enhanced.mp3"). Raises ValueError when the
     # clip is longer than MAX_AUDIO_SECONDS.
     # The microphone path wins when it is not None; otherwise the upload is used.
     # NOTE(review): if both arguments are None, `file` is None and the
     # `.lower()` call below raises AttributeError - confirm whether callers
     # can pass None.
     file = microphone if microphone is not None else file_upload
+    # Unified audio preprocessing pipeline
+    def preprocess_audio(input_path, output_path):
+        # Use pydub for format conversion
+        audio = AudioSegment.from_file(input_path)
+        # Force mono + 16 kHz sample rate
+        if audio.channels > 1:
+            audio = audio.set_channels(1)
+        if audio.frame_rate != 16000:
+            audio = audio.set_frame_rate(16000)
+        # Export to a temporary WAV file
+        audio.export(output_path, format="wav")
+        return output_path
+    # Special handling for MP3
     # NOTE(review): only paths ending in ".mp3" go through preprocess_audio,
     # so other inputs are not resampled to 16 kHz here.
+    if file.lower().endswith(".mp3"):
+        temp_wav = "temp_input.wav"
+        preprocess_audio(file, temp_wav)
+        file = temp_wav
     # Load the model
     # NOTE(review): the network is rebuilt and its weights are re-read from
     # `modelpath` on every call.
     model = Demucs(hidden=64)
     state_dict = torch.load(modelpath, map_location='cpu')
     model.load_state_dict(state_dict)
+    model.eval()
     # Load the audio and force it to mono
+    x, sr = torchaudio.load(file)
     # Average over dim 0 only when it holds more than one entry.
     if x.shape[0] > 1:
         x = torch.mean(x, dim=0, keepdim=True)
+    # Audio length check
+    MAX_AUDIO_SECONDS = 600
+    if x.shape[1] / sr > MAX_AUDIO_SECONDS:
+        raise ValueError(f"音訊過長！限制：{MAX_AUDIO_SECONDS} 秒，當前：{x.shape[1]/sr:.1f} 秒")
     # Run denoising
     # `x[None]` adds a leading axis (presumably the batch axis the model
     # expects - TODO confirm); `[0]` removes it again from the output.
+    with torch.no_grad():
+        out = model(x[None])[0]
     # Post-processing
     # Divide by the peak only when the peak exceeds 1, so quieter output is
     # left unchanged and louder output is scaled back into [-1, 1].
     out = out / max(out.abs().max().item(), 1)
     # NOTE(review): 'enhanced.wav', 'enhanced.mp3' and 'temp_input.wav' are
     # fixed relative paths, so overlapping calls would share the same files -
     # confirm whether requests can run concurrently.
     torchaudio.save('enhanced.wav', out, sr)
+    # Convert the output to MP3
+    enhanced_mp3 = 'enhanced.mp3'
+    AudioSegment.from_wav('enhanced.wav').export(
+        enhanced_mp3,
+        format="mp3",
+        bitrate="256k"
+    )
+    # Clean up the temporary file
     # NOTE(review): this cleanup is not reached when an exception is raised
     # earlier (e.g. the length-check ValueError); there is no try/finally.
+    if os.path.exists("temp_input.wav"):
+        os.remove("temp_input.wav")
+    return enhanced_mp3
+# Important: work around Gradio's type-inference issue
 # Replaces the function's annotation dict so both parameters and the return
 # value are declared as `str`.
+transcribe.__annotations__ = {
+    "file_upload": str,
+    "microphone": str,
+    "return": str
+}
+demo = gr.Interface(
     # New-side UI definition: wires `transcribe` to one audio input and one
     # audio output, then launches the app.
     # NOTE(review): `transcribe` declares two positional parameters
     # (file_upload, microphone) but only one input component is listed
     # below - confirm how Gradio invokes `fn` in this configuration.
+    fn=transcribe,
+    inputs=[
+        gr.Audio(type="filepath", label="上傳音訊檔案", sources=["upload", "microphone"])
+    ],
+    outputs=[
+        gr.Audio(type="filepath", label="處理後音訊")
+    ],
+    live=True,
+    allow_flagging="never",
+    title="<h1>語音質檢/噪音去除 (語音增強)</h1>",
+    description="""<h2><a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D.</a> | <a href='https://blog.twman.org/p/deeplearning101.html' target='_blank'>手把手帶你一起踩AI坑</a><br></h2><br>
+                為了提升語音識別的效果，可以在識別前先進行噪音去除<br>
+                    <a href='https://github.com/Deep-Learning-101' target='_blank'>Deep Learning 101 Github</a> | <a href='http://deeplearning101.twman.org' target='_blank'>Deep Learning 101</a> | <a href='https://www.facebook.com/groups/525579498272187/' target='_blank'>台灣人工智慧社團 FB</a> | <a href='https://www.youtube.com/c/DeepLearning101' target='_blank'>YouTube</a><br>
+                    <a href='https://blog.twman.org/2025/03/AIAgent.html' target='_blank'>那些 AI Agent 要踩的坑</a>：探討多種 AI 代理人工具的應用經驗與挑戰，分享實用經驗與工具推薦。<br>
+                    <a href='https://blog.twman.org/2024/08/LLM.html' target='_blank'>白話文手把手帶你科普 GenAI</a>：淺顯介紹生成式人工智慧核心概念，強調硬體資源和數據的重要性。<br>
+                    <a href='https://blog.twman.org/2024/09/LLM.html' target='_blank'>大型語言模型直接就打完收工？</a>：回顧 LLM 領域探索歷程，討論硬體升級對 AI 開發的重要性。<br>
+                    <a href='https://blog.twman.org/2024/07/RAG.html' target='_blank'>那些檢索增強生成要踩的坑</a>：探討 RAG 技術應用與挑戰，提供實用經驗分享和工具建議。<br>
+                    <a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>那些大型語言模型要踩的坑</a>：探討多種 LLM 工具的應用與挑戰，強調硬體資源的重要性。<br>
+                    <a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>Large Language Model，LLM</a>：探討 LLM 的發展與應用，強調硬體資源在開發中的關鍵作用。。<br>
+                    <a href='https://blog.twman.org/2024/11/diffusion.html' target='_blank'>ComfyUI + Stable Diffuision</a>：深入探討影像生成與分割技術的應用，強調硬體資源的重要性。<br>
+                    <a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>那些ASR和TTS可能會踩的坑</a>：探討 ASR 和 TTS 技術應用中的問題，強調數據質量的重要性。<br>
+                    <a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (Natural Language Processing, NLP) 踩的坑</a>：分享 NLP 領域的實踐經驗，強調數據質量對模型效果的影響。<br>
+                    <a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a>：分享語音處理領域的實務經驗，強調資料品質對模型效果的影響。<br>
+                    <a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PPOCRLabel來幫PaddleOCR做OCR的微調和標註</a><br>
+                    <a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a><br>
+                <a href='https://github.com/facebookresearch/denoiser' target='_blank'> Real Time Speech Enhancement in the Waveform Domain (Interspeech 2020)</a>""",
+)
 # Runs at import time: there is no `if __name__ == "__main__":` guard.
+demo.launch(debug=True, share=True)
 # import os
 # import time
 #     "microphone": str,
 #     "return": str
 # }