import os import gradio as gr import torch import torchaudio import numpy as np from denoiser.demucs import Demucs from pydub import AudioSegment import soundfile as sf import librosa modelpath = './denoiser/master64.th' def transcribe(file_upload, microphone): file = microphone if microphone is not None else file_upload # 統一音訊預處理流程 def preprocess_audio(input_path, output_path): # 用 pydub 處理格式轉換 audio = AudioSegment.from_file(input_path) # 強制轉單聲道 + 16kHz 採樣率 if audio.channels > 1: audio = audio.set_channels(1) if audio.frame_rate != 16000: audio = audio.set_frame_rate(16000) # 導出為 WAV 暫存檔 audio.export(output_path, format="wav") return output_path # 處理 MP3 特殊流程 if file.lower().endswith(".mp3"): temp_wav = "temp_input.wav" preprocess_audio(file, temp_wav) file = temp_wav # 載入模型 model = Demucs(hidden=64) state_dict = torch.load(modelpath, map_location='cpu') model.load_state_dict(state_dict) model.eval() # 載入音訊並強制轉單聲道 x, sr = torchaudio.load(file) if x.shape[0] > 1: x = torch.mean(x, dim=0, keepdim=True) # 音訊長度檢查 MAX_AUDIO_SECONDS = 900 if x.shape[1] / sr > MAX_AUDIO_SECONDS: raise ValueError(f"音訊過長!限制:{MAX_AUDIO_SECONDS} 秒,當前:{x.shape[1]/sr:.1f} 秒") # 執行降噪 with torch.no_grad(): out = model(x[None])[0] # 後處理 out = out / max(out.abs().max().item(), 1) torchaudio.save('enhanced.wav', out, sr) # 轉 MP3 輸出 enhanced_mp3 = 'enhanced.mp3' AudioSegment.from_wav('enhanced.wav').export( enhanced_mp3, format="mp3", bitrate="256k" ) # 清理暫存檔 if os.path.exists("temp_input.wav"): os.remove("temp_input.wav") return enhanced_mp3 # 👇 重要:修正 Gradio 類型推導問題 transcribe.__annotations__ = { "file_upload": str, "microphone": str, "return": str } # 🎯 你提供的 description 內容(已轉為 HTML) description_html = """
上傳一段音檔 (支援 .mp3, .wav),為了提升語音識別的效果,可以在識別前先進行噪音去除