|
import functools
import os
import tempfile

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch
import torchaudio
from pydub import AudioSegment

from denoiser.demucs import Demucs
|
|
|
# Location of the Demucs checkpoint; resolved relative to the current
# working directory when the weights are loaded.
modelpath = "./denoiser/master64.th"
|
|
|
# Sample rate the mp3 pre-processing step targets; every other input is
# resampled to the same rate so the model always sees consistent audio.
# NOTE(review): assumes the master64 checkpoint is a 16 kHz model - confirm.
_MODEL_SAMPLE_RATE = 16000

# Longest clip (in seconds) that will be accepted for denoising.
_MAX_AUDIO_SECONDS = 900


@functools.lru_cache(maxsize=1)
def _load_denoiser(path):
    """Build the Demucs model and load its weights from *path*, once per path."""
    model = Demucs(hidden=64)
    model.load_state_dict(torch.load(path, map_location='cpu'))
    model.eval()
    return model


def _to_mono_wav(input_path, output_path):
    """Decode *input_path* with pydub and export it as a mono WAV at the model rate."""
    audio = AudioSegment.from_file(input_path)
    if audio.channels > 1:
        audio = audio.set_channels(1)
    if audio.frame_rate != _MODEL_SAMPLE_RATE:
        audio = audio.set_frame_rate(_MODEL_SAMPLE_RATE)
    audio.export(output_path, format="wav")
    return output_path


def transcribe(file_upload, microphone=None):
    """Denoise an audio file with Demucs and return the path of the result.

    Despite its name this function performs speech enhancement, not
    transcription.

    Args:
        file_upload: Path of an uploaded audio file, or None.
        microphone: Path of a microphone recording, or None. Takes priority
            over ``file_upload`` when given. Optional so the function can be
            driven by a single audio input component.

    Returns:
        The path of the enhanced MP3 (``'enhanced.mp3'``, overwritten on
        every call), or None when no input audio was supplied.

    Raises:
        ValueError: If the audio is longer than the allowed maximum duration.
    """
    file = microphone if microphone is not None else file_upload
    if not file:
        # Nothing to process (e.g. the input widget was cleared).
        return None

    # Intermediate files live in a private directory that is removed even
    # when processing fails part-way through.
    with tempfile.TemporaryDirectory() as workdir:
        if file.lower().endswith(".mp3"):
            file = _to_mono_wav(file, os.path.join(workdir, "temp_input.wav"))

        x, sr = torchaudio.load(file)
        if x.shape[0] > 1:
            # Mix all channels down to a single one.
            x = torch.mean(x, dim=0, keepdim=True)

        # Reject over-long audio before doing any expensive work.
        duration = x.shape[1] / sr
        if duration > _MAX_AUDIO_SECONDS:
            raise ValueError(f"音訊過長!限制:{_MAX_AUDIO_SECONDS} 秒,當前:{duration:.1f} 秒")

        if sr != _MODEL_SAMPLE_RATE:
            x = torchaudio.functional.resample(x, sr, _MODEL_SAMPLE_RATE)
            sr = _MODEL_SAMPLE_RATE

        model = _load_denoiser(modelpath)
        with torch.no_grad():
            # Add a batch dimension for the model and drop it from the output.
            out = model(x[None])[0]

        # Scale down only when the peak exceeds 1; quieter output is untouched.
        out = out / max(out.abs().max().item(), 1)

        enhanced_wav = os.path.join(workdir, 'enhanced.wav')
        torchaudio.save(enhanced_wav, out, sr)

        enhanced_mp3 = 'enhanced.mp3'
        AudioSegment.from_wav(enhanced_wav).export(
            enhanced_mp3,
            format="mp3",
            bitrate="256k",
        )

    return enhanced_mp3
|
|
|
|
|
# Declare both parameters and the return value of transcribe as str.
transcribe.__annotations__ = dict.fromkeys(
    ("file_upload", "microphone", "return"),
    str,
)
|
|
|
|
|
# HTML shown above the interface: title, usage hint, a navigation bar of
# related sites (links separated by " | ") and a list of related articles.
description_html = """
<h1 align='center'><a href='https://www.twman.org/AI/ASR/SpeechEnhancement' target='_blank'>中文語音質檢/噪音去除 (語音增強)</a></h1>
<p align='center'><b>上傳一段音檔 (支援 .mp3, .wav),為了提升語音識別的效果,可以在識別前先進行噪音去除</b></p>
<div align='center'>

<a href='https://deep-learning-101.github.io' target='_blank'>deep-learning-101.github.io</a> |
<a href='https://www.twman.org/AI' target='_blank'> AI </a> |
<a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D.</a> |
<a href='https://blog.twman.org/p/deeplearning101.html' target='_blank'>手把手帶你一起踩AI坑</a> |
<a href='https://github.com/Deep-Learning-101' target='_blank'>GitHub</a> |
<a href='http://deeplearning101.twman.org' target='_blank'>Deep Learning 101</a> |
<a href='https://www.youtube.com/c/DeepLearning101' target='_blank'>YouTube</a>
</div>
<br>
📘 相關技術文章:
<ul>
<li><a href='https://blog.twman.org/2025/04/AI-Robot.html' target='_blank'>AI 陪伴機器人:2025 趨勢分析技術突破、市場潛力與未來展望</a> | <a href='https://blog.twman.org/2025/04/FinanceGenAI.html' target='_blank'>金融科技新浪潮:生成式 AI (GenAI) 應用場景、效益與導入挑戰</a><br></li>
<li><a href='https://blog.twman.org/2025/03/AIAgent.html' target='_blank'>避開 AI Agent 開發陷阱:常見問題、挑戰與解決方案 (那些 AI Agent 實戰踩過的坑)</a>:探討多種 AI Agent 工具的應用經驗與挑戰</li>
<li><a href='https://blog.twman.org/2024/08/LLM.html' target='_blank'>白話文手把手帶你科普 GenAI</a>:淺顯介紹生成式人工智慧核心概念</li>
<li><a href='https://blog.twman.org/2024/09/LLM.html' target='_blank'>大型語言模型直接就打完收工?</a>:回顧 LLM 領域探索歷程</li>
<li><a href='https://blog.twman.org/2024/07/RAG.html' target='_blank'>檢索增強生成不是萬靈丹:挑戰與優化技巧</a>:探討 RAG 技術應用與挑戰</li>
<li><a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>大型語言模型 (LLM) 入門完整指南:原理、應用與未來 (2025 版)</a>:探討多種 LLM 工具的應用與挑戰</li>
<li><a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>解析探索大型語言模型:模型發展歷史、訓練及微調技術的 VRAM 估算</a>:探討 LLM 的發展與應用</li>
<li><a href='https://blog.twman.org/2024/11/diffusion.html' target='_blank'>Diffusion Model 完全解析:從原理、應用到實作 (AI 圖像生成)</a>:深入探討影像生成與分割技術的應用</li>
<li><a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>ASR/TTS 開發避坑指南:語音辨識與合成的常見挑戰與對策</a>:探討 ASR 和 TTS 技術應用中的問題</li>
<li><a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (NLP) 踩的坑</a>:分享 NLP 領域的實踐經驗</li>
<li><a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a>:分享語音處理領域的實務經驗</li>
<li><a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PPOCRLabel來幫PaddleOCR做OCR的微調和標註</a></li>
<li><a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a></li>
</ul>
<br>
"""
|
|
|
|
|
# One audio widget serves both uploads and microphone recordings; with
# type="filepath" the handler exchanges audio as file paths.
_audio_input = gr.Audio(
    type="filepath",
    label="上傳音訊檔案",
    sources=["upload", "microphone"],
)
_audio_output = gr.Audio(type="filepath", label="處理後音訊")

# Assemble the web UI around the denoising handler.
demo = gr.Interface(
    fn=transcribe,
    inputs=[_audio_input],
    outputs=[_audio_output],
    live=True,
    allow_flagging="never",
    description=description_html,
)

# Start the server with debug output and a public share link requested.
demo.launch(debug=True, share=True)