File size: 7,187 Bytes
06fc5c8
 
00885ea
 
 
 
 
 
 
0075f67
 
00885ea
 
 
631422a
 
0075f67
 
 
631422a
 
 
 
 
 
 
 
 
 
3cd1f60
 
631422a
 
 
3cd1f60
631422a
3cd1f60
4baf7c2
3cd1f60
 
 
 
 
 
 
 
 
 
 
4baf7c2
3cd1f60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4baf7c2
3cd1f60
 
 
00885ea
3cd1f60
e8353f3
 
3cd1f60
 
 
 
 
 
e8353f3
00885ea
06fc5c8
 
2e23895
 
 
 
06fc5c8
258d287
6dbc581
e86bd31
b8f99d0
 
 
 
 
 
 
 
 
 
 
 
 
 
06fc5c8
e8353f3
 
 
 
06fc5c8
 
5d0b0e8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os
import time
import json
import gradio as gr
import torch
import torchaudio
import numpy as np
from denoiser.demucs import Demucs
from pydub import AudioSegment

# Filesystem path of the pretrained Demucs checkpoint; transcribe() passes it
# to torch.load() and feeds the result to Demucs.load_state_dict().
modelpath = './denoiser/master64.th'

# Cache of ready-to-use denoiser models keyed by checkpoint path, so the
# checkpoint is read from disk once instead of on every request.
_DENOISER_CACHE = {}


def _get_denoiser():
    """Return the Demucs denoiser in eval mode, loading the checkpoint once."""
    model = _DENOISER_CACHE.get(modelpath)
    if model is None:
        model = Demucs(hidden=64)
        state_dict = torch.load(modelpath, map_location='cpu')
        model.load_state_dict(state_dict)
        # Inference only: switch off training-time behaviour.
        model.eval()
        _DENOISER_CACHE[modelpath] = model
    return model


def transcribe(file_upload, microphone=None):
    """Denoise an audio file with Demucs and return the enhanced file path.

    Args:
        file_upload: Path of an uploaded audio file, or None.
        microphone: Path of a microphone recording, or None. When given it
            takes precedence over ``file_upload``. It defaults to None so the
            function can be called with a single argument, which is what a
            Gradio interface with a single audio input does.

    Returns:
        The string "enhanced.wav": the path (relative to the working
        directory) of the enhanced audio written by this call.

    Raises:
        gr.Error: If neither ``file_upload`` nor ``microphone`` is provided.
    """
    file = microphone if microphone is not None else file_upload
    if file is None:
        raise gr.Error("No audio provided: upload a file or record from the microphone.")

    # Load the model (cached after the first call).
    model = _get_denoiser()

    # Load the audio and force it to mono by averaging all channels.
    x, sr = torchaudio.load(file, channels_first=True)
    if x.shape[0] > 1:
        x = torch.mean(x, dim=0, keepdim=True)

    # Resample to the rate the model declares, when it exposes one; if the
    # attribute is missing the audio is passed through at its original rate.
    target_sr = getattr(model, "sample_rate", sr)
    if sr != target_sr:
        x = torchaudio.functional.resample(x, sr, target_sr)
        sr = target_sr

    # Run the denoiser without tracking gradients; x[None] adds the batch
    # dimension and [0] removes it again.
    with torch.no_grad():
        out = model(x[None])[0]

    # Post-processing: scale down only when the peak exceeds 1 to avoid clipping.
    out = out / max(out.abs().max().item(), 1)
    torchaudio.save('enhanced.wav', out, sr)

    # Re-export through pydub with a 256k bitrate setting (intended for
    # speech-recognition use only).
    enhanced = AudioSegment.from_wav('enhanced.wav')
    enhanced.export('enhanced.wav', format="wav", bitrate="256k")

    return "enhanced.wav"

# import os
# import time
# import json
# import gradio as gr
# import torch
# import torchaudio
# import numpy as np
# from denoiser.demucs import Demucs
# from pydub import AudioSegment
# import soundfile as sf
# import librosa

# modelpath = './denoiser/master64.th'

# def transcribe(file_upload, microphone):
#     file = microphone if microphone is not None else file_upload

#     # 新增音訊預處理 → 統一格式
#     def preprocess_audio(path):
#         data, sr = sf.read(path)
        
#         # 如果是雙聲道 → 轉單聲道
#         if len(data.shape) > 1:
#             data = data.mean(axis=1)

#         # 如果不是 16kHz → 重採樣
#         if sr != 16000:
#             data = librosa.resample(data, orig_sr=sr, target_sr=16000)
#             sr = 16000

#         # 儲存為 WAV 供模型使用
#         sf.write("enhanced.wav", data, sr)
#         return "enhanced.wav"

#     # 如果是 MP3,先轉成 WAV 再處理
#     if file.lower().endswith(".mp3"):
#         audio = AudioSegment.from_file(file)
#         audio = audio.set_frame_rate(16000).set_channels(1)  # 轉單聲道 + 16kHz
#         audio.export("enhanced.wav", format="wav")
#         file = "enhanced.wav"
#     else:
#         file = preprocess_audio(file)

#     model = Demucs(hidden=64)
#     state_dict = torch.load(modelpath, map_location='cpu')
#     model.load_state_dict(state_dict)
#     demucs = model.eval()

#     x, sr = torchaudio.load(file)
#     x = x[0:1]  # 強制取第一個聲道(確保是單聲道)

#     with torch.no_grad():
#         out = demucs(x[None])[0]

#     out = out / max(out.abs().max().item(), 1)
#     torchaudio.save('enhanced_final.wav', out, sr)

#     # 輸出 WAV 格式給前端播放
#     enhanced = AudioSegment.from_wav('enhanced_final.wav')
#     enhanced.export('enhanced_final.mp3', format="mp3", bitrate="256k")

#     return "enhanced_final.mp3"  # 回傳 MP3 更省空間


# # 👇 加上這一行,解決 Gradio schema 推導錯誤
# transcribe.__annotations__ = {
#     "file_upload": str,
#     "microphone": str,
#     "return": str
# }

# Gradio UI definition: a single audio input (file upload or microphone,
# delivered to the callback as a file path) wired to transcribe(), and a
# single audio output that plays the file path the callback returns.
# Flagging is disabled; the example clips below are left commented out.
demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath", label="語音質檢原始音檔", sources=["upload", "microphone"])  # explicitly specify the input sources
    ],
    outputs=[
        gr.Audio(type="filepath", label="Output")  # keep the outputs in list form
    ],
    title="<h1>語音質檢/噪音去除 (語音增強)</h1>",
    description="""<h2><a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D.</a> | <a href='https://blog.twman.org/p/deeplearning101.html' target='_blank'>手把手帶你一起踩AI坑</a><br></h2><br>
                為了提升語音識別的效果,可以在識別前先進行噪音去除<br>
                    <a href='https://github.com/Deep-Learning-101' target='_blank'>Deep Learning 101 Github</a> | <a href='http://deeplearning101.twman.org' target='_blank'>Deep Learning 101</a> | <a href='https://www.facebook.com/groups/525579498272187/' target='_blank'>台灣人工智慧社團 FB</a> | <a href='https://www.youtube.com/c/DeepLearning101' target='_blank'>YouTube</a><br>
                    <a href='https://blog.twman.org/2025/03/AIAgent.html' target='_blank'>那些 AI Agent 要踩的坑</a>:探討多種 AI 代理人工具的應用經驗與挑戰,分享實用經驗與工具推薦。<br>
                    <a href='https://blog.twman.org/2024/08/LLM.html' target='_blank'>白話文手把手帶你科普 GenAI</a>:淺顯介紹生成式人工智慧核心概念,強調硬體資源和數據的重要性。<br>
                    <a href='https://blog.twman.org/2024/09/LLM.html' target='_blank'>大型語言模型直接就打完收工?</a>:回顧 LLM 領域探索歷程,討論硬體升級對 AI 開發的重要性。<br>
                    <a href='https://blog.twman.org/2024/07/RAG.html' target='_blank'>那些檢索增強生成要踩的坑</a>:探討 RAG 技術應用與挑戰,提供實用經驗分享和工具建議。<br>
                    <a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>那些大型語言模型要踩的坑</a>:探討多種 LLM 工具的應用與挑戰,強調硬體資源的重要性。<br>
                    <a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>Large Language Model,LLM</a>:探討 LLM 的發展與應用,強調硬體資源在開發中的關鍵作用。。<br>
                    <a href='https://blog.twman.org/2024/11/diffusion.html' target='_blank'>ComfyUI + Stable Diffuision</a>:深入探討影像生成與分割技術的應用,強調硬體資源的重要性。<br>
                    <a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>那些ASR和TTS可能會踩的坑</a>:探討 ASR 和 TTS 技術應用中的問題,強調數據質量的重要性。<br>
                    <a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (Natural Language Processing, NLP) 踩的坑</a>:分享 NLP 領域的實踐經驗,強調數據質量對模型效果的影響。<br>
                    <a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a>:分享語音處理領域的實務經驗,強調資料品質對模型效果的影響。<br>
                    <a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PPOCRLabel來幫PaddleOCR做OCR的微調和標註</a><br>
                    <a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a><br>                  
                <a href='https://github.com/facebookresearch/denoiser' target='_blank'> Real Time Speech Enhancement in the Waveform Domain (Interspeech 2020)</a>""",
    allow_flagging="never",
    # examples=[
    #     ["exampleAudio/15s_2020-03-27_sep1.wav"],
    #     ["exampleAudio/13s_2020-03-27_sep2.wav"],
    # ],
)

# Start the app at import/run time with debug and share both enabled.
demo.launch(debug=True, share=True)