DeepLearning101 committed on
Commit
4baf7c2
·
verified ·
1 Parent(s): 6dbc581

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -6
app.py CHANGED
@@ -1,3 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import time
3
  import json
@@ -7,23 +33,60 @@ import torchaudio
7
  import numpy as np
8
  from denoiser.demucs import Demucs
9
  from pydub import AudioSegment
 
 
10
 
11
  modelpath = './denoiser/master64.th'
12
 
13
  def transcribe(file_upload, microphone):
14
  file = microphone if microphone is not None else file_upload
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  model = Demucs(hidden=64)
16
  state_dict = torch.load(modelpath, map_location='cpu')
17
  model.load_state_dict(state_dict)
18
- demucs = model
 
19
  x, sr = torchaudio.load(file)
20
- out = demucs(x[None])[0]
 
 
 
 
21
  out = out / max(out.abs().max().item(), 1)
22
- torchaudio.save('enhanced.wav', out, sr)
23
- enhanced = AudioSegment.from_wav('enhanced.wav') # 只有去完噪的需要降 bitrate 再做語音識別
24
- enhanced.export('enhanced.wav', format="wav", bitrate="256k")
25
- return "enhanced.wav"
 
26
 
 
 
27
  demo = gr.Interface(
28
  fn=transcribe,
29
  inputs=[
 
1
+ # import os
2
+ # import time
3
+ # import json
4
+ # import gradio as gr
5
+ # import torch
6
+ # import torchaudio
7
+ # import numpy as np
8
+ # from denoiser.demucs import Demucs
9
+ # from pydub import AudioSegment
10
+
11
+ # modelpath = './denoiser/master64.th'
12
+
13
+ # def transcribe(file_upload, microphone):
14
+ # file = microphone if microphone is not None else file_upload
15
+ # model = Demucs(hidden=64)
16
+ # state_dict = torch.load(modelpath, map_location='cpu')
17
+ # model.load_state_dict(state_dict)
18
+ # demucs = model
19
+ # x, sr = torchaudio.load(file)
20
+ # out = demucs(x[None])[0]
21
+ # out = out / max(out.abs().max().item(), 1)
22
+ # torchaudio.save('enhanced.wav', out, sr)
23
+ # enhanced = AudioSegment.from_wav('enhanced.wav') # 只有去完噪的需要降 bitrate 再做語音識別
24
+ # enhanced.export('enhanced.wav', format="wav", bitrate="256k")
25
+ # return "enhanced.wav"
26
+
27
  import os
28
  import time
29
  import json
 
33
  import numpy as np
34
  from denoiser.demucs import Demucs
35
  from pydub import AudioSegment
36
+ import soundfile as sf
37
+ import librosa
38
 
39
  modelpath = './denoiser/master64.th'
40
 
41
  def transcribe(file_upload, microphone):
42
  file = microphone if microphone is not None else file_upload
43
+
44
+ # 新增音訊預處理 → 統一格式
45
+ def preprocess_audio(path):
46
+ data, sr = sf.read(path)
47
+
48
+ # 如果是雙聲道 → 轉單聲道
49
+ if len(data.shape) > 1:
50
+ data = data.mean(axis=1)
51
+
52
+ # 如果不是 16kHz → 重採樣
53
+ if sr != 16000:
54
+ data = librosa.resample(data, orig_sr=sr, target_sr=16000)
55
+ sr = 16000
56
+
57
+ # 儲存為 WAV 供模型使用
58
+ sf.write("enhanced.wav", data, sr)
59
+ return "enhanced.wav"
60
+
61
+ # 如果是 MP3,先轉成 WAV 再處理
62
+ if file.lower().endswith(".mp3"):
63
+ audio = AudioSegment.from_file(file)
64
+ audio = audio.set_frame_rate(16000).set_channels(1) # 轉單聲道 + 16kHz
65
+ audio.export("enhanced.wav", format="wav")
66
+ file = "enhanced.wav"
67
+ else:
68
+ file = preprocess_audio(file)
69
+
70
  model = Demucs(hidden=64)
71
  state_dict = torch.load(modelpath, map_location='cpu')
72
  model.load_state_dict(state_dict)
73
+ demucs = model.eval()
74
+
75
  x, sr = torchaudio.load(file)
76
+ x = x[0:1] # 強制取第一個聲道(確保是單聲道)
77
+
78
+ with torch.no_grad():
79
+ out = demucs(x[None])[0]
80
+
81
  out = out / max(out.abs().max().item(), 1)
82
+ torchaudio.save('enhanced_final.wav', out, sr)
83
+
84
+ # 輸出 WAV 格式給前端播放
85
+ enhanced = AudioSegment.from_wav('enhanced_final.wav')
86
+ enhanced.export('enhanced_final.mp3', format="mp3", bitrate="256k")
87
 
88
+ return "enhanced_final.mp3" # 回傳 MP3 更省空間
89
+
90
  demo = gr.Interface(
91
  fn=transcribe,
92
  inputs=[