DeepLearning101 committed on
Commit
3cd1f60
·
verified ·
1 Parent(s): e8353f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -72
app.py CHANGED
@@ -1,29 +1,3 @@
1
- # import os
2
- # import time
3
- # import json
4
- # import gradio as gr
5
- # import torch
6
- # import torchaudio
7
- # import numpy as np
8
- # from denoiser.demucs import Demucs
9
- # from pydub import AudioSegment
10
-
11
- # modelpath = './denoiser/master64.th'
12
-
13
- # def transcribe(file_upload, microphone):
14
- # file = microphone if microphone is not None else file_upload
15
- # model = Demucs(hidden=64)
16
- # state_dict = torch.load(modelpath, map_location='cpu')
17
- # model.load_state_dict(state_dict)
18
- # demucs = model
19
- # x, sr = torchaudio.load(file)
20
- # out = demucs(x[None])[0]
21
- # out = out / max(out.abs().max().item(), 1)
22
- # torchaudio.save('enhanced.wav', out, sr)
23
- # enhanced = AudioSegment.from_wav('enhanced.wav') # 只有去完噪的需要降 bitrate 再做語音識別
24
- # enhanced.export('enhanced.wav', format="wav", bitrate="256k")
25
- # return "enhanced.wav"
26
-
27
  import os
28
  import time
29
  import json
@@ -33,67 +7,93 @@ import torchaudio
33
  import numpy as np
34
  from denoiser.demucs import Demucs
35
  from pydub import AudioSegment
36
- import soundfile as sf
37
- import librosa
38
 
39
  modelpath = './denoiser/master64.th'
40
 
41
  def transcribe(file_upload, microphone):
42
  file = microphone if microphone is not None else file_upload
43
-
44
- # 新增音訊預處理 → 統一格式
45
- def preprocess_audio(path):
46
- data, sr = sf.read(path)
47
-
48
- # 如果是雙聲道 → 轉單聲道
49
- if len(data.shape) > 1:
50
- data = data.mean(axis=1)
51
-
52
- # 如果不是 16kHz → 重採樣
53
- if sr != 16000:
54
- data = librosa.resample(data, orig_sr=sr, target_sr=16000)
55
- sr = 16000
56
-
57
- # 儲存為 WAV 供模型使用
58
- sf.write("enhanced.wav", data, sr)
59
- return "enhanced.wav"
60
-
61
- # 如果是 MP3,先轉成 WAV 再處理
62
- if file.lower().endswith(".mp3"):
63
- audio = AudioSegment.from_file(file)
64
- audio = audio.set_frame_rate(16000).set_channels(1) # 轉單聲道 + 16kHz
65
- audio.export("enhanced.wav", format="wav")
66
- file = "enhanced.wav"
67
- else:
68
- file = preprocess_audio(file)
69
-
70
  model = Demucs(hidden=64)
71
  state_dict = torch.load(modelpath, map_location='cpu')
72
  model.load_state_dict(state_dict)
73
- demucs = model.eval()
74
-
75
  x, sr = torchaudio.load(file)
76
- x = x[0:1] # 強制取第一個聲道(確保是單聲道)
 
 
 
 
 
77
 
78
- with torch.no_grad():
79
- out = demucs(x[None])[0]
 
 
 
 
 
 
 
 
 
80
 
81
- out = out / max(out.abs().max().item(), 1)
82
- torchaudio.save('enhanced_final.wav', out, sr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
- # 輸出 WAV 格式給前端播放
85
- enhanced = AudioSegment.from_wav('enhanced_final.wav')
86
- enhanced.export('enhanced_final.mp3', format="mp3", bitrate="256k")
87
 
88
- return "enhanced_final.mp3" # 回傳 MP3 更省空間
89
 
90
 
91
- # 👇 加上這一行,解決 Gradio schema 推導錯誤
92
- transcribe.__annotations__ = {
93
- "file_upload": str,
94
- "microphone": str,
95
- "return": str
96
- }
97
 
98
  demo = gr.Interface(
99
  fn=transcribe,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import time
3
  import json
 
7
  import numpy as np
8
  from denoiser.demucs import Demucs
9
  from pydub import AudioSegment
 
 
10
 
11
  modelpath = './denoiser/master64.th'
12
 
13
def transcribe(file_upload, microphone):
    """Denoise an audio clip with the Demucs model and return the enhanced WAV path.

    Args:
        file_upload: path of an uploaded audio file (may be None).
        microphone: path of a microphone recording; takes precedence when present.

    Returns:
        Path to 'enhanced.wav', the denoised audio file.
    """
    # Prefer the microphone recording when both inputs are provided.
    file = microphone if microphone is not None else file_upload

    # NOTE(review): the checkpoint is reloaded from disk on every call;
    # caching the model at module level would avoid the repeated load — confirm
    # Demucs keeps no per-call state before hoisting.
    model = Demucs(hidden=64)
    state_dict = torch.load(modelpath, map_location='cpu')
    model.load_state_dict(state_dict)
    demucs = model.eval()  # fix: inference mode (freezes dropout/batch-norm behavior)

    x, sr = torchaudio.load(file)

    # Fix: run inference without recording gradients — the original built an
    # unused autograd graph, wasting memory on every request.
    with torch.no_grad():
        out = demucs(x[None])[0]

    # Peak-normalize only when the signal would clip (divide by 1 otherwise).
    out = out / max(out.abs().max().item(), 1)
    torchaudio.save('enhanced.wav', out, sr)

    # Re-export via pydub. 'bitrate' is ignored for PCM WAV; kept for parity
    # with the original pipeline (original note: only the denoised audio needs a
    # lower bitrate before speech recognition).
    enhanced = AudioSegment.from_wav('enhanced.wav')
    enhanced.export('enhanced.wav', format="wav", bitrate="256k")
    return "enhanced.wav"
26
 
27
+ # import os
28
+ # import time
29
+ # import json
30
+ # import gradio as gr
31
+ # import torch
32
+ # import torchaudio
33
+ # import numpy as np
34
+ # from denoiser.demucs import Demucs
35
+ # from pydub import AudioSegment
36
+ # import soundfile as sf
37
+ # import librosa
38
 
39
+ # modelpath = './denoiser/master64.th'
40
+
41
+ # def transcribe(file_upload, microphone):
42
+ # file = microphone if microphone is not None else file_upload
43
+
44
+ # # 新增音訊預處理 → 統一格式
45
+ # def preprocess_audio(path):
46
+ # data, sr = sf.read(path)
47
+
48
+ # # 如果是雙聲道 → 轉單聲道
49
+ # if len(data.shape) > 1:
50
+ # data = data.mean(axis=1)
51
+
52
+ # # 如果不是 16kHz → 重採樣
53
+ # if sr != 16000:
54
+ # data = librosa.resample(data, orig_sr=sr, target_sr=16000)
55
+ # sr = 16000
56
+
57
+ # # 儲存為 WAV 供模型使用
58
+ # sf.write("enhanced.wav", data, sr)
59
+ # return "enhanced.wav"
60
+
61
+ # # 如果是 MP3,先轉成 WAV 再處理
62
+ # if file.lower().endswith(".mp3"):
63
+ # audio = AudioSegment.from_file(file)
64
+ # audio = audio.set_frame_rate(16000).set_channels(1) # 轉單聲道 + 16kHz
65
+ # audio.export("enhanced.wav", format="wav")
66
+ # file = "enhanced.wav"
67
+ # else:
68
+ # file = preprocess_audio(file)
69
+
70
+ # model = Demucs(hidden=64)
71
+ # state_dict = torch.load(modelpath, map_location='cpu')
72
+ # model.load_state_dict(state_dict)
73
+ # demucs = model.eval()
74
+
75
+ # x, sr = torchaudio.load(file)
76
+ # x = x[0:1] # 強制取第一個聲道(確保是單聲道)
77
+
78
+ # with torch.no_grad():
79
+ # out = demucs(x[None])[0]
80
+
81
+ # out = out / max(out.abs().max().item(), 1)
82
+ # torchaudio.save('enhanced_final.wav', out, sr)
83
 
84
+ # # 輸出 WAV 格式給前端播放
85
+ # enhanced = AudioSegment.from_wav('enhanced_final.wav')
86
+ # enhanced.export('enhanced_final.mp3', format="mp3", bitrate="256k")
87
 
88
+ # return "enhanced_final.mp3" # 回傳 MP3 更省空間
89
 
90
 
91
+ # # 👇 加上這一行,解決 Gradio schema 推導錯誤
92
+ # transcribe.__annotations__ = {
93
+ # "file_upload": str,
94
+ # "microphone": str,
95
+ # "return": str
96
+ # }
97
 
98
  demo = gr.Interface(
99
  fn=transcribe,