GavinHuang committed
Commit f374409 · Parent: 2b5f9bc

fix: improve audio processing in transcribe function with longer chunk duration and normalization

Files changed (1): app.py (+24 −23)
app.py CHANGED
@@ -54,7 +54,6 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
         return state, state, audio_buffer, last_processed_time
 
     print(f"Received audio input of type: {type(audio)}")
-
     if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[1], np.ndarray):
         sample_rate, audio_data = audio
         print(f"Sample rate: {sample_rate}, Audio shape: {audio_data.shape}")
@@ -67,15 +66,16 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
     total_duration = total_samples / sample_rate
     print(f"Total buffered duration: {total_duration:.2f}s")
 
-    # Process 3-second chunks with 1-second step size (2-second overlap)
-    chunk_duration = 3.0  # seconds
-    step_size = 1.0  # seconds
-    min_samples = int(chunk_duration * 16000)  # 3s at 16kHz
-
+    # Process 5-second chunks with 2-second step size (3-second overlap)
+    # Using longer chunks usually helps with transcription accuracy
+    chunk_duration = 5.0  # seconds (increased from 3.0)
+    step_size = 2.0  # seconds (increased from 1.0)
+    # min_samples = int(chunk_duration * 16000)  # 5s at 16kHz
+
     if total_duration < chunk_duration:
         print(f"Buffering audio, total duration: {total_duration:.2f}s")
         return state, state, audio_buffer, last_processed_time
-
+
     try:
         # Concatenate buffered chunks
         full_audio = np.concatenate(audio_buffer)
@@ -88,7 +88,12 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
         else:
             full_audio = full_audio.astype(float)
 
-        # Process 3-second chunks
+        # Normalize audio (helps with consistent volume levels)
+        if np.abs(full_audio).max() > 0:
+            full_audio = full_audio / np.abs(full_audio).max() * 0.9
+            print("Audio normalized to improve transcription")
+
+        # Process chunks
         new_state = state
         current_time = last_processed_time
         total_samples_16k = len(full_audio)
@@ -107,6 +112,7 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
             sf.write(temp_file, chunk, samplerate=16000)
 
             # Transcribe
+            print(f"Transcribing chunk of duration {chunk_duration}s...")
             hypothesis = model.transcribe([temp_file])[0]
             transcription = hypothesis.text
             print(f"Transcription: {transcription}")
@@ -181,10 +187,14 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
                 label="Select ASR Model"
             )
         with gr.Column(scale=1):
-            load_button = gr.Button("Load Selected Model")
+            load_button = gr.Button("Load Selected Model", elem_id="load-button", elem_classes=["btn-blue"])
 
     # Status indicator for model loading
-    model_status = gr.Textbox(value=f"Current model: {current_model_name}", label="Model Status")
+    model_status = gr.Textbox(
+        value=f"Current model: {current_model_name}",
+        label="Model Status",
+        container=False
+    )
 
     # Create tabs for real-time and file-based transcription
     with gr.Tabs():
@@ -199,7 +209,7 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
                         label="Speak into your microphone"
                     )
 
-                    clear_btn = gr.Button("Clear Transcript")
+                    # clear_btn = gr.Button("Clear Transcript")
 
                 with gr.Column(scale=3):
                     text_output = gr.Textbox(
@@ -212,7 +222,7 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
                         placeholder="Real-time results will appear here...",
                         lines=2
                     )
-            # File-based transcription tab
+            # File-based transcription tab
             with gr.TabItem("File Transcription"):
                 with gr.Row():
                     with gr.Column(scale=2):
@@ -258,7 +268,8 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
         inputs=[model_dropdown],
        outputs=[model_status, audio_buffer, last_processed_time]
    )
-    # Handle the audio stream for real-time transcription
+    # Handle the audio stream for real-time transcription
+    streaming_text = gr.State(value="")
    audio_input.stream(
        fn=transcribe,
        inputs=[audio_input, model_dropdown, state, audio_buffer, last_processed_time],
@@ -272,16 +283,6 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
        outputs=[file_transcription]
    )
 
-    # Clear the transcription
-    def clear_transcription():
-        return "", "", None, 0
-
-    clear_btn.click(
-        fn=clear_transcription,
-        inputs=[],
-        outputs=[text_output, streaming_text, audio_buffer, last_processed_time]
-    )
-
    # Update the main text output when the state changes
    state.change(
        fn=lambda s: s,
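
For reference, the audio-processing change boils down to the sketch below: peak-normalize the buffered audio, then walk a 5-second window across it in 2-second steps, so consecutive windows overlap by 3 seconds. This is a minimal reconstruction under assumptions, not the app's exact code: SAMPLE_RATE, the helper names, and the driver loop are invented for illustration, and the NeMo model call is replaced by a print.

import numpy as np

SAMPLE_RATE = 16_000     # app.py writes 16 kHz WAVs for the model
CHUNK_DURATION = 5.0     # seconds per window (3.0 before this commit)
STEP_SIZE = 2.0          # seconds the window advances (1.0 before)

def normalize_peak(audio, target=0.9):
    # Scale so the loudest sample sits at `target`; leave silence alone
    peak = np.abs(audio).max()
    return audio / peak * target if peak > 0 else audio

def iter_windows(full_audio, start_time=0.0):
    # Yield (start_seconds, window) pairs; consecutive windows share
    # CHUNK_DURATION - STEP_SIZE = 3.0 seconds of audio
    chunk = int(CHUNK_DURATION * SAMPLE_RATE)
    step = int(STEP_SIZE * SAMPLE_RATE)
    start = int(start_time * SAMPLE_RATE)
    while start + chunk <= len(full_audio):
        yield start / SAMPLE_RATE, full_audio[start:start + chunk]
        start += step

if __name__ == "__main__":
    audio = normalize_peak(np.random.randn(12 * SAMPLE_RATE))
    for t, window in iter_windows(audio):
        # app.py writes `window` to a temp WAV and calls model.transcribe
        print(f"would transcribe {CHUNK_DURATION:.0f}s window at {t:.1f}s")

The overlap means each stretch of speech appears in more than one window, which helps downstream stitching recover words cut at window edges; the cost is one model call every 2 seconds of audio instead of every 5.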
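
On the UI side, the commit re-adds streaming_text as a gr.State and keeps the audio_input.stream(...) wiring while dropping the clear button. A minimal sketch of that streaming pattern, assuming Gradio 4.x (the sources= parameter) and a stubbed transcribe handler in place of the real NeMo-backed one:

import gradio as gr

def transcribe(audio, state):
    # Stub: the real handler buffers `audio` and runs ASR over it
    if audio is not None:
        state = (state or "") + " [chunk]"
    return state, state

with gr.Blocks() as demo:
    state = gr.State(value="")
    audio_input = gr.Audio(sources=["microphone"], streaming=True,
                           label="Speak into your microphone")
    text_output = gr.Textbox(label="Transcript", lines=10)
    # .stream() fires repeatedly with short audio chunks while recording,
    # so the handler threads its running transcript through gr.State
    audio_input.stream(fn=transcribe,
                       inputs=[audio_input, state],
                       outputs=[state, text_output])

demo.launch()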