GavinHuang committed
Commit b407959 · Parent: 8505a8f

fix: enhance audio processing in transcribe function with improved buffering and chunk handling

Files changed (1): app.py (+88 -32)
app.py CHANGED
@@ -24,50 +24,105 @@ def load_model():
     return model
 
 @spaces.GPU(duration=120)
-def transcribe(audio, state=""):
+def transcribe(audio, state="", audio_buffer=None, last_processed_time=0):
     # Load the model inside the GPU worker process
     import numpy as np
     import soundfile as sf
     import librosa
     import os
     model = load_model()
+    if audio_buffer is None:
+        audio_buffer = []
 
     if audio is None or isinstance(audio, int):
         print(f"Skipping invalid audio input: {type(audio)}")
-        return state, state
+        return state, state, audio_buffer, last_processed_time
+
     print(f"Received audio input of type: {type(audio)}")
-    print(f"Audio shape: {audio.shape if isinstance(audio, np.ndarray) else 'N/A'}")
 
     if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[1], np.ndarray):
-        # Handle tuple of (sample_rate, audio_array)
-        print(f"Tuple contents: {audio}")
         sample_rate, audio_data = audio
+        print(f"Sample rate: {sample_rate}, Audio shape: {audio_data.shape}")
+
+        # Append the incoming chunk to the buffer
+        audio_buffer.append(audio_data)
+
+        # Total buffered duration in seconds
+        total_samples = sum(arr.shape[0] for arr in audio_buffer)
+        total_duration = total_samples / sample_rate
+        print(f"Total buffered duration: {total_duration:.2f}s")
+
+        # Process 3-second windows with a 1-second step (2-second overlap)
+        chunk_duration = 3.0  # seconds
+        step_size = 1.0  # seconds
+
+        if total_duration < chunk_duration:
+            print(f"Buffering audio, total duration: {total_duration:.2f}s")
+            return state, state, audio_buffer, last_processed_time
+
         try:
-            # Resample to 16kHz for NeMo
+            # Concatenate buffered chunks
+            full_audio = np.concatenate(audio_buffer)
+
+            # Resample to 16kHz for NeMo if needed
             if sample_rate != 16000:
                 print(f"Resampling from {sample_rate}Hz to 16000Hz")
-                audio_data = librosa.resample(audio_data.astype(float), orig_sr=sample_rate, target_sr=16000)
-            # Save to temporary WAV file
-            temp_file = "temp_audio.wav"
-            sf.write(temp_file, audio_data, samplerate=16000)
-            print(f"Processing temporary audio file: {temp_file}")
-
-            # Transcribe and extract only the text (string)
-            hypothesis = model.transcribe([temp_file])[0]
-            print(f"Hypothesis: {hypothesis}")
-            transcription = hypothesis.text  # Extract the text attribute (string)
-            print(f"Transcription: {transcription}")
-
-            os.remove(temp_file)  # Clean up
-            print("Temporary file removed.")
+                full_audio = librosa.resample(full_audio.astype(float), orig_sr=sample_rate, target_sr=16000)
+                sample_rate = 16000
+            else:
+                full_audio = full_audio.astype(float)
+
+            # Walk the buffered audio in overlapping windows
+            new_state = state
+            current_time = last_processed_time
+            total_samples_16k = len(full_audio)
+
+            while current_time + chunk_duration <= total_duration:
+                start_sample = int(current_time * sample_rate)
+                end_sample = int((current_time + chunk_duration) * sample_rate)
+                if end_sample > total_samples_16k:
+                    break
+
+                chunk = full_audio[start_sample:end_sample]
+                print(f"Processing chunk from {current_time:.2f}s to {current_time + chunk_duration:.2f}s")
+
+                # Save to a temporary WAV file
+                temp_file = "temp_audio.wav"
+                sf.write(temp_file, chunk, samplerate=16000)
+
+                # Transcribe and extract only the text (string)
+                hypothesis = model.transcribe([temp_file])[0]
+                transcription = hypothesis.text
+                print(f"Transcription: {transcription}")
+
+                os.remove(temp_file)  # Clean up
+                print("Temporary file removed.")
+
+                # Append the window's transcription if non-empty
+                if transcription.strip():
+                    new_state = new_state + " " + transcription if new_state else transcription
+
+                current_time += step_size
+
+            # Trim the buffer to the unprocessed tail; timing restarts at the
+            # head of the trimmed buffer on the next call
+            keep_samples = int((total_duration - current_time) * sample_rate)
+            if keep_samples > 0:
+                audio_buffer = [full_audio[-keep_samples:]]
+            else:
+                audio_buffer = []
+            last_processed_time = 0
+
+            print(f"New state: {new_state}")
+            return new_state, new_state, audio_buffer, last_processed_time
+
         except Exception as e:
             print(f"Error processing audio: {e}")
-            return state, state
-
-        new_state = state + " " + transcription if state else transcription
-        print(f"New state: {new_state}")
-        return new_state, new_state
-    return state, state
+            return state, state, audio_buffer, last_processed_time
+
+    print(f"Invalid audio input format: {type(audio)}")
+    return state, state, audio_buffer, last_processed_time
 
 # Define the Gradio interface
 with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
@@ -98,22 +153,23 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
 
     # State to store the ongoing transcription
     state = gr.State("")
-
+    audio_buffer = gr.State(value=None)
+    last_processed_time = gr.State(value=0)
     # Handle the audio stream
     audio_input.stream(
         fn=transcribe,
-        inputs=[audio_input, state],
-        outputs=[state, streaming_text],
+        inputs=[audio_input, state, audio_buffer, last_processed_time],
+        outputs=[state, streaming_text, audio_buffer, last_processed_time],
     )
 
     # Clear the transcription
     def clear_transcription():
-        return "", "", ""
+        return "", "", "", None, 0
 
     clear_btn.click(
         fn=clear_transcription,
         inputs=[],
-        outputs=[text_output, streaming_text, state]
+        outputs=[text_output, streaming_text, state, audio_buffer, last_processed_time]
     )
 
     # Update the main text output when the state changes
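
The heart of the change is the overlapping sliding window over the buffered stream. Below is a minimal, self-contained sketch of just that windowing arithmetic, separated from NeMo and Gradio so it can be sanity-checked on its own; `iter_windows` and the test values are illustrative names for this sketch, not part of the commit.

    import numpy as np

    def iter_windows(buffer, sample_rate, window_s=3.0, step_s=1.0):
        """Yield (start_time, window) pairs over `buffer` with overlap.

        Mirrors the commit's loop: a window is emitted only once a full
        `window_s` seconds is buffered, and consecutive windows advance by
        `step_s` seconds, so 3 s windows with a 1 s step share a 2 s overlap.
        """
        total_s = len(buffer) / sample_rate
        t = 0.0
        while t + window_s <= total_s:
            start = int(t * sample_rate)
            end = int((t + window_s) * sample_rate)
            yield t, buffer[start:end]
            t += step_s

    # Sanity check: 5 s of audio at 16 kHz yields windows starting at
    # 0 s, 1 s and 2 s; the last window ends exactly at 5 s.
    sr = 16000
    buf = np.zeros(5 * sr, dtype=np.float32)
    assert [t for t, _ in iter_windows(buf, sr)] == [0.0, 1.0, 2.0]

Two design notes the sketch makes visible: each 1 s step re-covers 2 s of audio that was already transcribed, so concatenating every window's text (as the new loop does) can repeat words across windows; and the fixed temp_audio.wav written inside the loop is shared by all sessions, where the standard-library tempfile.NamedTemporaryFile would be the usual remedy.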
 
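Because @spaces.GPU runs the callback in a separate worker process, module-level globals do not persist between stream events, which is why the commit threads the audio buffer and the clock through gr.State instead. Below is a minimal sketch of that round-trip wiring pattern, assuming Gradio 4.x; the `accumulate` callback is a toy stand-in for the app's real `transcribe`.

    import gradio as gr

    def accumulate(audio, buffer):
        """Toy stream callback: buffer each chunk, report buffered seconds.

        Like transcribe() in app.py, it receives its own state as an input
        and returns the updated state so Gradio can round-trip it.
        """
        if buffer is None:
            buffer = []
        if audio is None:
            return "no audio", buffer
        sample_rate, data = audio  # a streaming gr.Audio sends (rate, ndarray)
        buffer.append(data)
        seconds = sum(len(chunk) for chunk in buffer) / sample_rate
        return f"buffered {seconds:.2f}s", buffer

    with gr.Blocks() as demo:
        mic = gr.Audio(sources=["microphone"], streaming=True)
        status = gr.Textbox(label="Status")
        buffer = gr.State(value=None)  # per-session, survives across chunks
        mic.stream(fn=accumulate, inputs=[mic, buffer], outputs=[status, buffer])

    demo.launch()

Every state value a callback takes as an input must be returned in the matching position of outputs; Gradio then persists it per session, which is exactly how audio_buffer and last_processed_time survive from one audio chunk to the next.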