Spaces: Running on Zero

Commit f374409 · Parent: 2b5f9bc
fix: improve audio processing in transcribe function with longer chunk duration and normalization

app.py CHANGED
@@ -54,7 +54,6 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
         return state, state, audio_buffer, last_processed_time
 
     print(f"Received audio input of type: {type(audio)}")
-
     if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[1], np.ndarray):
         sample_rate, audio_data = audio
         print(f"Sample rate: {sample_rate}, Audio shape: {audio_data.shape}")
@@ -67,15 +66,16 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
         total_duration = total_samples / sample_rate
         print(f"Total buffered duration: {total_duration:.2f}s")
 
-        # Process 2-second chunks with 1-second step size
-        chunk_duration = 2.0
-        step_size = 1.0
-        min_samples = int(chunk_duration * 16000)
-
+        # Process 5-second chunks with 2-second step size (3-second overlap)
+        # Using longer chunks usually helps with transcription accuracy
+        chunk_duration = 5.0  # seconds (increased from 2.0)
+        step_size = 2.0  # seconds (increased from 1.0)
+        # min_samples = int(chunk_duration * 16000)  # 5s at 16kHz
+
         if total_duration < chunk_duration:
             print(f"Buffering audio, total duration: {total_duration:.2f}s")
             return state, state, audio_buffer, last_processed_time
-
+
         try:
             # Concatenate buffered chunks
             full_audio = np.concatenate(audio_buffer)
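The new parameters define a sliding window: each pass transcribes 5 s of audio while the window advances only 2 s, so consecutive chunks share 3 s of overlap. A minimal sketch of that windowing over a 16 kHz buffer (the iter_chunks helper and the synthetic buffer are illustrative, not part of app.py):

import numpy as np

def iter_chunks(full_audio, sample_rate=16000, chunk_duration=5.0, step_size=2.0):
    # Yield overlapping windows: chunk_duration seconds long,
    # advancing step_size seconds per iteration.
    chunk_samples = int(chunk_duration * sample_rate)
    step_samples = int(step_size * sample_rate)
    for start in range(0, max(len(full_audio) - chunk_samples + 1, 1), step_samples):
        yield full_audio[start:start + chunk_samples]

# A 9 s buffer yields windows starting at 0 s, 2 s, and 4 s.
buffer = np.zeros(9 * 16000, dtype=np.float32)
print(sum(1 for _ in iter_chunks(buffer)))  # 3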
@@ -88,7 +88,12 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
             else:
                 full_audio = full_audio.astype(float)
 
-            # Process chunks
+            # Normalize audio (helps with consistent volume levels)
+            if np.abs(full_audio).max() > 0:
+                full_audio = full_audio / np.abs(full_audio).max() * 0.9
+                print("Audio normalized to improve transcription")
+
+            # Process chunks
             new_state = state
             current_time = last_processed_time
             total_samples_16k = len(full_audio)
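The normalization is plain peak scaling: divide by the maximum absolute sample and rescale to 0.9, so the loudest sample sits just below full scale regardless of microphone gain, while silence is left untouched to avoid dividing by zero. The same operation as a standalone sketch (the function name is illustrative):

import numpy as np

def peak_normalize(audio, target_peak=0.9):
    # Scale so the loudest sample has amplitude target_peak.
    # The peak > 0 guard mirrors the diff: normalizing an all-zero
    # buffer would divide by zero and produce NaNs.
    peak = np.abs(audio).max()
    if peak > 0:
        return audio / peak * target_peak
    return audio

print(peak_normalize(np.array([0.01, -0.02, 0.015])))  # [ 0.45 -0.9 0.675]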
@@ -107,6 +112,7 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
                 sf.write(temp_file, chunk, samplerate=16000)
 
                 # Transcribe
+                print(f"Transcribing chunk of duration {chunk_duration}s...")
                 hypothesis = model.transcribe([temp_file])[0]
                 transcription = hypothesis.text
                 print(f"Transcription: {transcription}")
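For context, the surrounding loop hands each chunk to NeMo by writing it to a WAV file and passing the path to model.transcribe(), which here returns hypothesis objects exposing .text (as the hypothesis.text access in the diff implies). A hedged sketch of that round trip; the tempfile handling is illustrative and assumes a 16 kHz float chunk:

import tempfile
import soundfile as sf

def transcribe_chunk(model, chunk, sample_rate=16000):
    # Write one chunk to a temporary WAV, then transcribe it by path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        temp_file = f.name
    sf.write(temp_file, chunk, samplerate=sample_rate)
    hypothesis = model.transcribe([temp_file])[0]
    return hypothesis.text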
@@ -181,10 +187,14 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
                 label="Select ASR Model"
             )
         with gr.Column(scale=1):
-            load_button = gr.Button("Load Selected Model")
+            load_button = gr.Button("Load Selected Model", elem_id="load-button", elem_classes=["btn-blue"])
 
             # Status indicator for model loading
-            model_status = gr.Textbox(
+            model_status = gr.Textbox(
+                value=f"Current model: {current_model_name}",
+                label="Model Status",
+                container=False
+            )
 
     # Create tabs for real-time and file-based transcription
     with gr.Tabs():
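Note that elem_id and elem_classes only attach HTML id/class hooks to the button; any visual change depends on matching CSS (for example, passed via the css= argument of gr.Blocks) being defined elsewhere in the app.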
@@ -199,7 +209,7 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
                         label="Speak into your microphone"
                     )
 
-                    clear_btn = gr.Button("Clear Transcript")
+                    # clear_btn = gr.Button("Clear Transcript")
 
                 with gr.Column(scale=3):
                     text_output = gr.Textbox(
@@ -212,7 +222,7 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
                         placeholder="Real-time results will appear here...",
                         lines=2
                     )
-
+        # File-based transcription tab
         with gr.TabItem("File Transcription"):
             with gr.Row():
                 with gr.Column(scale=2):
@@ -258,7 +268,8 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
         inputs=[model_dropdown],
         outputs=[model_status, audio_buffer, last_processed_time]
     )
-
+    # Handle the audio stream for real-time transcription
+    streaming_text = gr.State(value="")
     audio_input.stream(
         fn=transcribe,
         inputs=[audio_input, model_dropdown, state, audio_buffer, last_processed_time],
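The stream() wiring is what makes transcribe incremental: Gradio re-invokes it on every microphone update, threading the gr.State values (transcript, audio buffer, last processed time) through as extra inputs and outputs. A stripped-down sketch of the same pattern, assuming a recent Gradio 4.x API (the echo function is a placeholder):

import gradio as gr

def echo(audio, state):
    # audio arrives as (sample_rate, np.ndarray); append a tick per update.
    state = (state or "") + "."
    return state, state

with gr.Blocks() as demo:
    state = gr.State(value="")
    mic = gr.Audio(sources=["microphone"], streaming=True)
    out = gr.Textbox()
    mic.stream(fn=echo, inputs=[mic, state], outputs=[out, state])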
@@ -272,16 +283,6 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
         outputs=[file_transcription]
     )
 
-    # Clear the transcription
-    def clear_transcription():
-        return "", "", None, 0
-
-    clear_btn.click(
-        fn=clear_transcription,
-        inputs=[],
-        outputs=[text_output, streaming_text, audio_buffer, last_processed_time]
-    )
-
     # Update the main text output when the state changes
     state.change(
         fn=lambda s: s,
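Removing the clear_transcription handler is consistent with commenting out clear_btn in the UI above: keeping the clear_btn.click() wiring while the button no longer exists would raise a NameError when the Blocks graph is built.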