GavinHuang committed on
Commit
2b5f9bc
·
1 Parent(s): 50ea265

feat: add file transcription functionality and enhance UI for model selection

Browse files
Files changed (1) hide show
  1. app.py +107 -29
app.py CHANGED
@@ -140,6 +140,33 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
140
  print(f"Invalid audio input format: {type(audio)}")
141
  return state, state, audio_buffer, last_processed_time
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  # Define the Gradio interface
144
  with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
145
  gr.Markdown("# πŸŽ™οΈ Real-time Speech-to-Text Transcription")
@@ -159,28 +186,50 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
159
  # Status indicator for model loading
160
  model_status = gr.Textbox(value=f"Current model: {current_model_name}", label="Model Status")
161
 
162
- with gr.Row():
163
- with gr.Column(scale=2):
164
- audio_input = gr.Audio(
165
- sources=["microphone"],
166
- type="numpy",
167
- streaming=True,
168
- label="Speak into your microphone"
169
- )
170
-
171
- clear_btn = gr.Button("Clear Transcript")
172
-
173
- with gr.Column(scale=3):
174
- text_output = gr.Textbox(
175
- label="Transcription",
176
- placeholder="Your speech will appear here...",
177
- lines=10
178
- )
179
- streaming_text = gr.Textbox(
180
- label="Real-time Transcription",
181
- placeholder="Real-time results will appear here...",
182
- lines=2
183
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  # State to store the ongoing transcription
185
  state = gr.State("")
186
  audio_buffer = gr.State(value=None)
@@ -188,9 +237,20 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
188
 
189
  # Function to handle model selection
190
  def update_model(model_name):
191
- global current_model_name
192
  current_model_name = model_name
193
- return f"Current model: {model_name}", None, 0 # Reset audio buffer and last processed time
 
 
 
 
 
 
 
 
 
 
 
194
 
195
  # Load model button event
196
  load_button.click(
@@ -198,16 +258,24 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
198
  inputs=[model_dropdown],
199
  outputs=[model_status, audio_buffer, last_processed_time]
200
  )
201
-
202
- # Handle the audio stream
203
  audio_input.stream(
204
  fn=transcribe,
205
  inputs=[audio_input, model_dropdown, state, audio_buffer, last_processed_time],
206
  outputs=[state, streaming_text, audio_buffer, last_processed_time],
207
- ) # Clear the transcription
 
 
 
 
 
 
 
 
 
208
  def clear_transcription():
209
- return "", "", None, 0
210
-
211
  clear_btn.click(
212
  fn=clear_transcription,
213
  inputs=[],
@@ -220,14 +288,24 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
220
  inputs=[state],
221
  outputs=[text_output]
222
  )
 
223
  gr.Markdown("## πŸ“ Instructions")
224
  gr.Markdown("""
 
225
  1. Select an ASR model from the dropdown menu
226
  2. Click 'Load Selected Model' to load the model
227
  3. Click the microphone button to start recording
228
  4. Speak clearly into your microphone
229
  5. The transcription will appear in real-time
230
  6. Click 'Clear Transcript' to start a new transcription
 
 
 
 
 
 
 
 
231
  """)
232
 
233
  # Launch the app
 
140
  print(f"Invalid audio input format: {type(audio)}")
141
  return state, state, audio_buffer, last_processed_time
142
 
143
@spaces.GPU(duration=120)
def transcribe_file(audio_file, model_name="nvidia/parakeet-tdt-0.6b-v2"):
    """Transcribe a complete audio file in a single pass.

    Args:
        audio_file: Path of the audio file to transcribe. ``None`` or an
            empty string means nothing was provided.
        model_name: Identifier handed to ``load_model`` to obtain the ASR
            model.

    Returns:
        The transcription text. When no file was provided, when the model
        produced no result, or when loading/transcription failed, a
        human-readable message is returned instead of raising, so the
        caller can always display the return value as text.
    """
    # Check if audio file is provided
    if not audio_file:
        return "No audio file provided. Please upload an audio file."

    try:
        # Load the model inside the GPU worker process
        model = load_model(model_name)

        print(f"Processing file: {audio_file}")

        # Transcribe the entire file at once
        results = model.transcribe([audio_file])
        if not results:
            return "No transcription was produced for this file."

        hypothesis = results[0]
        # NOTE(review): depending on the installed ASR library version the
        # items may be plain strings or objects exposing `.text`; accept
        # both rather than failing on a missing attribute.
        if isinstance(hypothesis, str):
            transcription = hypothesis
        else:
            transcription = hypothesis.text
        print(f"File transcription: {transcription}")

        return transcription
    except Exception as e:
        # UI boundary: report the failure as text instead of propagating it.
        print(f"Error transcribing file: {e}")
        return f"Error transcribing file: {str(e)}"
169
+
170
  # Define the Gradio interface
171
  with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
172
  gr.Markdown("# πŸŽ™οΈ Real-time Speech-to-Text Transcription")
 
186
  # Status indicator for model loading
187
  model_status = gr.Textbox(value=f"Current model: {current_model_name}", label="Model Status")
188
 
189
+ # Create tabs for real-time and file-based transcription
190
+ with gr.Tabs():
191
+ # Real-time transcription tab
192
+ with gr.TabItem("Real-time Transcription"):
193
+ with gr.Row():
194
+ with gr.Column(scale=2):
195
+ audio_input = gr.Audio(
196
+ sources=["microphone"],
197
+ type="numpy",
198
+ streaming=True,
199
+ label="Speak into your microphone"
200
+ )
201
+
202
+ clear_btn = gr.Button("Clear Transcript")
203
+
204
+ with gr.Column(scale=3):
205
+ text_output = gr.Textbox(
206
+ label="Transcription",
207
+ placeholder="Your speech will appear here...",
208
+ lines=10
209
+ )
210
+ streaming_text = gr.Textbox(
211
+ label="Real-time Transcription",
212
+ placeholder="Real-time results will appear here...",
213
+ lines=2
214
+ )
215
+ # File-based transcription tab
216
+ with gr.TabItem("File Transcription"):
217
+ with gr.Row():
218
+ with gr.Column(scale=2):
219
+ # Audio recorder that saves to file
220
+ audio_recorder = gr.Audio(
221
+ sources=["microphone"],
222
+ type="filepath",
223
+ label="Record or upload audio file"
224
+ )
225
+ transcribe_btn = gr.Button("Transcribe Audio File")
226
+
227
+ with gr.Column(scale=3):
228
+ file_transcription = gr.Textbox(
229
+ label="File Transcription",
230
+ placeholder="Transcription will appear here after clicking 'Transcribe Audio File'",
231
+ lines=10
232
+ )
233
  # State to store the ongoing transcription
234
  state = gr.State("")
235
  audio_buffer = gr.State(value=None)
 
237
 
238
  # Function to handle model selection
239
  def update_model(model_name):
240
+ global current_model_name, model
241
  current_model_name = model_name
242
+
243
+ # Load the model immediately if we're in a GPU context
244
+ try:
245
+ # This will load the model in the GPU worker
246
+ model = load_model(model_name)
247
+ status_message = f"Current model: {model_name} (loaded)"
248
+ print(f"Model {model_name} loaded successfully")
249
+ except Exception as e:
250
+ status_message = f"Current model: {model_name} (will be loaded on first use)"
251
+ print(f"Model will be loaded on first use: {e}")
252
+
253
+ return status_message, None, 0 # Reset audio buffer and last processed time
254
 
255
  # Load model button event
256
  load_button.click(
 
258
  inputs=[model_dropdown],
259
  outputs=[model_status, audio_buffer, last_processed_time]
260
  )
261
+ # Handle the audio stream for real-time transcription
 
262
  audio_input.stream(
263
  fn=transcribe,
264
  inputs=[audio_input, model_dropdown, state, audio_buffer, last_processed_time],
265
  outputs=[state, streaming_text, audio_buffer, last_processed_time],
266
+ )
267
+
268
+ # Handle file transcription
269
+ transcribe_btn.click(
270
+ fn=transcribe_file,
271
+ inputs=[audio_recorder, model_dropdown],
272
+ outputs=[file_transcription]
273
+ )
274
+
275
+ # Clear the transcription
276
  def clear_transcription():
277
+ return "", "", None, 0
278
+
279
  clear_btn.click(
280
  fn=clear_transcription,
281
  inputs=[],
 
288
  inputs=[state],
289
  outputs=[text_output]
290
  )
291
+
292
  gr.Markdown("## πŸ“ Instructions")
293
  gr.Markdown("""
294
+ ### Real-time Transcription:
295
  1. Select an ASR model from the dropdown menu
296
  2. Click 'Load Selected Model' to load the model
297
  3. Click the microphone button to start recording
298
  4. Speak clearly into your microphone
299
  5. The transcription will appear in real-time
300
  6. Click 'Clear Transcript' to start a new transcription
301
+
302
+ ### File Transcription:
303
+ 1. Select an ASR model from the dropdown menu
304
+ 2. Click 'Load Selected Model' to load the model
305
+ 3. Switch to the 'File Transcription' tab
306
+ 4. Record audio by clicking the microphone button or upload an existing audio file
307
+ 5. Click 'Transcribe Audio File' to process the recording
308
+ 6. The complete transcription will appear in the text box
309
  """)
310
 
311
  # Launch the app