GavinHuang committed
Commit 2647bd6 · 1 Parent(s): 895c600

fix: enhance model loading and selection in transcribe function for improved user experience

Files changed (1)
1. app.py +62 -21
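The change boils down to one pattern: keep a single global model, remember which name it was loaded from, and reload only when a different name is requested. Below is a minimal sketch of that pattern (not part of the commit) using the same names as the diff; the NeMo call nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_name) is stubbed with a placeholder string so the snippet runs without NeMo installed.

# Sketch of the caching-and-selection pattern introduced by this commit.
# Names (model, current_model_name, load_model) follow the diff; the loader
# body is a stand-in, not the real NeMo from_pretrained() call.
model = None
current_model_name = "nvidia/parakeet-tdt-0.6b-v2"

def load_model(model_name=None):
    """Return the cached model, loading a new one only when the name changes."""
    global model, current_model_name
    model_name = model_name or current_model_name
    if model is None or model_name != current_model_name:
        print(f"Loading model {model_name}")
        current_model_name = model_name
        model = f"<loaded: {model_name}>"  # placeholder for from_pretrained(model_name)
    return model

print(load_model())                               # first call loads the model
print(load_model("nvidia/parakeet-tdt-0.6b-v2"))  # same name: cached object is reused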
app.py CHANGED
@@ -10,28 +10,42 @@ import librosa
 # Important: Don't initialize CUDA in the main process for Spaces
 # The model will be loaded in the worker process through the GPU decorator
 model = None
+current_model_name = "nvidia/parakeet-tdt-0.6b-v2"
 
-def load_model():
+# Available models
+available_models = ["nvidia/parakeet-tdt-0.6b-v2"]
+
+def load_model(model_name=None):
     # This function will be called in the GPU worker process
-    global model
-    if model is None:
-        print(f"Loading model in worker process")
+    global model, current_model_name
+
+    # Use the specified model name or the current one
+    model_name = model_name or current_model_name
+
+    # Check if we need to load a new model
+    if model is None or model_name != current_model_name:
+        print(f"Loading model {model_name} in worker process")
         print(f"CUDA available: {torch.cuda.is_available()}")
         if torch.cuda.is_available():
             print(f"CUDA device: {torch.cuda.get_device_name(0)}")
-        model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")
+
+        # Update the current model name
+        current_model_name = model_name
+
+        # Load the selected model
+        model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_name)
         print(f"Model loaded on device: {model.device}")
+
     return model
 
 @spaces.GPU(duration=120)
-def transcribe(audio, state="", audio_buffer=None, last_processed_time=0):
+def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_buffer=None, last_processed_time=0):
     # Load the model inside the GPU worker process
     import numpy as np
     import soundfile as sf
     import librosa
     import os
-    model = load_model()
-
+    model = load_model(model_name)
     if audio_buffer is None:
         audio_buffer = []
 
@@ -129,7 +143,22 @@ def transcribe(audio, state="", audio_buffer=None, last_processed_time=0):
 # Define the Gradio interface
 with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
     gr.Markdown("# 🎙️ Real-time Speech-to-Text Transcription")
-    gr.Markdown("Powered by NVIDIA NeMo and the parakeet-tdt-0.6b-v2 model")
+    gr.Markdown("Powered by NVIDIA NeMo")
+
+    # Model selection and loading
+    with gr.Row():
+        with gr.Column(scale=3):
+            model_dropdown = gr.Dropdown(
+                choices=available_models,
+                value=current_model_name,
+                label="Select ASR Model"
+            )
+        with gr.Column(scale=1):
+            load_button = gr.Button("Load Selected Model")
+
+    # Status indicator for model loading
+    model_status = gr.Textbox(value=f"Current model: {current_model_name}", label="Model Status")
+
     with gr.Row():
         with gr.Column(scale=2):
             audio_input = gr.Audio(
@@ -152,19 +181,30 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
                 placeholder="Real-time results will appear here...",
                 lines=2
             )
-
-    # State to store the ongoing transcription
+    # State to store the ongoing transcription
     state = gr.State("")
    audio_buffer = gr.State(value=None)
     last_processed_time = gr.State(value=0)
+
+    # Function to handle model selection
+    def update_model(model_name):
+        global current_model_name
+        current_model_name = model_name
+        return f"Current model: {model_name}", None, 0 # Reset audio buffer and last processed time
+
+    # Load model button event
+    load_button.click(
+        fn=update_model,
+        inputs=[model_dropdown],
+        outputs=[model_status, audio_buffer, last_processed_time]
+    )
+
     # Handle the audio stream
     audio_input.stream(
         fn=transcribe,
-        inputs=[audio_input, state, audio_buffer, last_processed_time],
+        inputs=[audio_input, model_dropdown, state, audio_buffer, last_processed_time],
         outputs=[state, streaming_text, audio_buffer, last_processed_time],
-    )
-
-    # Clear the transcription
+    ) # Clear the transcription
     def clear_transcription():
         return "", "", None, 0
 
@@ -180,13 +220,14 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
         inputs=[state],
         outputs=[text_output]
     )
-
-    gr.Markdown("## 📝 Instructions")
+    gr.Markdown("## 📝 Instructions")
     gr.Markdown("""
-    1. Click the microphone button to start recording
-    2. Speak clearly into your microphone
-    3. The transcription will appear in real-time
-    4. Click 'Clear Transcript' to start a new transcription
+    1. Select an ASR model from the dropdown menu
+    2. Click 'Load Selected Model' to load the model
+    3. Click the microphone button to start recording
+    4. Speak clearly into your microphone
+    5. The transcription will appear in real-time
+    6. Click 'Clear Transcript' to start a new transcription
     """)
 
 # Launch the app
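A hypothetical smoke test (not part of this commit) for the new transcribe signature is sketched below. It assumes app.py is importable as a module named app, that Gradio's streaming gr.Audio delivers (sample_rate, numpy_array) chunks, and that transcribe returns the four values wired to outputs in the diff; the silent test chunk is purely illustrative.

# Usage sketch under the assumptions above: exercise the model_name parameter
# that this commit threads from the dropdown into the GPU worker.
import numpy as np
from app import transcribe  # assumes app.py is on the import path

sr = 16000
chunk = (sr, np.zeros(sr, dtype=np.float32))  # one second of silence as a stand-in stream chunk

state, partial_text, buffer, last_t = transcribe(
    chunk,
    model_name="nvidia/parakeet-tdt-0.6b-v2",  # value the dropdown would pass
    state="",
    audio_buffer=None,
    last_processed_time=0,
)
print(partial_text)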