GavinHuang committed on
Commit
7925ca5
·
1 Parent(s): 944e4f0

fix: add support for an additional model in available models and improve audio processing logic

Browse files
Files changed (1) hide show
  1. app.py +10 -9
app.py CHANGED
@@ -13,7 +13,7 @@ model = None
13
  current_model_name = "nvidia/parakeet-tdt-0.6b-v2"
14
 
15
  # Available models
16
- available_models = ["nvidia/parakeet-tdt-0.6b-v2"]
17
 
18
  def load_model(model_name=None):
19
  # This function will be called in the GPU worker process
@@ -25,9 +25,9 @@ def load_model(model_name=None):
25
  # Check if we need to load a new model
26
  if model is None or model_name != current_model_name:
27
  print(f"Loading model {model_name} in worker process")
28
- print(f"CUDA available: {torch.cuda.is_available()}")
29
- if torch.cuda.is_available():
30
- print(f"CUDA device: {torch.cuda.get_device_name(0)}")
31
 
32
  # Update the current model name
33
  current_model_name = model_name
@@ -89,9 +89,9 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
89
  full_audio = full_audio.astype(float)
90
 
91
  # Normalize audio (helps with consistent volume levels)
92
- if np.abs(full_audio).max() > 0:
93
- full_audio = full_audio / np.abs(full_audio).max() * 0.9
94
- print("Audio normalized to improve transcription")
95
 
96
  # Process chunks
97
  new_state = state
@@ -102,7 +102,7 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
102
  start_sample = int(current_time * sample_rate)
103
  end_sample = int((current_time + chunk_duration) * sample_rate)
104
  if end_sample > total_samples_16k:
105
- break
106
 
107
  chunk = full_audio[start_sample:end_sample]
108
  print(f"Processing chunk from {current_time:.2f}s to {current_time + chunk_duration:.2f}s")
@@ -206,7 +206,8 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
206
  sources=["microphone"],
207
  type="numpy",
208
  streaming=True,
209
- label="Speak into your microphone"
 
210
  )
211
 
212
  clear_btn = gr.Button("Clear Transcript", variant="secondary")
 
13
  current_model_name = "nvidia/parakeet-tdt-0.6b-v2"
14
 
15
  # Available models
16
+ available_models = ["nvidia/parakeet-tdt-0.6b-v2","nvidia/parakeet-tdt-1.1b"]
17
 
18
  def load_model(model_name=None):
19
  # This function will be called in the GPU worker process
 
25
  # Check if we need to load a new model
26
  if model is None or model_name != current_model_name:
27
  print(f"Loading model {model_name} in worker process")
28
+ # print(f"CUDA available: {torch.cuda.is_available()}")
29
+ # if torch.cuda.is_available():
30
+ # print(f"CUDA device: {torch.cuda.get_device_name(0)}")
31
 
32
  # Update the current model name
33
  current_model_name = model_name
 
89
  full_audio = full_audio.astype(float)
90
 
91
  # Normalize audio (helps with consistent volume levels)
92
+ # if np.abs(full_audio).max() > 0:
93
+ # full_audio = full_audio / np.abs(full_audio).max() * 0.9
94
+ # print("Audio normalized to improve transcription")
95
 
96
  # Process chunks
97
  new_state = state
 
102
  start_sample = int(current_time * sample_rate)
103
  end_sample = int((current_time + chunk_duration) * sample_rate)
104
  if end_sample > total_samples_16k:
105
+ end_sample = total_samples_16k
106
 
107
  chunk = full_audio[start_sample:end_sample]
108
  print(f"Processing chunk from {current_time:.2f}s to {current_time + chunk_duration:.2f}s")
 
206
  sources=["microphone"],
207
  type="numpy",
208
  streaming=True,
209
+ label="Speak into your microphone",
210
+ samplerate=16000
211
  )
212
 
213
  clear_btn = gr.Button("Clear Transcript", variant="secondary")