GavinHuang committed on
Commit
7925ca5
·
1 Parent(s): 944e4f0

fix: add support for an additional model in available models and improve audio processing logic

Browse files
Files changed (1) hide show
  1. app.py +10 -9
app.py CHANGED
@@ -13,7 +13,7 @@ model = None
13
  current_model_name = "nvidia/parakeet-tdt-0.6b-v2"
14
 
15
  # Available models
16
- available_models = ["nvidia/parakeet-tdt-0.6b-v2"]
17
 
18
  def load_model(model_name=None):
19
  # This function will be called in the GPU worker process
@@ -25,9 +25,9 @@ def load_model(model_name=None):
25
  # Check if we need to load a new model
26
  if model is None or model_name != current_model_name:
27
  print(f"Loading model {model_name} in worker process")
28
- print(f"CUDA available: {torch.cuda.is_available()}")
29
- if torch.cuda.is_available():
30
- print(f"CUDA device: {torch.cuda.get_device_name(0)}")
31
 
32
  # Update the current model name
33
  current_model_name = model_name
@@ -89,9 +89,9 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
89
  full_audio = full_audio.astype(float)
90
 
91
  # Normalize audio (helps with consistent volume levels)
92
- if np.abs(full_audio).max() > 0:
93
- full_audio = full_audio / np.abs(full_audio).max() * 0.9
94
- print("Audio normalized to improve transcription")
95
 
96
  # Process chunks
97
  new_state = state
@@ -102,7 +102,7 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
102
  start_sample = int(current_time * sample_rate)
103
  end_sample = int((current_time + chunk_duration) * sample_rate)
104
  if end_sample > total_samples_16k:
105
- break
106
 
107
  chunk = full_audio[start_sample:end_sample]
108
  print(f"Processing chunk from {current_time:.2f}s to {current_time + chunk_duration:.2f}s")
@@ -206,7 +206,8 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
206
  sources=["microphone"],
207
  type="numpy",
208
  streaming=True,
209
- label="Speak into your microphone"
 
210
  )
211
 
212
  clear_btn = gr.Button("Clear Transcript", variant="secondary")
 
13
  current_model_name = "nvidia/parakeet-tdt-0.6b-v2"
14
 
15
  # Available models
16
+ available_models = ["nvidia/parakeet-tdt-0.6b-v2","nvidia/parakeet-tdt-1.1b"]
17
 
18
  def load_model(model_name=None):
19
  # This function will be called in the GPU worker process
 
25
  # Check if we need to load a new model
26
  if model is None or model_name != current_model_name:
27
  print(f"Loading model {model_name} in worker process")
28
+ # print(f"CUDA available: {torch.cuda.is_available()}")
29
+ # if torch.cuda.is_available():
30
+ # print(f"CUDA device: {torch.cuda.get_device_name(0)}")
31
 
32
  # Update the current model name
33
  current_model_name = model_name
 
89
  full_audio = full_audio.astype(float)
90
 
91
  # Normalize audio (helps with consistent volume levels)
92
+ # if np.abs(full_audio).max() > 0:
93
+ # full_audio = full_audio / np.abs(full_audio).max() * 0.9
94
+ # print("Audio normalized to improve transcription")
95
 
96
  # Process chunks
97
  new_state = state
 
102
  start_sample = int(current_time * sample_rate)
103
  end_sample = int((current_time + chunk_duration) * sample_rate)
104
  if end_sample > total_samples_16k:
105
+ end_sample = total_samples_16k
106
 
107
  chunk = full_audio[start_sample:end_sample]
108
  print(f"Processing chunk from {current_time:.2f}s to {current_time + chunk_duration:.2f}s")
 
206
  sources=["microphone"],
207
  type="numpy",
208
  streaming=True,
209
+ label="Speak into your microphone",
210
+ samplerate=16000
211
  )
212
 
213
  clear_btn = gr.Button("Clear Transcript", variant="secondary")