Spaces: Running on Zero

Commit 7925ca5 · 1 Parent(s): 944e4f0

fix: add support for additional model in available models and improve audio processing logic

app.py CHANGED

@@ -13,7 +13,7 @@ model = None
 current_model_name = "nvidia/parakeet-tdt-0.6b-v2"
 
 # Available models
-available_models = ["nvidia/parakeet-tdt-0.6b-v2"]
+available_models = ["nvidia/parakeet-tdt-0.6b-v2","nvidia/parakeet-tdt-1.1b"]
 
 def load_model(model_name=None):
     # This function will be called in the GPU worker process
@@ -25,9 +25,9 @@ def load_model(model_name=None):
     # Check if we need to load a new model
     if model is None or model_name != current_model_name:
         print(f"Loading model {model_name} in worker process")
-        print(f"CUDA available: {torch.cuda.is_available()}")
-        if torch.cuda.is_available():
-            print(f"CUDA device: {torch.cuda.get_device_name(0)}")
+        # print(f"CUDA available: {torch.cuda.is_available()}")
+        # if torch.cuda.is_available():
+        #     print(f"CUDA device: {torch.cuda.get_device_name(0)}")
 
         # Update the current model name
         current_model_name = model_name
@@ -89,9 +89,9 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
     full_audio = full_audio.astype(float)
 
     # Normalize audio (helps with consistent volume levels)
-    if np.abs(full_audio).max() > 0:
-        full_audio = full_audio / np.abs(full_audio).max() * 0.9
-        print("Audio normalized to improve transcription")
+    # if np.abs(full_audio).max() > 0:
+    #     full_audio = full_audio / np.abs(full_audio).max() * 0.9
+    #     print("Audio normalized to improve transcription")
 
     # Process chunks
     new_state = state
@@ -102,7 +102,7 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
     start_sample = int(current_time * sample_rate)
     end_sample = int((current_time + chunk_duration) * sample_rate)
     if end_sample > total_samples_16k:
-
+        end_sample = total_samples_16k
 
     chunk = full_audio[start_sample:end_sample]
     print(f"Processing chunk from {current_time:.2f}s to {current_time + chunk_duration:.2f}s")
@@ -206,7 +206,8 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
         sources=["microphone"],
         type="numpy",
         streaming=True,
-        label="Speak into your microphone"
+        label="Speak into your microphone",
+        samplerate=16000
     )
 
     clear_btn = gr.Button("Clear Transcript", variant="secondary")
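
The available_models change only adds a second Parakeet checkpoint; actually switching between them relies on load_model's cache-and-reload guard, which this commit leaves in place. Below is a minimal sketch of that pattern reusing the names from the diff; the loader call and the membership check are assumptions, since the commit does not show which loader app.py uses (NeMo's generic ASRModel.from_pretrained is the usual entry point for these checkpoints).

import nemo.collections.asr as nemo_asr

model = None
current_model_name = "nvidia/parakeet-tdt-0.6b-v2"
available_models = ["nvidia/parakeet-tdt-0.6b-v2", "nvidia/parakeet-tdt-1.1b"]

def load_model(model_name=None):
    """Load (or re-use) the ASR model inside the GPU worker process."""
    global model, current_model_name
    if model_name is None:
        # Assumed default handling; the diff only shows the signature.
        model_name = current_model_name
    if model_name not in available_models:
        # Sketch-only guard; not shown in the commit.
        raise ValueError(f"Unknown model: {model_name}")
    # Only pay the download/GPU cost when the requested model changes.
    if model is None or model_name != current_model_name:
        print(f"Loading model {model_name} in worker process")
        # Assumption: NeMo's generic loader; app.py's actual call is
        # outside this hunk.
        model = nemo_asr.models.ASRModel.from_pretrained(model_name)
        # Update the current model name
        current_model_name = model_name
    return model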
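
The clamp added in the last transcribe hunk is what lets the trailing partial chunk be processed instead of the index running past the end of the buffer. Here is a sketch of that chunking loop with the variable names from the diff; the loop scaffolding and the 5-second chunk_duration are assumptions, since the surrounding code is outside this commit.

import numpy as np

def iter_chunks(full_audio, sample_rate=16000, chunk_duration=5.0):
    # Yield fixed-length windows over full_audio; names mirror the diff,
    # but the generator itself is an assumed reconstruction.
    total_samples_16k = len(full_audio)
    total_duration = total_samples_16k / sample_rate
    current_time = 0.0
    while current_time < total_duration:
        start_sample = int(current_time * sample_rate)
        end_sample = int((current_time + chunk_duration) * sample_rate)
        if end_sample > total_samples_16k:
            # The commit's fix: clamp so the final short chunk is kept.
            end_sample = total_samples_16k
        yield full_audio[start_sample:end_sample]
        current_time += chunk_duration

# 12.5 s of audio at 16 kHz splits into 5 s, 5 s, and a clamped 2.5 s chunk.
chunks = list(iter_chunks(np.zeros(200000)))
print([len(c) / 16000 for c in chunks])  # [5.0, 5.0, 2.5]

Note that NumPy slicing already truncates an out-of-range stop index, so the explicit clamp mainly keeps end_sample accurate for any bookkeeping done after the slice.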