import os
import gradio as gr
import torch
import nemo.collections.asr as nemo_asr
import spaces
import librosa

# Important: Don't initialize CUDA in the main process for Spaces
# The model will be loaded in the worker process through the GPU decorator
model = None
current_model_name = "nvidia/parakeet-tdt-0.6b-v2"
# Available models
available_models = ["nvidia/parakeet-tdt-0.6b-v2", "nvidia/parakeet-tdt-1.1b"]


def load_model(model_name=None):
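    """Load the requested ASR model, reusing the cached instance when possible.

    This runs inside the GPU worker process; the global ``model`` acts as a
    cache keyed by ``current_model_name``.
    """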
    global model, current_model_name
    # Use the specified model name or the current one
    model_name = model_name or current_model_name
    # Check if we need to load a new model
    if model is None or model_name != current_model_name:
        print(f"Loading model {model_name} in worker process")
        # print(f"CUDA available: {torch.cuda.is_available()}")
        # if torch.cuda.is_available():
        #     print(f"CUDA device: {torch.cuda.get_device_name(0)}")
        # Update the current model name
        current_model_name = model_name
        # Load the selected model
        model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_name)
        print(f"Model loaded on device: {model.device}")
    return model

@spaces.GPU(duration=120)
def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_buffer=None, last_processed_time=0):
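    """Streaming callback for ``audio_input.stream``.

    Buffers incoming ``(sample_rate, ndarray)`` chunks, transcribes 5 s
    windows every 2 s, and returns the updated transcript, the latest chunk's
    text, the trimmed audio buffer, and the last processed time so Gradio can
    thread that state through successive calls.
    """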
    # Load the model inside the GPU worker process
    import numpy as np
    import soundfile as sf
    import librosa
    import os

    model = load_model(model_name)

    if audio_buffer is None:
        audio_buffer = []

    if audio is None or isinstance(audio, int):
        print(f"Skipping invalid audio input: {type(audio)}")
        return state, state, audio_buffer, last_processed_time

    print(f"Received audio input of type: {type(audio)}")
    if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[1], np.ndarray):
        sample_rate, audio_data = audio
        print(f"Sample rate: {sample_rate}, Audio shape: {audio_data.shape}")
        # Append the chunk to the buffer
        audio_buffer.append(audio_data)

        # Calculate the total buffered duration in seconds
        total_samples = sum(arr.shape[0] for arr in audio_buffer)
        total_duration = total_samples / sample_rate
        print(f"Total buffered duration: {total_duration:.2f}s")

        # Process 5-second chunks with a 2-second step size (3-second overlap);
        # longer chunks usually help transcription accuracy
        chunk_duration = 5.0  # seconds
        step_size = 2.0  # seconds

        if total_duration < chunk_duration:
            print(f"Buffering audio, total duration: {total_duration:.2f}s")
            return state, state, audio_buffer, last_processed_time
        try:
            # Concatenate buffered chunks
            full_audio = np.concatenate(audio_buffer)

            # Resample to 16kHz if needed
            if sample_rate != 16000:
                print(f"Resampling from {sample_rate}Hz to 16000Hz")
                full_audio = librosa.resample(full_audio.astype(float), orig_sr=sample_rate, target_sr=16000)
                sample_rate = 16000
            else:
                full_audio = full_audio.astype(float)

            # Normalizing to a consistent volume level can also help:
            # if np.abs(full_audio).max() > 0:
            #     full_audio = full_audio / np.abs(full_audio).max() * 0.9

            # Process chunks
            new_state = state
            current_time = last_processed_time
            total_samples_16k = len(full_audio)
            # If the loop below never runs, fall back to an empty chunk text
            transcription = ""
            while current_time + chunk_duration <= total_duration:
                start_sample = int(current_time * sample_rate)
                end_sample = min(int((current_time + chunk_duration) * sample_rate), total_samples_16k)
                chunk = full_audio[start_sample:end_sample]
                print(f"Processing chunk from {current_time:.2f}s to {current_time + chunk_duration:.2f}s")

                # Write the chunk to a temporary WAV file for NeMo
                temp_file = "temp_audio.wav"
                sf.write(temp_file, chunk, samplerate=16000)

                # Transcribe
                print(f"Transcribing chunk of duration {chunk_duration}s...")
                hypothesis = model.transcribe([temp_file])[0]
                transcription = hypothesis.text
                print(f"Transcription: {transcription}")

                os.remove(temp_file)
                print("Temporary file removed.")

                # Append the transcription if it is non-empty
                if transcription.strip():
                    new_state = new_state + " " + transcription if new_state else transcription
                current_time += step_size

            # Trim the buffer to keep only unprocessed audio
            keep_samples = int((total_duration - current_time) * sample_rate)
            if keep_samples > 0:
                audio_buffer = [full_audio[-keep_samples:]]
            else:
                audio_buffer = []
            # The trimmed buffer now starts at the first unprocessed sample, so
            # the next call must resume from relative time 0; carrying over the
            # absolute current_time would point past the end of the new buffer
            last_processed_time = 0

            print(f"New state: {new_state}")
            # Return the last chunk's transcription for the streaming textbox
            return new_state, transcription, audio_buffer, last_processed_time
        except Exception as e:
            print(f"Error processing audio: {e}")
            return state, state, audio_buffer, last_processed_time

    print(f"Invalid audio input format: {type(audio)}")
    return state, state, audio_buffer, last_processed_time

@spaces.GPU(duration=120)
def transcribe_file(audio_file, model_name="nvidia/parakeet-tdt-0.6b-v2"):
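    """Transcribe a recorded or uploaded audio file in a single pass."""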
    global model

    # Check that an audio file was provided
    if audio_file is None:
        return "No audio file provided. Please upload an audio file."

    try:
        # Load the model inside the GPU worker process
        model = load_model(model_name)
        print(f"Processing file: {audio_file}")

        # Transcribe the entire file at once
        hypothesis = model.transcribe([audio_file])[0]
        transcription = hypothesis.text
        print(f"File transcription: {transcription}")
        return transcription
    except Exception as e:
        print(f"Error transcribing file: {e}")
        return f"Error transcribing file: {str(e)}"

# Define the Gradio interface
with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
    gr.Markdown("# 🎙️ Real-time Speech-to-Text Transcription")
    gr.Markdown("Powered by NVIDIA NeMo")

    # Model selection and loading
    with gr.Row():
        with gr.Column(scale=3):
            model_dropdown = gr.Dropdown(
                choices=available_models,
                value=current_model_name,
                label="Select ASR Model"
            )
        with gr.Column(scale=1):
            load_button = gr.Button("Load Selected Model", elem_id="load-button", elem_classes=["btn-blue"])

    # Status indicator for model loading
    model_status = gr.Textbox(
        value=f"Current model: {current_model_name}",
        label="Model Status",
        container=False
    )
    # Create tabs for real-time and file-based transcription
    with gr.Tabs():
        # File-based transcription tab
        with gr.TabItem("File Transcription"):
            with gr.Row():
                with gr.Column(scale=2):
                    # Audio recorder that saves to a file; "upload" is included
                    # so the label and instructions below hold true
                    audio_recorder = gr.Audio(
                        sources=["microphone", "upload"],
                        type="filepath",
                        label="Record or upload audio file"
                    )
                    with gr.Row():
                        transcribe_btn = gr.Button("Transcribe Audio File", variant="primary")
                        clear_file_btn = gr.Button("Clear Transcript", variant="secondary")
                with gr.Column(scale=3):
                    file_transcription = gr.Textbox(
                        label="File Transcription",
                        placeholder="Transcription will appear here after clicking 'Transcribe Audio File'",
                        lines=10
                    )

        # Real-time transcription tab
        with gr.TabItem("Real-time Transcription"):
            with gr.Row():
                with gr.Column(scale=2):
                    audio_input = gr.Audio(
                        sources=["microphone"],
                        type="numpy",
                        streaming=True,
                        label="Speak into your microphone",
                        waveform_options=gr.WaveformOptions(
                            sample_rate=16000
                        )
                    )
                    clear_btn = gr.Button("Clear Transcript", variant="secondary")
                with gr.Column(scale=3):
                    text_output = gr.Textbox(
                        label="Transcription",
                        placeholder="Your speech will appear here...",
                        lines=10
                    )
                    streaming_text = gr.Textbox(
                        label="Real-time Transcription",
                        placeholder="Real-time results will appear here...",
                        lines=2
                    )
    # State to store the ongoing transcription
    state = gr.State("")
    audio_buffer = gr.State(value=None)
    last_processed_time = gr.State(value=0)
    # Function to handle model selection
    def update_model(model_name):
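        """Switch the active model, loading it eagerly when possible."""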
        global current_model_name, model
        current_model_name = model_name
        # Load the model immediately if we're in a GPU context
        try:
            # This will load the model in the GPU worker
            model = load_model(model_name)
            status_message = f"Current model: {model_name} (loaded)"
            print(f"Model {model_name} loaded successfully")
        except Exception as e:
            status_message = f"Current model: {model_name} (will be loaded on first use)"
            print(f"Model will be loaded on first use: {e}")
        # Also reset the audio buffer and last processed time
        return status_message, None, 0
    # Load model button event
    load_button.click(
        fn=update_model,
        inputs=[model_dropdown],
        outputs=[model_status, audio_buffer, last_processed_time]
    )
    # Handle the audio stream for real-time transcription. The second output
    # goes straight to the streaming_text textbox; re-binding streaming_text
    # to a gr.State here would shadow the textbox and leave it blank.
    audio_input.stream(
        fn=transcribe,
        inputs=[audio_input, model_dropdown, state, audio_buffer, last_processed_time],
        outputs=[state, streaming_text, audio_buffer, last_processed_time],
    )
    # Handle file transcription
    transcribe_btn.click(
        fn=transcribe_file,
        inputs=[audio_recorder, model_dropdown],
        outputs=[file_transcription]
    )

    # Clear the real-time transcription
    def clear_transcription():
        print("Clearing real-time transcription")
        # Empty values for state, text_output, audio_buffer, and last_processed_time
        return "", "", None, 0

    # Clear the file transcription
    def clear_file_transcription():
        print("Clearing file transcription")
        return ""
    # Set up clear button event handlers
    clear_btn.click(
        fn=clear_transcription,
        inputs=[],
        outputs=[state, text_output, audio_buffer, last_processed_time]
    )
    # Also clear streaming_text when clearing the transcription
    clear_btn.click(
        fn=lambda: "",
        inputs=[],
        outputs=[streaming_text]
    )
    clear_file_btn.click(
        fn=clear_file_transcription,
        inputs=[],
        outputs=[file_transcription]
    )

    # Update the main text output when the state changes
    def update_output(transcript):
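        """Mirror the full transcript and a short rolling tail of it."""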
        # For the streaming textbox, show just the last ~15 words
        words = transcript.split()
        if len(words) > 15:
            recent_text = " ".join(words[-15:])
        else:
            recent_text = transcript
        return transcript, recent_text
    state.change(
        fn=update_output,
        inputs=[state],
        outputs=[text_output, streaming_text]
    )
gr.Markdown("## ๐ Instructions")
gr.Markdown("""
### Real-time Transcription:
1. Select an ASR model from the dropdown menu
2. Click 'Load Selected Model' to load the model
3. Click the microphone button to start recording
4. Speak clearly into your microphone
5. The transcription will appear in real-time
6. Click 'Clear Transcript' to reset the transcription
### File Transcription:
1. Select an ASR model from the dropdown menu
2. Click 'Load Selected Model' to load the model
3. Switch to the 'File Transcription' tab
4. Record audio by clicking the microphone button or upload an existing audio file
5. Click 'Transcribe Audio File' to process the recording
6. The complete transcription will appear in the text box
7. Click 'Clear Transcript' to reset the file transcription
""")
# Launch the app
if __name__ == "__main__":
    demo.launch()