Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,689 Bytes
18b21ee e888ead 18b21ee f334b99 e888ead 18b21ee e888ead f334b99 18b21ee f334b99 18b21ee 779d79b 18b21ee 779d79b 18b21ee 779d79b fe027e3 779d79b 18b21ee 40dfec3 18b21ee 8c34a9f 18b21ee 779d79b 18b21ee e888ead 18b21ee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
import os
import gradio as gr
import torch
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf
import time
import spaces
# Report whether a CUDA device is visible to PyTorch, and which one.
_cuda_visible = torch.cuda.is_available()
print(f"CUDA available: {_cuda_visible}")
if _cuda_visible:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

# Load the pretrained Parakeet TDT model by name; `model` is the module-level
# object that transcribe() moves between devices and calls.
model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    "nvidia/parakeet-tdt-0.6b-v2"
)
print(f"Model loaded on device: {model.device}")
import numpy as np
import soundfile as sf
# Seconds of audio to accumulate before each transcription pass.
BUFFER_SECONDS = 5
# Sample rate assumed when a chunk arrives as a bare array with no rate attached.
DEFAULT_SAMPLE_RATE = 16000

# Pending mono float32 chunks that have not been transcribed yet.
audio_buffer = []
# Sample rate of the chunks currently held in ``audio_buffer``.
_buffer_sample_rate = DEFAULT_SAMPLE_RATE


def _to_mono_float32(samples):
    """Return ``samples`` as a 1-D float32 array, scaling integer PCM to [-1, 1)."""
    samples = np.atleast_1d(np.asarray(samples))
    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM has to be normalised before channel averaging: float
        # data written to a WAV file is expected to lie in [-1, 1], so raw
        # int16-range floats would clip.
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        samples = samples.mean(axis=1)  # (frames, channels) -> mono
    return samples


@spaces.GPU(duration=120)
def transcribe(audio, state=""):
    """Buffer a streamed audio chunk and transcribe once enough has accumulated.

    Args:
        audio: Either a ``(sample_rate, samples)`` tuple, which is how a
            Gradio ``Audio`` component with ``type="numpy"`` delivers data,
            or a bare NumPy array, which is assumed to be sampled at
            ``DEFAULT_SAMPLE_RATE``. Anything else is skipped.
        state: Transcript accumulated so far.

    Returns:
        A ``(state, text)`` pair holding the same updated transcript twice,
        one for the ``gr.State`` and one for the streaming textbox. The
        transcript is returned unchanged while the buffer holds less than
        ``BUFFER_SECONDS`` of audio or when nothing was recognised.
    """
    # Function-scope import so this block does not depend on the file header.
    import tempfile

    global model, audio_buffer, _buffer_sample_rate

    if isinstance(audio, tuple) and len(audio) == 2:
        sample_rate, samples = audio
    elif isinstance(audio, np.ndarray):
        sample_rate, samples = DEFAULT_SAMPLE_RATE, audio
    else:
        print(f"Skipping invalid audio input: {type(audio)}")
        return state, state

    if samples is None or np.size(samples) == 0:
        return state, state

    sample_rate = int(sample_rate)
    if sample_rate != _buffer_sample_rate:
        # Chunks recorded at different rates cannot be joined; restart the
        # buffer at the new rate.
        audio_buffer = []
        _buffer_sample_rate = sample_rate

    audio_buffer.append(_to_mono_float32(samples))

    # Count samples without concatenating, so each call stays cheap.
    buffered_samples = sum(len(chunk) for chunk in audio_buffer)
    if buffered_samples < BUFFER_SECONDS * sample_rate:
        return state, state

    # Take the pending audio and reset the buffer up front, so a failed
    # transcription does not leave stale audio behind.
    audio_data = np.concatenate(audio_buffer)
    audio_buffer = []

    # A unique temp file avoids collisions between concurrent sessions.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # Write at the real capture rate instead of a hard-coded 16 kHz.
        sf.write(wav_path, audio_data, samplerate=sample_rate)
        if torch.cuda.is_available():
            model = model.cuda()
        try:
            result = model.transcribe([wav_path])[0]
        finally:
            # Move the model back to the CPU after every pass, as before,
            # even when transcription raised.
            model = model.cpu()
    finally:
        os.remove(wav_path)

    # Depending on the NeMo version, the entry is a plain string, a
    # hypothesis object exposing ``.text``, or a list of hypotheses.
    if isinstance(result, (list, tuple)) and result:
        result = result[0]
    text = str(getattr(result, "text", result)).strip()
    if not text:
        return state, state

    new_state = f"{state} {text}" if state else text
    return new_state, new_state
# Build the Gradio UI: a streaming microphone input on the left, the full
# transcript and a live textbox on the right, all fed by transcribe().
with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
    # The emoji is written as escapes because the literal bytes were
    # mis-encoded; escapes survive non-UTF-8 tooling.
    # \U0001F399\uFE0F is the studio-microphone emoji.
    gr.Markdown("# \U0001F399\uFE0F Real-time Speech-to-Text Transcription")
    gr.Markdown("Powered by NVIDIA NeMo and the parakeet-tdt-0.6b-v2 model")

    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                streaming=True,
                label="Speak into your microphone"
            )
            clear_btn = gr.Button("Clear Transcript")
        with gr.Column(scale=3):
            text_output = gr.Textbox(
                label="Transcription",
                placeholder="Your speech will appear here...",
                lines=10
            )
            streaming_text = gr.Textbox(
                label="Real-time Transcription",
                placeholder="Real-time results will appear here...",
                lines=2
            )

    # State to store the ongoing transcription
    state = gr.State("")

    # Handle the audio stream: each chunk goes to transcribe() together with
    # the transcript so far; the result updates the state and the live box.
    audio_input.stream(
        fn=transcribe,
        inputs=[audio_input, state],
        outputs=[state, streaming_text],
    )

    def clear_transcription():
        """Return empty values for the transcript box, the live box and the state."""
        return "", "", ""

    # Clear the transcription
    clear_btn.click(
        fn=clear_transcription,
        inputs=[],
        outputs=[text_output, streaming_text, state]
    )

    # Update the main text output when the state changes
    state.change(
        fn=lambda s: s,
        inputs=[state],
        outputs=[text_output]
    )

    # NOTE(review): the original glyph here was unrecoverable from the
    # mis-encoded bytes; the memo emoji (\U0001F4DD) is a stand-in, confirm.
    gr.Markdown("## \U0001F4DD Instructions")
    gr.Markdown(
        "\n"
        "1. Click the microphone button to start recording\n"
        "2. Speak clearly into your microphone\n"
        "3. The transcription will appear in real-time\n"
        "4. Click 'Clear Transcript' to start a new transcription\n"
    )

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|