# Hugging Face Spaces page banner captured alongside this file: "Spaces: Running on Zero".
import os | |
import gradio as gr | |
import torch | |
import nemo.collections.asr as nemo_asr | |
from omegaconf import OmegaConf | |
import time | |
import spaces | |
# Report which accelerator (if any) PyTorch can see at start-up.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

# Load the pretrained ASR checkpoint once at import time so that every
# request handled by the app reuses the same model instance.
model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    model_name="nvidia/parakeet-tdt-0.6b-v2"
)
print(f"Model loaded on device: {model.device}")
# Increase duration if inference takes >60s
# NOTE(review): the comment above reads as if it belonged to a
# `@spaces.GPU(duration=...)` decorator that is not present here; on ZeroGPU
# hardware this function probably needs one - confirm against the deployment.
_TARGET_SAMPLE_RATE = 16000  # rate fed to the ASR model; TODO confirm against the model config


def _to_model_waveform(audio, target_rate=_TARGET_SAMPLE_RATE):
    """Turn a Gradio ``type="numpy"`` chunk into a mono float32 1-D tensor at *target_rate*.

    Args:
        audio: ``(sample_rate, samples)`` pair. ``samples`` is assumed to be
            shaped ``(n,)`` or ``(n, channels)`` as Gradio documents it -
            TODO confirm for the Gradio version in use.
        target_rate: Sample rate, in Hz, of the returned waveform.

    Returns:
        A 1-D ``torch.float32`` tensor (possibly empty).
    """
    sample_rate, samples = audio
    waveform = torch.as_tensor(samples)

    if waveform.is_floating_point():
        waveform = waveform.to(torch.float32)
    else:
        # Integer PCM (Gradio normally sends int16): scale into [-1, 1].
        full_scale = float(torch.iinfo(waveform.dtype).max)
        waveform = waveform.to(torch.float32) / full_scale

    if waveform.ndim > 1:
        # Down-mix multi-channel audio by averaging over the channel axis.
        waveform = waveform.mean(dim=1)

    if sample_rate != target_rate and waveform.numel() > 0:
        # Linear interpolation keeps the block free of extra dependencies.
        # It does no anti-alias filtering, which is acceptable for speech
        # but not a high-quality resampler.
        new_length = max(1, round(waveform.numel() * target_rate / sample_rate))
        waveform = torch.nn.functional.interpolate(
            waveform[None, None, :],
            size=new_length,
            mode="linear",
            align_corners=False,
        )[0, 0]

    return waveform


def transcribe(audio, state=""):
    """Transcribe one streamed audio chunk and append it to the transcript.

    Args:
        audio: ``None`` when no audio was delivered, otherwise the value
            produced by the Gradio audio component - a
            ``(sample_rate, samples)`` pair for ``type="numpy"``. Any other
            value (e.g. a file path) is handed to the model unchanged.
        state: Transcript accumulated so far.

    Returns:
        ``(new_state, new_state)``: the updated transcript twice, once for the
        state holder and once for the text box wired to the stream event.
        When there is nothing to transcribe, *state* is returned unchanged.
    """
    # Skip processing if no audio is provided.
    if audio is None:
        return state, state

    if isinstance(audio, (tuple, list)):
        # An empty chunk carries nothing to decode; bail out before touching
        # the model at all.
        if len(audio[1]) == 0:
            return state, state
        model_input = _to_model_waveform(audio).numpy()
    else:
        model_input = audio

    if torch.cuda.is_available():
        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
        # nn.Module.cuda() moves the parameters in place. Rebinding the name
        # (`model = model.cuda()`) would make `model` local to this function
        # and raise UnboundLocalError on every call.
        model.cuda()

    try:
        with torch.no_grad():
            result = model.transcribe([model_input])[0]
    finally:
        # Always hand the model back to the CPU, even if inference failed.
        model.cpu()

    # NOTE(review): the return shape of transcribe() differs between NeMo
    # releases - plain strings, Hypothesis objects exposing `.text`, or a
    # (best, all) pair of lists for RNNT models. Handle all three; confirm
    # against the pinned NeMo version.
    if isinstance(result, (list, tuple)):
        result = result[0] if result else ""
    text = str(getattr(result, "text", result)).strip()

    # Do not grow the transcript with bare separators for silent chunks.
    if not text:
        return state, state

    new_state = f"{state} {text}" if state else text
    return new_state, new_state
# Define the Gradio interface: a microphone stream on the left, the running
# transcript on the right.
# NOTE(review): the nesting of components below was reconstructed from
# statement order because the original indentation was lost - confirm the
# intended layout.
with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
    # The heading emoji had been mis-decoded ("ποΈ"); restored to the
    # microphone glyph its UTF-8 bytes correspond to.
    gr.Markdown("# 🎙️ Real-time Speech-to-Text Transcription")
    gr.Markdown("Powered by NVIDIA NeMo and the parakeet-tdt-0.6b-v2 model")

    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                streaming=True,
                label="Speak into your microphone",
            )
            clear_btn = gr.Button("Clear Transcript")
        with gr.Column(scale=3):
            text_output = gr.Textbox(
                label="Transcription",
                placeholder="Your speech will appear here...",
                lines=10,
            )
            streaming_text = gr.Textbox(
                label="Real-time Transcription",
                placeholder="Real-time results will appear here...",
                lines=2,
            )

    # State to store the ongoing transcription between stream events.
    state = gr.State("")

    # Handle the audio stream: each chunk goes through `transcribe`, which
    # returns the updated transcript for both the state and the live text box.
    audio_input.stream(
        fn=transcribe,
        inputs=[audio_input, state],
        outputs=[state, streaming_text],
    )

    def clear_transcription():
        """Return empty values for the two text boxes and the state."""
        return "", "", ""

    # Clear the transcription everywhere it is shown or stored.
    clear_btn.click(
        fn=clear_transcription,
        inputs=[],
        outputs=[text_output, streaming_text, state],
    )

    # Update the main text output when the state changes.
    state.change(
        fn=lambda s: s,
        inputs=[state],
        outputs=[text_output],
    )

    # NOTE(review): this heading's emoji was also mis-decoded (only "π"
    # survived) and the original glyph cannot be recovered; 📝 is a stand-in -
    # replace it if the intended emoji is known.
    gr.Markdown("## 📝 Instructions")
    gr.Markdown(
        "\n"
        "1. Click the microphone button to start recording\n"
        "2. Speak clearly into your microphone\n"
        "3. The transcription will appear in real-time\n"
        "4. Click 'Clear Transcript' to start a new transcription\n"
    )
# Script entry point: start the Gradio server only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()