Spaces: Running on Zero
File size: 4,242 Bytes
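# Gradio Space that streams microphone audio, buffers roughly five seconds of
# samples, and transcribes them with NVIDIA NeMo's parakeet-tdt-0.6b-v2 model.
# GPU time is requested per call via the ZeroGPU @spaces.GPU decorator.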
import os
import time

import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf
# Check if CUDA is available
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

# Load the pretrained parakeet-tdt-0.6b-v2 ASR model from NVIDIA NeMo
model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")
print(f"Model loaded on device: {model.device}")
# Buffer of streamed audio chunks awaiting transcription
audio_buffer = []
# Request a ZeroGPU worker for up to 120 seconds per call
@spaces.GPU(duration=120)
def transcribe(audio, state=""):
    global model, audio_buffer

    if audio is None or isinstance(audio, int):
        print(f"Skipping invalid audio input: {type(audio)}")
        return state, state

    print(f"Received audio input of type: {type(audio)}")
    print(f"Audio shape: {audio.shape if isinstance(audio, np.ndarray) else 'N/A'}")

    # Gradio streams numpy audio as a (sample_rate, data) tuple; keep only the samples
    if isinstance(audio, tuple):
        print(f"Tuple contents: {audio}")
        audio = audio[1] if len(audio) > 1 else None
        if audio is None:
            print("Empty tuple, skipping")
            return state, state

    # Append the NumPy array to the buffer
    if isinstance(audio, np.ndarray):
        audio_buffer.append(audio)

    # Process once the buffer holds enough data (5 seconds at the assumed 16 kHz rate)
    if audio_buffer and len(np.concatenate(audio_buffer)) >= 5 * 16000:
        # Concatenate buffered chunks and downmix to mono
        audio_data = np.concatenate(audio_buffer)
        audio_data = audio_data.mean(axis=1) if audio_data.ndim > 1 else audio_data

        # Write a temporary WAV file; 16 kHz is assumed here, so audio captured at
        # another sample rate should be resampled first
        temp_file = "temp_audio.wav"
        sf.write(temp_file, audio_data, samplerate=16000)

        print("Transcribing audio...")
        # Transcribe on GPU when available, then move the model back to CPU
        if torch.cuda.is_available():
            model = model.cuda()
        transcription = model.transcribe([temp_file])[0]
        print(f"Transcription: {transcription}")
        model = model.cpu()

        os.remove(temp_file)
        print("Temporary file removed.")

        # Clear the buffer and append the new text to the running transcript
        audio_buffer = []
        new_state = state + " " + transcription if state else transcription
        return new_state, new_state

    return state, state
# Define the Gradio interface
with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
    gr.Markdown("# Real-time Speech-to-Text Transcription")
    gr.Markdown("Powered by NVIDIA NeMo and the parakeet-tdt-0.6b-v2 model")

    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                streaming=True,
                label="Speak into your microphone"
            )
            clear_btn = gr.Button("Clear Transcript")
        with gr.Column(scale=3):
            text_output = gr.Textbox(
                label="Transcription",
                placeholder="Your speech will appear here...",
                lines=10
            )
            streaming_text = gr.Textbox(
                label="Real-time Transcription",
                placeholder="Real-time results will appear here...",
                lines=2
            )

    # State to store the ongoing transcription
    state = gr.State("")

    # Handle the audio stream
    audio_input.stream(
        fn=transcribe,
        inputs=[audio_input, state],
        outputs=[state, streaming_text],
    )

    # Clear the transcription
    def clear_transcription():
        return "", "", ""

    clear_btn.click(
        fn=clear_transcription,
        inputs=[],
        outputs=[text_output, streaming_text, state]
    )

    # Update the main text output when the state changes
    state.change(
        fn=lambda s: s,
        inputs=[state],
        outputs=[text_output]
    )

    gr.Markdown("## Instructions")
    gr.Markdown("""
    1. Click the microphone button to start recording
    2. Speak clearly into your microphone
    3. The transcription will appear in real-time
    4. Click 'Clear Transcript' to start a new transcription
    """)
# Launch the app
if __name__ == "__main__":
    demo.launch()
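# Packages imported above (for the Space's requirements.txt; exact versions not shown here):
# gradio, spaces, torch, nemo_toolkit[asr], omegaconf, numpy, soundfile.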