Spaces:

GavinHuang
/

asr-demo

Running on Zero

App Files Files Community

asr-demo / app.py

GavinHuang

refactor model loading and reintroduce GPU decorator for transcription function

f334b99 10 days ago

raw

history blame

3.29 kB

	import os
	import gradio as gr
	import torch
	import nemo.collections.asr as nemo_asr
	from omegaconf import OmegaConf
	import time
	import spaces

	# Report the CUDA environment once at startup.
	cuda_ready = torch.cuda.is_available()
	print(f"CUDA available: {cuda_ready}")
	if cuda_ready:
	    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

	# Load the pretrained ASR checkpoint; the transcription handler uses this
	# module-level object.
	model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
	    "nvidia/parakeet-tdt-0.6b-v2"
	)

	print(f"Model loaded on device: {model.device}")

	# Sample rate the waveform is resampled to before transcription. The original
	# code assumed 16 kHz input -- confirm against the model's documentation.
	_TARGET_SAMPLE_RATE = 16000


	def _to_mono_float32(audio, target_rate=_TARGET_SAMPLE_RATE):
	    """Convert a ``(sample_rate, samples)`` chunk into a 1-D float waveform.

	    Args:
	        audio: Tuple of the capture sample rate and an array of samples
	            shaped ``(n,)`` or ``(n, channels)``.
	        target_rate: Sample rate of the returned waveform.

	    Returns:
	        A 1-D ``float32`` NumPy array. Integer PCM input is scaled to
	        roughly ``[-1, 1]``; multi-channel input is averaged to mono.
	    """
	    # Local import: the file's import block does not bring numpy into scope.
	    import numpy as np

	    sample_rate, samples = audio
	    samples = np.asarray(samples)
	    if np.issubdtype(samples.dtype, np.integer):
	        # Integer PCM (e.g. int16) -> floats in [-1, 1].
	        waveform = samples.astype(np.float32) / float(np.iinfo(samples.dtype).max)
	    else:
	        waveform = samples.astype(np.float32)
	    if waveform.ndim > 1:
	        waveform = waveform.mean(axis=1)
	    if sample_rate != target_rate and waveform.size > 1:
	        # NOTE: plain linear interpolation, no anti-alias filter. Good enough
	        # to put speech on the expected time base; swap in a proper resampler
	        # if quality matters.
	        n_out = max(1, int(round(waveform.size * target_rate / sample_rate)))
	        src_positions = np.arange(n_out) * (sample_rate / target_rate)
	        waveform = np.interp(
	            src_positions, np.arange(waveform.size), waveform
	        ).astype(np.float32)
	    return waveform


	@spaces.GPU(duration=120)  # Raise duration if one call needs more than 120s of GPU time
	def transcribe(audio, state=""):
	    """Transcribe one audio chunk and append it to the running transcript.

	    Args:
	        audio: ``None``, a ``(sample_rate, samples)`` tuple (converted to a
	            mono waveform at ``_TARGET_SAMPLE_RATE`` first), or any other
	            input accepted by ``model.transcribe``, passed through unchanged.
	        state: Transcript accumulated by earlier calls.

	    Returns:
	        ``(new_state, new_state)``: the updated transcript, repeated so the
	        caller can feed two outputs. ``state`` is returned unchanged when
	        there is no audio or no text was recognised.
	    """
	    # Skip processing if no audio is provided
	    if audio is None:
	        return state, state

	    if isinstance(audio, tuple):
	        audio = _to_mono_float32(audio)
	        if audio.size == 0:
	            return state, state

	    if torch.cuda.is_available():
	        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
	        # Do not rebind ``model`` here: an assignment would make the name
	        # local to this function and raise UnboundLocalError on first use.
	        # ``.cuda()`` is called for its in-place effect, like ``.cpu()`` below.
	        model.cuda()

	    try:
	        with torch.no_grad():
	            result = model.transcribe([audio])[0]
	    finally:
	        # Always hand the model back to the CPU, even if inference failed.
	        model.cpu()

	    # The result may be a plain string or an object exposing ``.text``
	    # (presumably a NeMo Hypothesis -- confirm for the installed version).
	    text = str(getattr(result, "text", result)).strip()
	    if not text:
	        # Nothing recognised: avoid piling up trailing spaces in the state.
	        return state, state

	    new_state = f"{state} {text}" if state else text
	    return new_state, new_state

	# Define the Gradio interface: a microphone input and clear button, two
	# transcript text boxes, and the event wiring between them.
	with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
	gr.Markdown("# 🎙️ Real-time Speech-to-Text Transcription")
	gr.Markdown("Powered by NVIDIA NeMo and the parakeet-tdt-0.6b-v2 model")
	with gr.Row():
	with gr.Column(scale=2):
	# Streaming microphone input. With type="numpy" the handler presumably
	# receives a (sample_rate, samples) tuple -- confirm against Gradio's docs.
	audio_input = gr.Audio(
	sources=["microphone"],
	type="numpy",
	streaming=True,
	label="Speak into your microphone"
	)

	clear_btn = gr.Button("Clear Transcript")

	with gr.Column(scale=3):
	# Full transcript; refreshed by the state.change handler below.
	text_output = gr.Textbox(
	label="Transcription",
	placeholder="Your speech will appear here...",
	lines=10
	)
	# Receives the second return value of transcribe() on every stream event.
	streaming_text = gr.Textbox(
	label="Real-time Transcription",
	placeholder="Real-time results will appear here...",
	lines=2
	)

	# State holding the transcript accumulated so far; starts as an empty string
	state = gr.State("")

	# Handle the audio stream: each event calls transcribe(audio, state) and
	# writes its two return values to state and streaming_text.
	audio_input.stream(
	fn=transcribe,
	inputs=[audio_input, state],
	outputs=[state, streaming_text],
	)

	# Clear the transcription: one empty string per output of the click
	# handler below (text_output, streaming_text, state).
	def clear_transcription():
	return "", "", ""

	clear_btn.click(
	fn=clear_transcription,
	inputs=[],
	outputs=[text_output, streaming_text, state]
	)

	# Copy the state value into the main text output whenever the state changes
	state.change(
	fn=lambda s: s,
	inputs=[state],
	outputs=[text_output]
	)

	gr.Markdown("## 📝 Instructions")
	gr.Markdown("""
	1. Click the microphone button to start recording
	2. Speak clearly into your microphone
	3. The transcription will appear in real-time
	4. Click 'Clear Transcript' to start a new transcription
	""")

	# Launch the app only when this file is executed as a script, not on import
	if __name__ == "__main__":
	demo.launch()