import os
import gradio as gr
import torch
import nemo.collections.asr as nemo_asr
import spaces
import librosa

# Important: Don't initialize CUDA in the main process for Spaces
# The model will be loaded in the worker process through the GPU decorator
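# On ZeroGPU Spaces, a function wrapped with @spaces.GPU runs in a separate
# process that has GPU access; the decorator's duration argument is the
# requested per-call GPU allocation in seconds.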
model = None
current_model_name = "nvidia/parakeet-tdt-0.6b-v2"

# Available models
available_models = ["nvidia/parakeet-tdt-0.6b-v2"]
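# Other NeMo RNNT/TDT checkpoints (e.g. "nvidia/parakeet-rnnt-1.1b") could
# presumably be listed here as well, provided they load through
# EncDecRNNTBPEModel.from_pretrained below; this is untested.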

def load_model(model_name=None):
    # This function will be called in the GPU worker process
    global model, current_model_name
    
    # Use the specified model name or the current one
    model_name = model_name or current_model_name
    
    # Check if we need to load a new model
    if model is None or model_name != current_model_name:
        print(f"Loading model {model_name} in worker process")
        print(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"CUDA device: {torch.cuda.get_device_name(0)}")
        
        # Update the current model name
        current_model_name = model_name
        
        # Load the selected model
        model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_name)
        print(f"Model loaded on device: {model.device}")
    
    return model
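
# A quick smoke test of load_model outside the UI (hypothetical file name;
# assumes a 16 kHz mono WAV, matching what transcribe() writes below):
#   asr = load_model()
#   print(asr.transcribe(["sample.wav"])[0].text)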

@spaces.GPU(duration=120)
def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_buffer=None, last_processed_time=0):
    # Load the model inside the GPU worker process
    import numpy as np
    import soundfile as sf
    import librosa
    import os
    import tempfile
    model = load_model(model_name)
    if audio_buffer is None:
        audio_buffer = []
    
    if audio is None or isinstance(audio, int):
        print(f"Skipping invalid audio input: {type(audio)}")
        return state, state, audio_buffer, last_processed_time
    
    print(f"Received audio input of type: {type(audio)}")
    
    if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[1], np.ndarray):
        sample_rate, audio_data = audio
        print(f"Sample rate: {sample_rate}, Audio shape: {audio_data.shape}")
        
        # Downmix to mono and scale to float32 in [-1, 1] (the numpy
        # microphone stream delivers int16 PCM)
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)
        if np.issubdtype(audio_data.dtype, np.integer):
            audio_data = audio_data.astype(np.float32) / 32768.0
        
        # Resample to 16 kHz before buffering so the buffer never mixes
        # sample rates across calls
        if sample_rate != 16000:
            print(f"Resampling from {sample_rate}Hz to 16000Hz")
            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000
        
        # Append chunk to buffer
        audio_buffer.append(audio_data)
        
        # Calculate total duration in seconds
        total_samples = sum(arr.shape[0] for arr in audio_buffer)
        total_duration = total_samples / sample_rate
        print(f"Total buffered duration: {total_duration:.2f}s")
        
        # Process 3-second chunks with a 1-second step size (2-second overlap)
        chunk_duration = 3.0  # seconds
        step_size = 1.0       # seconds
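        # With these settings the windows are [0, 3), [1, 4), [2, 5), ...:
        # each 1-second step re-transcribes 2 seconds of prior audio,
        # trading redundant compute for smoother chunk boundaries.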
        
        if total_duration < chunk_duration:
            print(f"Buffering audio, total duration: {total_duration:.2f}s")
            return state, state, audio_buffer, last_processed_time
        
        try:
            # Concatenate buffered chunks (already float32 at 16 kHz)
            full_audio = np.concatenate(audio_buffer)
            
            # Process 3-second chunks
            new_state = state
            current_time = last_processed_time
            total_samples_16k = len(full_audio)
            
            while current_time + chunk_duration <= total_duration:
                start_sample = int(current_time * sample_rate)
                end_sample = int((current_time + chunk_duration) * sample_rate)
                if end_sample > total_samples_16k:
                    break
                
                chunk = full_audio[start_sample:end_sample]
                print(f"Processing chunk from {current_time:.2f}s to {current_time + chunk_duration:.2f}s")
                
                # Write the chunk to a unique temporary WAV file so that
                # concurrent sessions don't clobber each other
                fd, temp_file = tempfile.mkstemp(suffix=".wav")
                os.close(fd)
                sf.write(temp_file, chunk, samplerate=16000)
                
                # Transcribe
                hypothesis = model.transcribe([temp_file])[0]
                transcription = hypothesis.text
                print(f"Transcription: {transcription}")
                
                os.remove(temp_file)
                print("Temporary file removed.")
                
                # Append transcription if non-empty
                if transcription.strip():
                    new_state = new_state + " " + transcription if new_state else transcription
                
                current_time += step_size
            
            # Trim buffer to keep only unprocessed audio
            keep_samples = int((total_duration - current_time) * sample_rate)
            if keep_samples > 0:
                audio_buffer = [full_audio[-keep_samples:]]
            else:
                audio_buffer = []
            
            # The trimmed buffer starts at time zero again, so reset the
            # processing offset rather than carrying the absolute time over
            last_processed_time = 0
            
            print(f"New state: {new_state}")
            return new_state, new_state, audio_buffer, last_processed_time
        
        except Exception as e:
            print(f"Error processing audio: {e}")
            return state, state, audio_buffer, last_processed_time
    
    print(f"Invalid audio input format: {type(audio)}")
    return state, state, audio_buffer, last_processed_time

# Define the Gradio interface
with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
    gr.Markdown("# ๐ŸŽ™๏ธ Real-time Speech-to-Text Transcription")
    gr.Markdown("Powered by NVIDIA NeMo")
    
    # Model selection and loading
    with gr.Row():
        with gr.Column(scale=3):
            model_dropdown = gr.Dropdown(
                choices=available_models, 
                value=current_model_name,
                label="Select ASR Model"
            )
        with gr.Column(scale=1):
            load_button = gr.Button("Load Selected Model")
    
    # Status indicator for model loading
    model_status = gr.Textbox(value=f"Current model: {current_model_name}", label="Model Status")
    
    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(
                sources=["microphone"], 
                type="numpy", 
                streaming=True,
                label="Speak into your microphone"
            )
            
            clear_btn = gr.Button("Clear Transcript")
            
        with gr.Column(scale=3):
            text_output = gr.Textbox(
                label="Transcription", 
                placeholder="Your speech will appear here...",
                lines=10
            )
            streaming_text = gr.Textbox(
                label="Real-time Transcription", 
                placeholder="Real-time results will appear here...",
                lines=2
            )
    # State to store the ongoing transcription
    state = gr.State("")
    audio_buffer = gr.State(value=None)
    last_processed_time = gr.State(value=0)
    
    # Function to handle model selection
    def update_model(model_name):
        global current_model_name
        current_model_name = model_name
        return f"Current model: {model_name}", None, 0  # Reset audio buffer and last processed time
    
    # Load model button event
    load_button.click(
        fn=update_model,
        inputs=[model_dropdown],
        outputs=[model_status, audio_buffer, last_processed_time]
    )
    
    # Handle the audio stream
    audio_input.stream(
        fn=transcribe,
        inputs=[audio_input, model_dropdown, state, audio_buffer, last_processed_time],
        outputs=[state, streaming_text, audio_buffer, last_processed_time],
    )
    
    # Clear the transcription
    def clear_transcription():
        # Reset the accumulated transcript state along with the text boxes
        return "", "", "", None, 0

    clear_btn.click(
        fn=clear_transcription,
        inputs=[],
        outputs=[state, text_output, streaming_text, audio_buffer, last_processed_time]
    )
    
    # Update the main text output when the state changes
    state.change(
        fn=lambda s: s,
        inputs=[state],
        outputs=[text_output]
    )
    gr.Markdown("## ๐Ÿ“ Instructions")
    gr.Markdown("""
    1. Select an ASR model from the dropdown menu
    2. Click 'Load Selected Model' to load the model
    3. Click the microphone button to start recording
    4. Speak clearly into your microphone
    5. The transcription will appear in real-time
    6. Click 'Clear Transcript' to start a new transcription
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch()
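
# Expected dependencies (a sketch, not a pinned list):
#   gradio, spaces, torch, librosa, soundfile, numpy, nemo_toolkit[asr]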