File size: 3,689 Bytes
18b21ee
e888ead
 
18b21ee
 
 
f334b99
e888ead
18b21ee
 
 
 
e888ead
f334b99
18b21ee
f334b99
18b21ee
779d79b
 
 
 
 
18b21ee
779d79b
 
 
18b21ee
 
779d79b
 
 
 
 
 
 
 
 
 
fe027e3
779d79b
 
 
 
 
 
 
 
 
 
 
 
18b21ee
 
 
 
 
40dfec3
18b21ee
 
8c34a9f
18b21ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
779d79b
18b21ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e888ead
18b21ee
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import gradio as gr
import torch
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf
import time
import spaces

# Log GPU visibility once at import time.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

# Load the pretrained Parakeet checkpoint by name and report where it lives.
model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    "nvidia/parakeet-tdt-0.6b-v2"
)

print(f"Model loaded on device: {model.device}")

import numpy as np
import soundfile as sf

# Streamed audio chunks accumulate here; transcribe() appends to it and
# resets it after each transcription.
audio_buffer = []

def _to_mono_float32(samples):
    """Return *samples* as a 1-D float32 array with integer PCM scaled to [-1, 1]."""
    data = np.asarray(samples)
    if np.issubdtype(data.dtype, np.integer):
        # Integer PCM must be rescaled before averaging/writing; otherwise the
        # float values fall far outside [-1, 1] and clip when written as WAV.
        scale = float(np.iinfo(data.dtype).max)
        data = data.astype(np.float32) / scale
    else:
        data = data.astype(np.float32)
    if data.ndim > 1:
        # Collapse (samples, channels) to mono.
        data = data.mean(axis=1)
    return data


@spaces.GPU(duration=120)
def transcribe(audio, state=""):
    """Buffer a streamed audio chunk and transcribe once ~5 seconds are queued.

    Args:
        audio: Either a ``(sample_rate, ndarray)`` tuple (the form Gradio
            passes for ``type="numpy"`` audio) or a bare ``ndarray``, which is
            treated as 16 kHz for backward compatibility. Anything else is
            skipped.
        state: Transcript accumulated so far.

    Returns:
        A ``(state, state)`` pair: the (possibly extended) transcript, repeated
        once for each of the two outputs wired to this handler.
    """
    global model, audio_buffer
    import tempfile

    if isinstance(audio, tuple) and len(audio) == 2:
        sample_rate, samples = audio
    elif isinstance(audio, np.ndarray):
        sample_rate, samples = 16000, audio
    else:
        print(f"Skipping invalid audio input: {type(audio)}")
        return state, state

    sample_rate = int(sample_rate)
    # Normalise each chunk up front so mono/stereo chunks can be concatenated.
    audio_buffer.append(_to_mono_float32(samples))

    # Wait until roughly 5 seconds of audio (at the real sample rate) is queued.
    buffered_samples = sum(len(chunk) for chunk in audio_buffer)
    if buffered_samples < 5 * sample_rate:
        return state, state

    audio_data = np.concatenate(audio_buffer)
    # Reset before transcribing so a failing segment is not retried forever.
    audio_buffer = []

    # A unique temp file avoids collisions between concurrent sessions.
    fd, temp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        sf.write(temp_path, audio_data, samplerate=sample_rate)
        if torch.cuda.is_available():
            model = model.cuda()
        try:
            result = model.transcribe([temp_path])[0]
        finally:
            # Always move the model back to CPU, even if transcription raised.
            model = model.cpu()
    finally:
        os.remove(temp_path)

    # Some NeMo versions return a (best, all) pair of lists, others return
    # Hypothesis objects exposing ``.text``, others plain strings.
    if isinstance(result, (list, tuple)):
        result = result[0] if result else ""
    transcription = str(getattr(result, "text", result)).strip()
    if not transcription:
        return state, state

    new_state = f"{state} {transcription}" if state else transcription
    return new_state, new_state

# Define the Gradio interface: a streaming microphone input on the left and
# two transcript text boxes on the right.
with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
    gr.Markdown("# 🎙️ Real-time Speech-to-Text Transcription")
    gr.Markdown("Powered by NVIDIA NeMo and the parakeet-tdt-0.6b-v2 model")
    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                streaming=True,
                label="Speak into your microphone"
            )

            clear_btn = gr.Button("Clear Transcript")

        with gr.Column(scale=3):
            text_output = gr.Textbox(
                label="Transcription",
                placeholder="Your speech will appear here...",
                lines=10
            )
            streaming_text = gr.Textbox(
                label="Real-time Transcription",
                placeholder="Real-time results will appear here...",
                lines=2
            )

    # State to store the ongoing transcription
    state = gr.State("")

    # Handle the audio stream: each chunk is passed to transcribe() together
    # with the transcript so far; the result updates the state and the
    # real-time text box.
    audio_input.stream(
        fn=transcribe,
        inputs=[audio_input, state],
        outputs=[state, streaming_text],
    )

    # Clear the transcription
    def clear_transcription():
        """Reset both text boxes and the state, and drop any buffered audio."""
        # Discard audio queued before the click so it cannot leak into the
        # next transcript.
        audio_buffer.clear()
        return "", "", ""

    clear_btn.click(
        fn=clear_transcription,
        inputs=[],
        outputs=[text_output, streaming_text, state]
    )

    # Update the main text output when the state changes
    state.change(
        fn=lambda s: s,
        inputs=[state],
        outputs=[text_output]
    )

    gr.Markdown("## 📝 Instructions")
    gr.Markdown("""
    1. Click the microphone button to start recording
    2. Speak clearly into your microphone
    3. The transcription will appear in real-time
    4. Click 'Clear Transcript' to start a new transcription
    """)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()