File size: 4,246 Bytes
18b21ee
e888ead
 
18b21ee
 
 
f334b99
e888ead
18b21ee
 
 
 
e888ead
f334b99
18b21ee
f334b99
18b21ee
779d79b
 
 
 
 
18b21ee
779d79b
 
 
18b21ee
 
b6fdfee
 
779d79b
615813d
 
 
ce364ed
615813d
ac5f4c0
 
 
 
c699992
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
779d79b
ac5f4c0
 
 
 
779d79b
18b21ee
 
 
 
 
40dfec3
18b21ee
 
8c34a9f
18b21ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0011522
18b21ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e888ead
18b21ee
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import math
import os
import tempfile
import time

import gradio as gr
import nemo.collections.asr as nemo_asr
import spaces
import torch
from omegaconf import OmegaConf
from scipy.signal import resample_poly

import numpy as np
import soundfile as sf

# Module-level scratch list; transcribe() rebinds it to an empty list after
# every processed chunk.
audio_buffer = []

# Report at start-up whether a CUDA device is visible to this process.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

# Load the pretrained checkpoint once at import time so every request
# handled by transcribe() reuses the same model object.
model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    "nvidia/parakeet-tdt-0.6b-v2"
)

print(f"Model loaded on device: {model.device}")

# Sample rate (Hz) every chunk is converted to before it is handed to the model.
_TARGET_SAMPLE_RATE = 16000


def _prepare_audio(sample_rate, audio_data):
    """Return *audio_data* as mono float32 samples at ``_TARGET_SAMPLE_RATE``."""
    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.integer):
        # Scale integer PCM into [-1.0, 1.0]; unscaled float samples would be
        # clipped when written to the WAV file.
        info = np.iinfo(samples.dtype)
        scale = float(max(abs(info.min), info.max))
        samples = samples.astype(np.float32) / scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # Assumes a (samples, channels) layout, the same one soundfile uses
        # -- TODO confirm against the Gradio version in use.
        samples = samples.mean(axis=1)

    if sample_rate != _TARGET_SAMPLE_RATE:
        print(f"Resampling from {sample_rate}Hz to 16000Hz")
        source_rate = int(sample_rate)
        divisor = math.gcd(source_rate, _TARGET_SAMPLE_RATE)
        samples = resample_poly(
            samples,
            _TARGET_SAMPLE_RATE // divisor,
            source_rate // divisor,
        )
    return samples.astype(np.float32)


def _hypothesis_text(result):
    """Extract plain text from one entry of ``model.transcribe`` output."""
    # NOTE(review): depending on the NeMo version an entry may be a str, an
    # object exposing ``.text``, or a nested list of those -- confirm.
    if isinstance(result, (list, tuple)):
        result = result[0] if result else ""
    return str(getattr(result, "text", result)).strip()


@spaces.GPU(duration=120)
def transcribe(audio, state=""):
    """Transcribe one streamed audio chunk and append it to the transcript.

    Args:
        audio: The streamed chunk as a ``(sample_rate, samples)`` tuple where
            ``samples`` is a NumPy array. ``None``, ints and any other shape
            of input are ignored.
        state: Transcript accumulated so far.

    Returns:
        A ``(new_state, new_state)`` pair: the updated transcript, once for
        the state output and once for the live text output. The incoming
        ``state`` is returned unchanged when the chunk is invalid, empty,
        yields no text, or fails to process.
    """
    global audio_buffer
    if audio is None or isinstance(audio, int):
        print(f"Skipping invalid audio input: {type(audio)}")
        return state, state

    print(f"Received audio input of type: {type(audio)}")
    is_chunk = (
        isinstance(audio, tuple)
        and len(audio) == 2
        and isinstance(audio[1], np.ndarray)
    )
    if not is_chunk:
        return state, state

    # Unpack the tuple here; overwriting `audio` with its array element
    # beforehand would make the tuple check above unreachable.
    sample_rate, audio_data = audio
    print(f"Audio shape: {audio_data.shape}")
    if audio_data.size == 0:
        return state, state

    temp_file = None
    try:
        samples = _prepare_audio(sample_rate, audio_data)
        # A unique file per call keeps concurrent sessions from overwriting
        # each other's audio.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            temp_file = handle.name
        sf.write(temp_file, samples, samplerate=_TARGET_SAMPLE_RATE)
        print(f"Processing temporary audio file: {temp_file}")
        result = model.transcribe([temp_file])[0]
    except Exception as e:
        # Best effort: a failed chunk must not break the stream, so keep the
        # transcript as it was.
        print(f"Error processing audio: {e}")
        return state, state
    finally:
        # Remove the file on success and on failure alike.
        if temp_file is not None and os.path.exists(temp_file):
            os.remove(temp_file)
            print("Temporary file removed.")

    # Clear buffer
    audio_buffer = []

    transcription = _hypothesis_text(result)
    if not transcription:
        # Nothing recognised; avoid padding the transcript with blanks.
        return state, state

    new_state = state + " " + transcription if state else transcription
    return new_state, new_state

# Define the Gradio interface
with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
    # The heading emoji were mis-decoded UTF-8 bytes; restored to the
    # intended characters.
    gr.Markdown("# 🎙️ Real-time Speech-to-Text Transcription")
    gr.Markdown("Powered by NVIDIA NeMo and the parakeet-tdt-0.6b-v2 model")
    with gr.Row():
        # Left column: microphone capture and the reset button.
        with gr.Column(scale=2):
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                streaming=True,
                label="Speak into your microphone",
            )

            clear_btn = gr.Button("Clear Transcript")

        # Right column: full transcript plus the live streaming view.
        with gr.Column(scale=3):
            text_output = gr.Textbox(
                label="Transcription",
                placeholder="Your speech will appear here...",
                lines=10,
            )
            streaming_text = gr.Textbox(
                label="Real-time Transcription",
                placeholder="Real-time results will appear here...",
                lines=2,
            )

    # State to store the ongoing transcription
    state = gr.State("")

    # Handle the audio stream: each chunk goes through transcribe(), whose
    # two return values update the stored transcript and the live textbox.
    audio_input.stream(
        fn=transcribe,
        inputs=[audio_input, state],
        outputs=[state, streaming_text],
    )

    def clear_transcription():
        """Return empty values for both textboxes and the transcript state."""
        return "", "", ""

    clear_btn.click(
        fn=clear_transcription,
        inputs=[],
        outputs=[text_output, streaming_text, state],
    )

    # Update the main text output when the state changes
    state.change(
        fn=lambda s: s,
        inputs=[state],
        outputs=[text_output],
    )

    gr.Markdown("## 📝 Instructions")
    gr.Markdown("""
    1. Click the microphone button to start recording
    2. Speak clearly into your microphone
    3. The transcription will appear in real-time
    4. Click 'Clear Transcript' to start a new transcription
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch()