import os
import gradio as gr
import torch
import nemo.collections.asr as nemo_asr
import spaces
import librosa

# Important: Don't initialize CUDA in the main process for Spaces
# The model will be loaded in the worker process through the GPU decorator
model = None
current_model_name = "nvidia/parakeet-tdt-0.6b-v2"

# Available models
available_models = ["nvidia/parakeet-tdt-0.6b-v2", "nvidia/parakeet-tdt-1.1b"]

def load_model(model_name=None):
    # This function will be called in the GPU worker process
    global model, current_model_name
    
    # Use the specified model name or the current one
    model_name = model_name or current_model_name
    
    # Check if we need to load a new model
    if model is None or model_name != current_model_name:
        print(f"Loading model {model_name} in worker process")
        # print(f"CUDA available: {torch.cuda.is_available()}")
        # if torch.cuda.is_available():
        #     print(f"CUDA device: {torch.cuda.get_device_name(0)}")
        
        # Load the selected model first, then record it as the current one,
        # so a failed load cannot leave current_model_name pointing at a
        # model that was never actually loaded
        model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_name)
        current_model_name = model_name
        print(f"Model loaded on device: {model.device}")
    
    return model
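
# Note: on ZeroGPU Spaces, a function decorated with @spaces.GPU runs in a
# separate GPU worker process; duration=120 requests up to 120 seconds of
# GPU time per call.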

@spaces.GPU(duration=120)
def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_buffer=None, last_processed_time=0):
    # These imports are repeated here so they are available inside the GPU worker process
    import numpy as np
    import soundfile as sf
    import librosa
    import os
    import tempfile

    # Load the model inside the GPU worker process
    model = load_model(model_name)
    if audio_buffer is None:
        audio_buffer = []
    
    if audio is None or isinstance(audio, int):
        print(f"Skipping invalid audio input: {type(audio)}")
        return state, state, audio_buffer, last_processed_time
    
    print(f"Received audio input of type: {type(audio)}")
    if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[1], np.ndarray):
        sample_rate, audio_data = audio
        print(f"Sample rate: {sample_rate}, Audio shape: {audio_data.shape}")
        
        # Resample each incoming chunk to 16kHz before buffering so the
        # buffer never mixes sample rates across calls
        if sample_rate != 16000:
            audio_data = librosa.resample(audio_data.astype(float), orig_sr=sample_rate, target_sr=16000)
        else:
            audio_data = audio_data.astype(float)
        sample_rate = 16000
        
        # Append chunk to buffer (now float64 at 16kHz)
        audio_buffer.append(audio_data)
        
        # Calculate total duration in seconds
        total_samples = sum(arr.shape[0] for arr in audio_buffer)
        total_duration = total_samples / sample_rate
        print(f"Total buffered duration: {total_duration:.2f}s")
        
        # Process 5-second chunks with 2-second step size (3-second overlap)
        # Using longer chunks usually helps with transcription accuracy
        chunk_duration = 5.0  # seconds (increased from 2.0)
        step_size = 2.0      # seconds (increased from 1.0)
        # min_samples = int(chunk_duration * 16000)  # 5s at 16kHz
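        # Worked example: with 9s buffered, the loop below transcribes
        # [0s,5s), [2s,7s) and [4s,9s), stops once 6s + 5s > 9s, and the
        # trailing 3s of audio is kept in the buffer for the next call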

        if total_duration < chunk_duration:
            print(f"Buffering audio, total duration: {total_duration:.2f}s")
            return state, state, audio_buffer, last_processed_time

        try:
            # Concatenate buffered chunks (already float arrays at 16kHz,
            # resampled as they arrived)
            full_audio = np.concatenate(audio_buffer)
            
            # Normalize audio (helps with consistent volume levels)
            # if np.abs(full_audio).max() > 0:
            #     full_audio = full_audio / np.abs(full_audio).max() * 0.9
            #     print("Audio normalized to improve transcription")
            
            # Process chunks
            new_state = state
            current_time = last_processed_time
            total_samples_16k = len(full_audio)
            
            while current_time + chunk_duration <= total_duration:
                start_sample = int(current_time * sample_rate)
                end_sample = int((current_time + chunk_duration) * sample_rate)
                if end_sample > total_samples_16k:
                    end_sample = total_samples_16k
                
                chunk = full_audio[start_sample:end_sample]
                print(f"Processing chunk from {current_time:.2f}s to {current_time + chunk_duration:.2f}s")
                
                # Save to a unique temporary WAV file so concurrent sessions
                # don't overwrite each other's audio
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                    temp_file = tmp.name
                sf.write(temp_file, chunk, samplerate=16000)
                
                # Transcribe
                print(f"Transcribing chunk of duration {chunk_duration}s...")
                hypothesis = model.transcribe([temp_file])[0]
                transcription = hypothesis.text
                print(f"Transcription: {transcription}")
                
                os.remove(temp_file)
                print("Temporary file removed.")
                
                # Append transcription if non-empty
                if transcription.strip():
                    new_state = new_state + " " + transcription if new_state else transcription
                
                current_time += step_size
            
            # Trim the buffer to keep only unprocessed audio. The buffer's
            # origin moves up to current_time, so the processing offset is
            # reset to 0 rather than carried over as an absolute timestamp
            # (otherwise the next call would compare an absolute time against
            # a buffer-relative duration and never process another chunk)
            keep_samples = int((total_duration - current_time) * sample_rate)
            if keep_samples > 0:
                audio_buffer = [full_audio[-keep_samples:]]
            else:
                audio_buffer = []
            last_processed_time = 0
            print(f"New state: {new_state}")
            return new_state, transcription, audio_buffer, last_processed_time  # Return last transcription for streaming_text
        
        except Exception as e:
            print(f"Error processing audio: {e}")
            return state, state, audio_buffer, last_processed_time
    
    print(f"Invalid audio input format: {type(audio)}")
    return state, state, audio_buffer, last_processed_time

@spaces.GPU(duration=120)
def transcribe_file(audio_file, model_name="nvidia/parakeet-tdt-0.6b-v2"):
    # Check if audio file is provided
    if audio_file is None:
        return "No audio file provided. Please upload an audio file."
    
    try:
        model = load_model(model_name)
        
        print(f"Processing file: {audio_file}")
        
        # Transcribe the entire file at once
        hypothesis = model.transcribe([audio_file])[0]
        transcription = hypothesis.text
        print(f"File transcription: {transcription}")
        
        return transcription
    except Exception as e:
        print(f"Error transcribing file: {e}")
        return f"Error transcribing file: {str(e)}"

# Define the Gradio interface
with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
    gr.Markdown("# ๐ŸŽ™๏ธ Real-time Speech-to-Text Transcription")
    gr.Markdown("Powered by NVIDIA NeMo")
    
    # Model selection and loading
    with gr.Row():
        with gr.Column(scale=3):
            model_dropdown = gr.Dropdown(
                choices=available_models, 
                value=current_model_name,
                label="Select ASR Model"
            )
        with gr.Column(scale=1):
            load_button = gr.Button("Load Selected Model", elem_id="load-button", elem_classes=["btn-blue"])
    
    # Status indicator for model loading
    model_status = gr.Textbox(
        value=f"Current model: {current_model_name}", 
        label="Model Status",
        container=False
    )
    
    # Create tabs for real-time and file-based transcription
    with gr.Tabs():
        
        # File-based transcription tab
        with gr.TabItem("File Transcription"):
            with gr.Row():
                with gr.Column(scale=2):
                    # Audio recorder that saves to file
                    audio_recorder = gr.Audio(
                        sources=["microphone"],
                        type="filepath",
                        label="Record or upload audio file"
                    )
                    with gr.Row():
                        transcribe_btn = gr.Button("Transcribe Audio File", variant="primary")
                        clear_file_btn = gr.Button("Clear Transcript", variant="secondary")
                
                with gr.Column(scale=3):
                    file_transcription = gr.Textbox(
                        label="File Transcription", 
                        placeholder="Transcription will appear here after clicking 'Transcribe Audio File'",
                        lines=10
                    )
        # Real-time transcription tab
        with gr.TabItem("Real-time Transcription"):
            with gr.Row():
                with gr.Column(scale=2):
                    audio_input = gr.Audio(
                        sources=["microphone"], 
                        type="numpy", 
                        streaming=True,
                        label="Speak into your microphone",
                        waveform_options=gr.WaveformOptions(
                            sample_rate=16000
                        )
                    )
                    
                    clear_btn = gr.Button("Clear Transcript", variant="secondary")

                with gr.Column(scale=3):
                    text_output = gr.Textbox(
                        label="Transcription", 
                        placeholder="Your speech will appear here...",
                        lines=10
                    )
                    streaming_text = gr.Textbox(
                        label="Real-time Transcription", 
                        placeholder="Real-time results will appear here...",
                        lines=2
                    )
    # State to store the ongoing transcription
    state = gr.State("")
    audio_buffer = gr.State(value=None)
    last_processed_time = gr.State(value=0)
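    # Note: gr.State values are per-browser-session, so each visitor gets an
    # independent transcript, audio buffer, and processing offset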
    
    # Function to handle model selection
    def update_model(model_name):
        global current_model_name, model
        current_model_name = model_name
        
        # Try to load the model eagerly; outside a GPU worker this may fail,
        # in which case the model is loaded lazily on first use
        try:
            model = load_model(model_name)
            status_message = f"Current model: {model_name} (loaded)"
            print(f"Model {model_name} loaded successfully")
        except Exception as e:
            status_message = f"Current model: {model_name} (will be loaded on first use)"
            print(f"Model will be loaded on first use: {e}")
            
        return status_message, None, 0  # Reset audio buffer and last processed time
    
    # Load model button event
    load_button.click(
        fn=update_model,
        inputs=[model_dropdown],
        outputs=[model_status, audio_buffer, last_processed_time]
    )
    # Handle the audio stream for real-time transcription. Note: streaming_text
    # must remain the visible textbox defined above; rebinding it to a gr.State
    # here would shadow the component and stop streaming results from rendering.
    audio_input.stream(
        fn=transcribe,
        inputs=[audio_input, model_dropdown, state, audio_buffer, last_processed_time],
        outputs=[state, streaming_text, audio_buffer, last_processed_time],
    )
    
    # Handle file transcription
    transcribe_btn.click(
        fn=transcribe_file,
        inputs=[audio_recorder, model_dropdown],
        outputs=[file_transcription]
    )

    # Clear the real-time transcription
    def clear_transcription():
        print("Clearing real-time transcription")
        return "", "", None, 0  # Return empty values for state, text_output, audio_buffer, and last_processed_time
    
    # Clear the file transcription
    def clear_file_transcription():
        print("Clearing file transcription")
        return ""  # Clear file_transcription
    
    # Set up the clear button handler; one handler resets the transcript state,
    # both textboxes, the audio buffer, and the processing offset
    clear_btn.click(
        fn=clear_transcription,
        inputs=[],
        outputs=[state, text_output, streaming_text, audio_buffer, last_processed_time]
    )
    
    clear_file_btn.click(
        fn=clear_file_transcription,
        inputs=[],
        outputs=[file_transcription]
    )

    # Update the main text output when the state changes
    def update_output(transcript):
        # For the streaming textbox, show just the last 15 words; use a local
        # name that doesn't shadow the streaming_text component
        words = transcript.split()
        tail = " ".join(words[-15:]) if len(words) > 15 else transcript
        
        return transcript, tail
    
    state.change(
        fn=update_output,
        inputs=[state],
        outputs=[text_output, streaming_text]
    )


    
    gr.Markdown("## ๐Ÿ“ Instructions")
    gr.Markdown("""
    ### Real-time Transcription:
    1. Select an ASR model from the dropdown menu
    2. Click 'Load Selected Model' to load the model
    3. Click the microphone button to start recording
    4. Speak clearly into your microphone
    5. The transcription will appear in real-time
    6. Click 'Clear Transcript' to reset the transcription
    
    ### File Transcription:
    1. Select an ASR model from the dropdown menu
    2. Click 'Load Selected Model' to load the model
    3. Switch to the 'File Transcription' tab
    4. Record audio by clicking the microphone button or upload an existing audio file
    5. Click 'Transcribe Audio File' to process the recording
    6. The complete transcription will appear in the text box
    7. Click 'Clear Transcript' to reset the file transcription
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch()
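
# A minimal sketch for exercising the file-transcription path outside Gradio
# (assumes a local 16kHz mono WAV at the hypothetical path "sample.wav"):
#
#     model = load_model("nvidia/parakeet-tdt-0.6b-v2")
#     print(model.transcribe(["sample.wav"])[0].text)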