import math
import os
import tempfile
import time

import gradio as gr
import nemo.collections.asr as nemo_asr
import spaces
import torch
from omegaconf import OmegaConf
from scipy.signal import resample_poly

# Report the accelerator situation once, at startup, so it shows up in the logs.
_cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ready}")
if _cuda_ready:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

# Load the pretrained checkpoint a single time at import; the module-level
# ``model`` is what the request handler below uses.
model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    "nvidia/parakeet-tdt-0.6b-v2"
)

print(f"Model loaded on device: {model.device}")

# Audio helpers needed by the handler (kept here, after the model load, as in
# the original layout).
import numpy as np
import soundfile as sf

# Module-level list for audio chunks; starts out empty.
audio_buffer = []

# Sample rate of the WAV file handed to the model (the app resamples every
# chunk to 16 kHz before transcription).
_TARGET_SAMPLE_RATE = 16000


def _prepare_waveform(samples, sample_rate):
    """Return *samples* as mono float32 audio at ``_TARGET_SAMPLE_RATE``."""
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM must be scaled by its full-scale magnitude; soundfile
        # treats float data as [-1, 1] and would clip raw int16-sized values.
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        waveform = samples.astype(np.float32) / full_scale
    else:
        waveform = samples.astype(np.float32)

    if waveform.ndim > 1:
        # NOTE(review): assumes multi-channel input is laid out as
        # (samples, channels), as Gradio documents for type="numpy".
        waveform = waveform.mean(axis=1)

    rate = int(sample_rate)
    if rate != _TARGET_SAMPLE_RATE:
        print(f"Resampling from {rate}Hz to {_TARGET_SAMPLE_RATE}Hz")
        # Polyphase resampling needs an integer up/down ratio; reduce it by
        # the GCD so the filter stays small (e.g. 48000 -> 16000 is 1/3).
        divisor = math.gcd(rate, _TARGET_SAMPLE_RATE)
        waveform = resample_poly(
            waveform, _TARGET_SAMPLE_RATE // divisor, rate // divisor
        )
    return waveform.astype(np.float32, copy=False)


def _first_transcript(outputs):
    """Return the text of the first result produced by ``model.transcribe``."""
    # NOTE(review): some NeMo releases return a (best, all) tuple for RNNT
    # models -- unwrap it if present; confirm against the installed version.
    if isinstance(outputs, tuple):
        outputs = outputs[0]
    if not outputs:
        return ""
    first = outputs[0]
    # Results may be plain strings or hypothesis objects exposing ``.text``.
    text = getattr(first, "text", first)
    return "" if text is None else str(text).strip()


@spaces.GPU(duration=120)
def transcribe(audio, state=""):
    """Transcribe one streamed audio chunk and append it to the transcript.

    Args:
        audio: Value delivered by the Gradio audio component. Only a
            ``(sample_rate, samples)`` tuple whose second element is a NumPy
            array is processed; anything else (``None``, ints, ...) is skipped.
        state: Transcript accumulated so far.

    Returns:
        A ``(transcript, transcript)`` pair: the same string twice, because
        the stream event writes it to both the state and a textbox. ``state``
        is returned unchanged when the input is unusable, when processing
        fails, or when no text was recognised.
    """
    is_audio_tuple = (
        isinstance(audio, tuple)
        and len(audio) == 2
        and isinstance(audio[1], np.ndarray)
    )
    if not is_audio_tuple:
        print(f"Skipping invalid audio input: {type(audio)}")
        return state, state

    sample_rate, samples = audio
    if samples.size == 0:
        return state, state
    # Log only the shape; printing the raw samples floods the log.
    print(f"Received audio chunk: shape={samples.shape}, rate={sample_rate}Hz")

    try:
        waveform = _prepare_waveform(samples, sample_rate)
        # A private temporary directory keeps concurrent sessions from
        # overwriting each other's file and guarantees cleanup on failure.
        with tempfile.TemporaryDirectory() as work_dir:
            wav_path = os.path.join(work_dir, "chunk.wav")
            sf.write(wav_path, waveform, samplerate=_TARGET_SAMPLE_RATE)
            print(f"Processing temporary audio file: {wav_path}")
            text = _first_transcript(model.transcribe([wav_path]))
    except Exception as exc:
        # UI boundary: report the failure and keep the transcript (and the
        # stream) alive instead of crashing the handler.
        print(f"Error processing audio: {exc}")
        return state, state

    if not text:
        return state, state

    new_state = f"{state} {text}" if state else text
    return new_state, new_state

# Define the Gradio interface.
# Components are rendered in the order they are created inside the layout
# context managers, so the statement order below is the on-screen order.
with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
    gr.Markdown("# 🎙️ Real-time Speech-to-Text Transcription")
    gr.Markdown("Powered by NVIDIA NeMo and the parakeet-tdt-0.6b-v2 model")
    with gr.Row():
        # Left column: microphone capture and the reset button.
        with gr.Column(scale=2):
            # Streaming microphone input; with type="numpy" the handler is
            # written to receive a (sample_rate, audio_array) tuple.
            audio_input = gr.Audio(
                sources=["microphone"], 
                type="numpy", 
                streaming=True,
                label="Speak into your microphone"
            )
            
            clear_btn = gr.Button("Clear Transcript")
            
        # Right column: the two transcript views.
        with gr.Column(scale=3):
            # Mirrors the stored transcript (updated via state.change below).
            text_output = gr.Textbox(
                label="Transcription", 
                placeholder="Your speech will appear here...",
                lines=10
            )
            # Written directly by the stream handler's second output.
            streaming_text = gr.Textbox(
                label="Real-time Transcription", 
                placeholder="Real-time results will appear here...",
                lines=2
            )
    
    # State to store the ongoing transcription
    state = gr.State("")
    
    # Handle the audio stream: each streamed chunk is passed to `transcribe`
    # together with the current transcript; its two return values update the
    # stored transcript and the real-time textbox.
    audio_input.stream(
        fn=transcribe,
        inputs=[audio_input, state],
        outputs=[state, streaming_text],
    )

    # Clear the transcription
    def clear_transcription():
        """Return empty strings for both textboxes and the stored transcript."""
        return "", "", ""
    
    clear_btn.click(
        fn=clear_transcription,
        inputs=[],
        outputs=[text_output, streaming_text, state]
    )
    
    # Update the main text output when the state changes
    # (identity function: the textbox simply shows the stored transcript).
    state.change(
        fn=lambda s: s,
        inputs=[state],
        outputs=[text_output]
    )
    
    gr.Markdown("## 📝 Instructions")
    gr.Markdown("""
    1. Click the microphone button to start recording
    2. Speak clearly into your microphone
    3. The transcription will appear in real-time
    4. Click 'Clear Transcript' to start a new transcription
    """)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()