import gradio as gr
import torch
import torchaudio
import numpy as np
from transformers import Wav2Vec2BertProcessor, Wav2Vec2BertForCTC

# Model setup: runs once at import time, before the UI is built.
repo_id = "hriteshMaikap/marathi-asr-model"

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Both the processor and the CTC model are loaded from the same repository id.
processor = Wav2Vec2BertProcessor.from_pretrained(repo_id)
model = Wav2Vec2BertForCTC.from_pretrained(repo_id).to(device)
model.eval()  # Evaluation mode: the model is only used for inference here.

def transcribe(audio_file):
    """Transcribe an audio file to text with the module-level model.

    The audio is loaded with torchaudio, down-mixed to a single channel,
    resampled to 16 kHz when its sample rate differs, run through the
    module-level ``processor`` and ``model``, and decoded greedily
    (arg-max over the logits at each step).

    Args:
        audio_file: Path to the audio file to transcribe. May be ``None``
            or empty when the caller has no audio to hand over (for
            example, a form submitted without a recording).

    Returns:
        The decoded transcription string. If no audio was supplied, the
        audio is empty, or any step raises, a message starting with
        ``"Error processing audio: "`` is returned instead; exceptions are
        never propagated to the caller.
    """
    target_rate = 16000

    # Nothing to load: report it directly instead of surfacing an obscure
    # exception message from the audio loader.
    if not audio_file:
        return "Error processing audio: no audio was provided"

    try:
        waveform, sample_rate = torchaudio.load(audio_file)

        if waveform.numel() == 0:
            return "Error processing audio: the audio contains no samples"

        # Down-mix before resampling so only one channel has to be resampled.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        if sample_rate != target_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, target_rate)
            waveform = resampler(waveform)

        # Drop only the channel axis; a bare squeeze() would also collapse a
        # one-sample clip into a 0-d array.
        speech_array = waveform.squeeze(0).numpy()

        with torch.no_grad():
            inputs = processor(
                speech_array, sampling_rate=target_rate, return_tensors="pt"
            ).to(device)
            # Forward everything the processor produced (features plus the
            # attention mask when present), matching the documented usage.
            logits = model(**inputs).logits
            predicted_ids = torch.argmax(logits, dim=-1)

        # Single-item batch: decode the only sequence.
        return processor.decode(predicted_ids[0])
    except Exception as e:
        # UI boundary: turn any failure into a readable message.
        return f"Error processing audio: {e}"

# Web UI: one audio input wired to `transcribe`, with the result shown as text.
demo = gr.Interface(
    title="Marathi Speech Recognition",
    description=(
        "Record your voice in Marathi and get a transcription. "
        "Click the microphone icon to start recording, then submit to transcribe."
    ),
    fn=transcribe,
    # type="filepath" so the callback receives a path it can load from disk.
    inputs=gr.Audio(type="filepath"),
    outputs="text",
)

# Start the server; show_error=True is passed so errors are shown in the UI.
demo.launch(show_error=True)