from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import librosa
import numpy as np
import torch
import gradio as gr

# Load Whisper model and processor
print("Loading model...")
processor = AutoProcessor.from_pretrained("jsbeaudry/whisper-medium-oswald")
model = AutoModelForSpeechSeq2Seq.from_pretrained("jsbeaudry/whisper-medium-oswald")
model.eval()

# Set device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Model loaded successfully.")


# Transcription function
def transcribe(audio):
    if audio is None:
        return "Please upload or record an audio file first."

    # Gradio provides a tuple (sr, data)
    sr, data = audio

    # Gradio returns integer PCM samples; librosa expects floating-point
    # audio, so convert to float32 in [-1, 1] before any processing
    if np.issubdtype(data.dtype, np.integer):
        data = data.astype(np.float32) / np.iinfo(data.dtype).max

    # If stereo, convert to mono
    if data.ndim == 2:
        data = librosa.to_mono(data.T)

    # Resample to 16 kHz if needed (Whisper expects 16 kHz input)
    if sr != 16000:
        data = librosa.resample(data, orig_sr=sr, target_sr=16000)
        sr = 16000

    # Extract log-mel input features for the model
    input_features = processor(
        data, sampling_rate=sr, return_tensors="pt"
    ).input_features.to(device)

    # Predict
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    # Decode token ids to text
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription


# Gradio UI
def create_interface():
    with gr.Blocks(title="Whisper Medium - Haitian Creole") as demo:
        gr.Markdown("# 🎙️ Whisper Medium Creole ASR")
        gr.Markdown(
            "Upload or record your voice in Haitian Creole. "
            "Then click **Transcribe** to get the text."
        )
        with gr.Row():
            audio_input = gr.Audio(label="🎧 Upload or Record Audio", type="numpy", format="wav")
        transcribe_button = gr.Button("🔍 Transcribe")
        output_text = gr.Textbox(label="📝 Transcribed Text", lines=4)
        transcribe_button.click(fn=transcribe, inputs=audio_input, outputs=output_text)
    return demo


if __name__ == "__main__":
    interface = create_interface()
    interface.launch()
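
# Usage note: run the script (e.g. `python app.py`; the filename is an
# assumption, not part of the original) and open the local URL Gradio prints,
# http://127.0.0.1:7860 by default. Passing share=True to interface.launch()
# produces a temporary public link if you need to share the demo.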