import torch
import gradio as gr
from transformers import pipeline

# Whisper checkpoints the user can choose from.
models = [
    "openai/whisper-small",
    "openai/whisper-base",
    "openai/whisper-medium",
    "openai/whisper-large",
]

# Load the default model once at startup; use the GPU when available.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    device="cuda" if torch.cuda.is_available() else "cpu",
)


def initialize_pipeline(model_name):
    """Reload the ASR pipeline with the selected Whisper checkpoint."""
    global pipe
    pipe = pipeline(
        task="automatic-speech-recognition",
        model=model_name,
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
    return model_name


def transcribe(audio):
    """Transcribe the given audio file path with the current pipeline."""
    if audio is None:
        return "No audio input received. Please try again."
    return pipe(audio)["text"]


with gr.Blocks(title="Whisper Speech Recognition") as interface:
    gr.Markdown(
        "## Whisper Speech Recognition\n"
        "Realtime demo for speech recognition using a selectable Whisper model."
    )

    # Dropdown to select the model
    model_dropdown = gr.Dropdown(choices=models, value=models[0], label="Select Model")

    # Audio input component
    audio_input = gr.Audio(
        sources=["microphone", "upload"], type="filepath", label="Upload or Record Audio"
    )

    # Text output component
    text_output = gr.Textbox(label="Transcribed Text")

    # Button to trigger transcription
    transcribe_button = gr.Button("Transcribe")

    # Event listener to reload the pipeline when a new model is selected
    model_dropdown.change(fn=initialize_pipeline, inputs=model_dropdown, outputs=None)

    # Event listener to transcribe the audio when the button is clicked
    transcribe_button.click(fn=transcribe, inputs=[audio_input], outputs=text_output)


if __name__ == "__main__":
    interface.launch()
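
# --- Optional: cache loaded pipelines instead of reloading on every dropdown change ---
# A minimal sketch, not part of the original script: keeping already-loaded pipelines
# in a dict avoids re-initializing a checkpoint when the user switches back to a model
# they have already used. The name `_pipeline_cache` is a hypothetical helper introduced
# here for illustration only; to try it, replace `initialize_pipeline` above with this version.
#
# _pipeline_cache = {}
#
# def initialize_pipeline(model_name):
#     """Swap the global pipeline, reusing a cached instance when available."""
#     global pipe
#     if model_name not in _pipeline_cache:
#         _pipeline_cache[model_name] = pipeline(
#             task="automatic-speech-recognition",
#             model=model_name,
#             device="cuda" if torch.cuda.is_available() else "cpu",
#         )
#     pipe = _pipeline_cache[model_name]
#     return model_name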