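# Gradio demo: speech recognition with OpenAI Whisper checkpoints served
# through the Hugging Face `transformers` ASR pipeline. A dropdown lets the
# user switch Whisper checkpoints at runtime.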
import torch
import gradio as gr
from transformers import pipeline

# Whisper checkpoints offered in the model dropdown.
models = ["openai/whisper-small", "openai/whisper-base", "openai/whisper-medium", "openai/whisper-large"]

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the default checkpoint once at startup so the app works immediately.
pipe = pipeline(task="automatic-speech-recognition",
                model=models[0],
                device=device)


def initialize_pipeline(model_name):
    """Reload the ASR pipeline with the selected Whisper checkpoint."""
    global pipe
    pipe = pipeline(task="automatic-speech-recognition",
                    model=model_name,
                    device=device)
    return model_name
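# Note: `pipe` is module-level state shared by every connected session, so
# switching models in one browser tab switches it for all users of the app.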


def transcribe(audio):
    """Transcribe an audio file path with the currently loaded pipeline."""
    if audio is None:
        return "No audio input received. Please try again."
    return pipe(audio)["text"]
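# Whisper models work on ~30-second windows; depending on the transformers
# version, longer recordings may be truncated to the first window unless the
# pipeline is built with chunk_length_s=30 to transcribe in chunks.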


with gr.Blocks(title="Whisper Speech Recognition") as interface:
    gr.Markdown("Realtime speech-recognition demo using OpenAI Whisper models.")
    # Dropdown to select the model
    model_dropdown = gr.Dropdown(choices=models, value=models[0], label="Select Model")
    # Audio input component
    audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Upload or Record Audio")
    # Text output component
    text_output = gr.Textbox(label="Transcribed Text")
    # Button to trigger transcription
    transcribe_button = gr.Button("Transcribe")
    # Event listener to initialize the pipeline when the model is selected
    model_dropdown.change(fn=initialize_pipeline, inputs=model_dropdown, outputs=None)
    # Event listener to transcribe the audio when the button is clicked
    transcribe_button.click(fn=transcribe, inputs=[audio_input], outputs=text_output)

if __name__ == "__main__":
    interface.launch()
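# launch() serves on localhost by default; interface.launch(share=True)
# creates a temporary public link if the demo needs to be shared.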