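"""Gradio demo for automatic speech recognition (ASR).

Several backends have been tried in this script: the transformers ASR
pipeline (Whisper / Parakeet), a local NeMo model behind a custom
microphone/file interface (both left commented out below), and, currently
active, a model loaded straight from the Hugging Face Hub via gr.load().
"""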
#from transformers import pipeline
import gradio as gr
#import nemo.collections.asr as nemo_asr

#model = pipeline("automatic-speech-recognition")
#model = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
#model = pipeline("automatic-speech-recognition", model="nvidia/parakeet-ctc-0.6b")

'''
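# NeMo route (currently disabled): load Parakeet CTC 1.1B locally and expose a
# microphone/file-upload interface that transcribes the selected audio.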
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="nvidia/parakeet-ctc-1.1b")


def transcribe_audio(mic=None, file=None):
    if mic is not None:
        audio = mic
    elif file is not None:
        audio = file
    else:
        return "You must either provide a mic recording or a file"
    #transcription = model(audio)["text"]
    # NeMo models are not called directly on a file path; transcribe() takes a
    # list of audio paths and returns a list of transcriptions.
    transcription = asr_model.transcribe([audio])[0]
    return transcription


gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
        gr.Audio(sources="upload", type="filepath"),
    ],
    outputs="text",
).launch(share=True)
'''
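# gr.load() builds a demo directly around a model hosted on the Hugging Face Hub
# (inference is handled remotely); share=True also exposes a temporary public link.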
#gr.load("models/nvidia/parakeet-ctc-1.1b").launch()
#gr.load("models/openai/whisper-medium.en").launch(share=True)
gr.load("models/nvidia/stt_en_fastconformer_ctc_large").launch(share=True)