import subprocess
import sys

import gradio as gr

try:
    from transformers import pipeline
except ModuleNotFoundError:
    print("Installing transformers...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers"])
    from transformers import pipeline  # Retry import

import torch
import torchaudio
from speechbrain.pretrained import EncoderClassifier

# Set up pipeline for Whisper ASR (English-only base model)
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
    torch_dtype=torch.float32,
    device="cpu",
)

# Set up pipelines for the two phonemic transcription models
american_phoneme_pipe = pipeline("automatic-speech-recognition", model="vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
esl_phoneme_pipe = pipeline("automatic-speech-recognition", model="mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme")

# Set up the two accent classification models (loaded once at startup, not per call)
classifier = EncoderClassifier.from_hparams(
    source="Jzuluaga/accent-id-commonaccent_ecapa",
    savedir="pretrained_models/accent-id-commonaccent_ecapa",
)
esl_accent_pipe = pipeline(
    "audio-classification",
    model="kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2",
)


def native_accent_classifier(file):
    out_prob, score, index, text_lab = classifier.classify_file(file)
    return [{'accent': text_lab[0], 'score': round(score.item(), 2)}]


def esl_accent_classifier(file):
    audio, sr = torchaudio.load(file)  # Load audio
    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
    audio = audio.squeeze().numpy()
    result = esl_accent_pipe(audio, top_k=6)
    return [{'accent': result[0]['label'], 'score': round(result[0]['score'], 2)}]


def transcribe_and_classify_speech(file):
    try:
        asr_output = asr_pipe(
            file,
            max_new_tokens=256,
            chunk_length_s=30,
            batch_size=8,
        )["text"]
    except Exception as e:
        print(f"An error occurred with openai/whisper-base.en: {e}")
        asr_output = "Error, make sure your file is in mono format"

    try:
        american_phoneme_output = american_phoneme_pipe(file)['text']
    except Exception as e:
        print(f"An error occurred with vitouphy/wav2vec2-xls-r-300m-timit-phoneme: {e}")
        american_phoneme_output = "Error, make sure your file is in mono format"

    try:
        esl_phoneme_output = esl_phoneme_pipe(file)['text']
    except Exception as e:
        print(f"An error occurred with mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme: {e}")
        esl_phoneme_output = "Error"

    try:
        native_accent_output = native_accent_classifier(file)
    except Exception as e:
        print(f"An error occurred with Jzuluaga/accent-id-commonaccent_ecapa: {e}")
        native_accent_output = [{'accent': 'Error', 'score': 0.0}]

    try:
        esl_accent_output = esl_accent_classifier(file)
    except Exception as e:
        print(f"An error occurred with kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2: {e}")
        esl_accent_output = [{'accent': 'Unknown - please upload single-channel audio', 'score': 0.0}]

    return [
        {'transcription': asr_output},
        {'phonemes_native_eng': american_phoneme_output},
        {'phonemes_eng_second_lang': esl_phoneme_output},
        {'native_eng_country': native_accent_output},
        {'first_lang_if_not_eng': esl_accent_output},
    ]
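
# Several of the models above fail on stereo input (hence the "mono format" error
# strings). A minimal sketch of converting a recording to 16 kHz mono with torchaudio
# before uploading; "stereo.wav" and "mono.wav" are hypothetical file names:
#
#   waveform, sr = torchaudio.load("stereo.wav")
#   mono = waveform.mean(dim=0, keepdim=True)  # average channels down to one
#   mono = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(mono)
#   torchaudio.save("mono.wav", mono, 16000)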

## Set up gradio app
examples = [['chinese-american.wav'], ['mexican.wav'], ['vietnamese.wav'], ['indian.wav'], ['nigerian.wav'], ['irish.wav']]


# Create a function to generate a vertically stacked interface
def create_transcription_interface(source):
    with gr.Blocks() as interface:
        gr.Markdown("""
        Use microphone, upload a .wav file, or choose an example below.

        Output will include results from the following models:
        - Transcription from OpenAI's Whisper [openai/whisper-base.en](https://huggingface.co/openai/whisper-base.en)
        - Phonemic transcription trained on native English speakers [vitouphy/wav2vec2-xls-r-300m-timit-phoneme](https://huggingface.co/vitouphy/wav2vec2-xls-r-300m-timit-phoneme)
        - Phonemic transcription trained on speakers of English as a second language [mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme](https://huggingface.co/mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme)
        - Accent classification trained on native English speakers [Jzuluaga/accent-id-commonaccent_ecapa](https://huggingface.co/Jzuluaga/accent-id-commonaccent_ecapa)
        - Accent classification trained on speakers of English as a second language [kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2](https://huggingface.co/kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2)
        """)
        with gr.Column():
            audio_input = gr.Audio(sources=[source], type="filepath", label="Upload Audio")
            output = gr.JSON(label="Results")
            audio_input.change(fn=transcribe_and_classify_speech, inputs=audio_input, outputs=output)
            gr.Examples(examples=examples, inputs=[audio_input])
    return interface


# Create two interfaces (one for mic, one for file upload)
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")

demo = gr.TabbedInterface(
    [mic_transcribe, file_transcribe],
    ["Microphone Input", "Upload .wav file"],
    title="Speech Recognition and Accent Classification",
)

demo.launch()
# demo.launch(debug=True)
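
# For a quick check without the UI, the pipeline function can also be called directly
# on one of the bundled examples; a sketch, assuming the example .wav files ship
# alongside this script (comment out demo.launch() above first, since it blocks):
#
#   print(transcribe_and_classify_speech("irish.wav"))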