DontFreakOut committed · Commit 4855128 · 1 Parent: d40ccca
updating logic
app.py CHANGED
@@ -28,25 +28,26 @@ esl_phoneme_pipe = pipeline("automatic-speech-recognition", model="mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme")
 # Set up pipe for 2 accent classification models
 classifier = EncoderClassifier.from_hparams(source="Jzuluaga/accent-id-commonaccent_ecapa", savedir="pretrained_models/accent-id-commonaccent_ecapa")
 
-def native_accent_classifier(file):
-    out_prob, score, index, text_lab = classifier.classify_file(file)
+def native_accent_classifier(audio):
+    out_prob, score, index, text_lab = classifier.classify_file(audio)
     return [{'accent': text_lab[0], 'score': round(score[0],2)}]
 
-def esl_accent_classifier(file):
+def esl_accent_classifier(audio):
     esl_accent_pipe = pipeline(
         "audio-classification",
         model="kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2"
     )
-    audio, sr = torchaudio.load(file) # Load audio
-    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
     audio = audio.squeeze().numpy()
     result = esl_accent_pipe(audio, top_k=6)
     return [{'accent': result[0]['label'], 'score': round(result[0]['score'],2)}]
 
-def transcribe_and_classify_speech(audio):
+def transcribe_and_classify_speech(file):
+    audio, sr = torchaudio.load(file) # Load audio
+    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
+
     try:
         asr_output = asr_pipe(
-            audio,
+            file,
             max_new_tokens=256,
             chunk_length_s=30,
             batch_size=8,
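A note on the resampling added above: when the transformers audio-classification pipeline receives a bare NumPy array, it assumes the audio is already at the model's expected rate (16 kHz here), which is why the Resample step matters. A minimal sketch that makes the rate explicit via the pipeline's dict input form (assumes a local example.wav; this is not part of app.py):

```python
# Sketch: feed the accent classifier a raw array with an explicit sampling rate.
# "example.wav" is a hypothetical local file, not one of the app's assets.
import torchaudio
from transformers import pipeline

esl_accent_pipe = pipeline(
    "audio-classification",
    model="kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2",
)

audio, sr = torchaudio.load("example.wav")  # tensor of shape (channels, frames)
audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
audio = audio.squeeze().numpy()  # 1-D array for a mono file

# The dict form pins the sampling rate instead of relying on the model default:
result = esl_accent_pipe({"raw": audio, "sampling_rate": 16000}, top_k=6)
print(result[0]["label"], round(result[0]["score"], 2))
```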
@@ -56,13 +57,13 @@ def transcribe_and_classify_speech(audio):
         asr_output = "Error, make sure your file is in mono format"
 
     try:
-        american_phoneme_output = american_phoneme_pipe(audio)['text']
+        american_phoneme_output = american_phoneme_pipe(file)['text']
     except Exception as e:
         print(f"An error occurred with wav2vec2-xls-r-300m-timit-phoneme: {e}")
         american_phoneme_output = "Error, make sure your file is in mono format"
 
     try:
-        esl_phoneme_output = esl_phoneme_pipe(audio)['text']
+        esl_phoneme_output = esl_phoneme_pipe(file)['text']
     except Exception as e:
         print(f"An error occurred with mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme: {e}")
         esl_phoneme_output = "Error"
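Both except branches above blame non-mono input: audio.squeeze() only yields the 1-D array the downstream models expect when the file has a single channel, so a stereo upload keeps shape (2, N) and fails. A sketch of one way to harden the loading step, assuming stereo uploads are the failure mode these error messages describe:

```python
# Sketch: downmix multi-channel audio to mono before squeeze(), so stereo
# uploads don't trip the "mono format" error paths above.
import torchaudio

audio, sr = torchaudio.load("example.wav")   # shape (channels, frames)
if audio.shape[0] > 1:
    audio = audio.mean(dim=0, keepdim=True)  # average channels down to mono
audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
audio = audio.squeeze().numpy()              # now guaranteed 1-D
```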
@@ -97,8 +98,7 @@ examples = [['chinese-american.wav'], ['mexican.wav'], ['vietnamese.wav'], ['ind
 def create_transcription_interface(source):
     with gr.Blocks() as interface:
         gr.Markdown("""
-
-        Output will include results from the following models:
+        Use microphone, upload .wav file, or choose an example below. Output will include results from the following models:
         - Transcription from OpenAI's Whisper [openai/whisper-base.en](https://huggingface.co/openai/whisper-base.en)
         - Phonemic transcription trained on native English speakers [vitouphy/wav2vec2-xls-r-300m-timit-phoneme](https://huggingface.co/vitouphy/wav2vec2-xls-r-300m-timit-phoneme)
         - Phonemic transcription trained on speakers of English as a second language [mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme](https://huggingface.co/mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme)
@@ -119,7 +119,7 @@ file_transcribe = create_transcription_interface("upload")
 demo = gr.TabbedInterface(
     [mic_transcribe, file_transcribe],
     ["Microphone Input", "Upload .wav file"],
-    title="Speech Transcription",
+    title="Speech Transcription and Accent Classification",
 )
 
 demo.launch(debug=True)
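For context, the title= keyword added in the last hunk is the page-level heading of gr.TabbedInterface. A self-contained sketch of the same wiring, with hypothetical stand-in tabs in place of the app's transcription interfaces:

```python
# Sketch of the TabbedInterface pattern used in app.py; echo() and the tab
# names are hypothetical stand-ins, only the wiring mirrors the real app.
import gradio as gr

def echo(text):
    return text

tab_a = gr.Interface(fn=echo, inputs="text", outputs="text")
tab_b = gr.Interface(fn=echo, inputs="text", outputs="text")

demo = gr.TabbedInterface(
    [tab_a, tab_b],
    ["Tab A", "Tab B"],
    title="Speech Transcription and Accent Classification",  # page heading
)

if __name__ == "__main__":
    demo.launch()
```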