DontFreakOut committed
Commit 4855128 · 1 Parent(s): d40ccca

updating logic

Files changed (1)
  1. app.py +12 -12
app.py CHANGED
@@ -28,25 +28,26 @@ esl_phoneme_pipe = pipeline("automatic-speech-recognition", model="mrrubino/wav2
 # Set up pipe for 2 accent classification models
 classifier = EncoderClassifier.from_hparams(source="Jzuluaga/accent-id-commonaccent_ecapa", savedir="pretrained_models/accent-id-commonaccent_ecapa")
 
-def native_accent_classifier(file):
-    out_prob, score, index, text_lab = classifier.classify_file(file)
+def native_accent_classifier(audio):
+    out_prob, score, index, text_lab = classifier.classify_file(audio)
     return [{'accent': text_lab[0], 'score': round(score[0],2)}]
 
-def esl_accent_classifier(file):
+def esl_accent_classifier(audio):
     esl_accent_pipe = pipeline(
         "audio-classification",
         model="kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2"
     )
-    audio, sr = torchaudio.load(file) # Load audio
-    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
     audio = audio.squeeze().numpy()
     result = esl_accent_pipe(audio, top_k=6)
     return [{'accent': result[0]['label'], 'score': round(result[0]['score'],2)}]
 
-def transcribe_and_classify_speech(audio):
+def transcribe_and_classify_speech(file):
+    audio, sr = torchaudio.load(file) # Load audio
+    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
+
     try:
         asr_output = asr_pipe(
-            audio,
+            file,
             max_new_tokens=256,
             chunk_length_s=30,
             batch_size=8,
@@ -56,13 +57,13 @@ def transcribe_and_classify_speech(audio):
         asr_output = "Error, make sure your file is in mono format"
 
     try:
-        american_phoneme_output = american_phoneme_pipe(audio)['text']
+        american_phoneme_output = american_phoneme_pipe(file)['text']
     except Exception as e:
         print(f"An error occurred with wav2vec2-xls-r-300m-timit-phoneme: {e}")
         american_phoneme_output = "Error, make sure your file is in mono format"
 
     try:
-        esl_phoneme_output = esl_phoneme_pipe(audio)['text']
+        esl_phoneme_output = esl_phoneme_pipe(file)['text']
     except Exception as e:
         print(f"An error occurred with mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme: {e}")
         esl_phoneme_output = "Error"
@@ -97,8 +98,7 @@ examples = [['chinese-american.wav'], ['mexican.wav'], ['vietnamese.wav'], ['ind
 def create_transcription_interface(source):
     with gr.Blocks() as interface:
         gr.Markdown("""
-        Input: Use microphone, upload .wav file, or choose an example below
-        Output will include results from the following models:
+        Use microphone, upload .wav file, or choose an example below. Output will include results from the following models:
         - Transcription from OpenAI's Whisper [openai/whisper-base.en](https://huggingface.co/openai/whisper-base.en)
         - Phonemic transcription trained on native English speakers [vitouphy/wav2vec2-xls-r-300m-timit-phoneme](https://huggingface.co/vitouphy/wav2vec2-xls-r-300m-timit-phoneme)
         - Phonemic transcription trained on speakers of English as a second language [mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme](https://huggingface.co/mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme)
@@ -119,7 +119,7 @@ file_transcribe = create_transcription_interface("upload")
 demo = gr.TabbedInterface(
     [mic_transcribe, file_transcribe],
     ["Microphone Input", "Upload .wav file"],
-    title="Speech Transcription, Phonemic Transcription, and Accent Classification",
+    title="Speech Transcription and Accent Classification",
 )
 
 demo.launch(debug=True)
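
For readers skimming the commit, here is a minimal standalone sketch of the flow after this change. It assumes torchaudio, transformers, and speechbrain are installed and the checkpoints named in the diff are reachable; the returned dict, the single-function layout, and the `speechbrain.inference` import path are illustrative choices for the sketch, not the Space's actual code, which keeps separate classifier functions and wraps everything in a Gradio interface.

```python
# Sketch of the refactored logic: decode and resample the uploaded file once,
# feed the in-memory 16 kHz waveform to the accent classifier, and keep passing
# the file path to the Whisper ASR pipeline and the SpeechBrain classifier.
import torchaudio
from transformers import pipeline
# Older speechbrain releases expose this as speechbrain.pretrained.EncoderClassifier.
from speechbrain.inference.classifiers import EncoderClassifier

asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
esl_accent_pipe = pipeline(
    "audio-classification",
    model="kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2",
)
classifier = EncoderClassifier.from_hparams(
    source="Jzuluaga/accent-id-commonaccent_ecapa",
    savedir="pretrained_models/accent-id-commonaccent_ecapa",
)

def transcribe_and_classify_speech(file):
    # Load once and resample so the classifier sees 16 kHz mono-ish audio.
    audio, sr = torchaudio.load(file)
    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)

    # Whisper accepts the file path directly and handles long audio via chunking.
    asr_output = asr_pipe(file, chunk_length_s=30)["text"]

    # The audio-classification pipeline accepts a 1-D numpy waveform
    # (assumed here to already be at the model's 16 kHz sampling rate).
    accent = esl_accent_pipe(audio.squeeze().numpy(), top_k=1)[0]

    # SpeechBrain's EncoderClassifier classifies straight from the file path.
    out_prob, score, index, text_lab = classifier.classify_file(file)

    return {
        "transcription": asr_output,
        "esl_accent": {accent["label"]: round(accent["score"], 2)},
        "native_accent": {text_lab[0]: round(float(score[0]), 2)},
    }

# Example: results = transcribe_and_classify_speech("mexican.wav")
```

The point of the commit is visible in the sketch: the audio is decoded and resampled once inside transcribe_and_classify_speech, while the ASR and phoneme pipelines continue to receive the file path and only the accent-classification model consumes the resampled in-memory waveform.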