DontFreakOut committed · Commit 4855128 · 1 Parent: d40ccca
updating logic
app.py CHANGED
@@ -28,25 +28,26 @@ esl_phoneme_pipe = pipeline("automatic-speech-recognition", model="mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme")
 # Set up pipe for 2 accent classification models
 classifier = EncoderClassifier.from_hparams(source="Jzuluaga/accent-id-commonaccent_ecapa", savedir="pretrained_models/accent-id-commonaccent_ecapa")
 
-def native_accent_classifier(file):
-    out_prob, score, index, text_lab = classifier.classify_file(file)
+def native_accent_classifier(audio):
+    out_prob, score, index, text_lab = classifier.classify_file(audio)
     return [{'accent': text_lab[0], 'score': round(score[0],2)}]
 
-def esl_accent_classifier(file):
+def esl_accent_classifier(audio):
     esl_accent_pipe = pipeline(
         "audio-classification",
         model="kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2"
     )
-    audio, sr = torchaudio.load(file) # Load audio
-    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
     audio = audio.squeeze().numpy()
     result = esl_accent_pipe(audio, top_k=6)
     return [{'accent': result[0]['label'], 'score': round(result[0]['score'],2)}]
 
-def transcribe_and_classify_speech(audio):
+def transcribe_and_classify_speech(file):
+    audio, sr = torchaudio.load(file) # Load audio
+    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
+
     try:
         asr_output = asr_pipe(
-            audio,
+            file,
             max_new_tokens=256,
             chunk_length_s=30,
             batch_size=8,
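A note on the resampling added above: when the transformers audio-classification pipeline receives a bare NumPy array, it assumes the audio is already at the model's expected rate (16 kHz here), which is why the Resample step matters. A minimal sketch that makes the rate explicit via the pipeline's dict input form (assumes a local example.wav; this is not part of app.py):

```python
# Sketch: feed the accent classifier a raw array with an explicit sampling rate.
# "example.wav" is a hypothetical local file, not one of the app's assets.
import torchaudio
from transformers import pipeline

esl_accent_pipe = pipeline(
    "audio-classification",
    model="kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2",
)

audio, sr = torchaudio.load("example.wav")  # tensor of shape (channels, frames)
audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
audio = audio.squeeze().numpy()  # 1-D array for a mono file

# The dict form pins the sampling rate instead of relying on the model default:
result = esl_accent_pipe({"raw": audio, "sampling_rate": 16000}, top_k=6)
print(result[0]["label"], round(result[0]["score"], 2))
```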
@@ -56,13 +57,13 @@ def transcribe_and_classify_speech(audio):
         asr_output = "Error, make sure your file is in mono format"
 
     try:
-        american_phoneme_output = american_phoneme_pipe(audio)['text']
+        american_phoneme_output = american_phoneme_pipe(file)['text']
     except Exception as e:
         print(f"An error occurred with wav2vec2-xls-r-300m-timit-phoneme: {e}")
         american_phoneme_output = "Error, make sure your file is in mono format"
 
     try:
-        esl_phoneme_output = esl_phoneme_pipe(audio)['text']
+        esl_phoneme_output = esl_phoneme_pipe(file)['text']
     except Exception as e:
         print(f"An error occurred with mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme: {e}")
         esl_phoneme_output = "Error"
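Both except branches above blame non-mono input: audio.squeeze() only yields the 1-D array the downstream models expect when the file has a single channel, so a stereo upload keeps shape (2, N) and fails. A sketch of one way to harden the loading step, assuming stereo uploads are the failure mode these error messages describe:

```python
# Sketch: downmix multi-channel audio to mono before squeeze(), so stereo
# uploads don't trip the "mono format" error paths above.
import torchaudio

audio, sr = torchaudio.load("example.wav")   # shape (channels, frames)
if audio.shape[0] > 1:
    audio = audio.mean(dim=0, keepdim=True)  # average channels down to mono
audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
audio = audio.squeeze().numpy()              # now guaranteed 1-D
```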
@@ -97,8 +98,7 @@ examples = [['chinese-american.wav'], ['mexican.wav'], ['vietnamese.wav'], ['ind
 def create_transcription_interface(source):
     with gr.Blocks() as interface:
         gr.Markdown("""
-
-        Output will include results from the following models:
+        Use microphone, upload .wav file, or choose an example below. Output will include results from the following models:
         - Transcription from OpenAI's Whisper [openai/whisper-base.en](https://huggingface.co/openai/whisper-base.en)
         - Phonemic transcription trained on native English speakers [vitouphy/wav2vec2-xls-r-300m-timit-phoneme](https://huggingface.co/vitouphy/wav2vec2-xls-r-300m-timit-phoneme)
         - Phonemic transcription trained on speakers of English as a second language [mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme](https://huggingface.co/mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme)
@@ -119,7 +119,7 @@ file_transcribe = create_transcription_interface("upload")
 demo = gr.TabbedInterface(
     [mic_transcribe, file_transcribe],
     ["Microphone Input", "Upload .wav file"],
-    title="Speech Transcription",
+    title="Speech Transcription and Accent Classification",
 )
 
 demo.launch(debug=True)
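For context, the title= keyword added in the last hunk is the page-level heading of gr.TabbedInterface. A self-contained sketch of the same wiring, with hypothetical stand-in tabs in place of the app's transcription interfaces:

```python
# Sketch of the TabbedInterface pattern used in app.py; echo() and the tab
# names are hypothetical stand-ins, only the wiring mirrors the real app.
import gradio as gr

def echo(text):
    return text

tab_a = gr.Interface(fn=echo, inputs="text", outputs="text")
tab_b = gr.Interface(fn=echo, inputs="text", outputs="text")

demo = gr.TabbedInterface(
    [tab_a, tab_b],
    ["Tab A", "Tab B"],
    title="Speech Transcription and Accent Classification",  # page heading
)

if __name__ == "__main__":
    demo.launch()
```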