Spaces:

KDM999
/

asr-multimodel-comparison

Running

App Files Community

KDM999 committed about 1 month ago

Commit

b6c2d8b

verified ·

1 Parent(s): 9b1fd29

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -11

app.py CHANGED Viewed

@@ -20,10 +20,15 @@ accents = sorted(set(entry["accent"] for entry in data))
 # Load ASR pipelines
 device = 0
-pipe_whisper_medium = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device, generate_kwargs={"language": "en"})
-pipe_whisper_base = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device, generate_kwargs={"language": "en"})
-pipe_whisper_tiny = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=device, generate_kwargs={"language": "en"})
 pipe_wav2vec2_base_960h = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=device)
 pipe_hubert_large_ls960_ft = pipeline("automatic-speech-recognition", model="facebook/hubert-large-ls960-ft", device=device)
 # Functions
@@ -84,10 +89,15 @@ def transcribe_audio(file_path):
     outputs = {}
     models = {
-        "openai/whisper-medium": pipe_whisper_medium,
-        "openai/whisper-base": pipe_whisper_base,
         "openai/whisper-tiny": pipe_whisper_tiny,
         "facebook/wav2vec2-base-960h": pipe_wav2vec2_base_960h,
         "facebook/hubert-large-ls960-ft": pipe_hubert_large_ls960_ft,
     }
@@ -122,10 +132,16 @@ with gr.Blocks() as demo:
     transcribe_btn = gr.Button("Transcribe with All Models")
     gold_text = gr.Textbox(label="Reference (Gold Standard)")
-    whisper_medium_html = gr.HTML(label="Whisper Medium")
-    whisper_base_html = gr.HTML(label="Whisper Base")
     whisper_tiny_html = gr.HTML(label="Whisper Tiny")
-    wav2vec_html = gr.HTML(label="Wav2Vec2 Base")
     hubert_html = gr.HTML(label="HuBERT Large")
     transcribe_btn.click(
@@ -133,10 +149,15 @@ with gr.Blocks() as demo:
         inputs=[file_path_output],
         outputs=[
             gold_text,
-            whisper_medium_html,
-            whisper_base_html,
             whisper_tiny_html,
-            wav2vec_html,
             hubert_html,
         ],
     )

 # Load ASR pipelines
 device = 0
+pipe_whisper_tiny = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=device)
+pipe_whisper_tiny_en = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en", device=device)
+pipe_whisper_base = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
+pipe_whisper_base_en = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", device=device)
+pipe_whisper_medium = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device)
+pipe_whisper_medium_en = pipeline("automatic-speech-recognition", model="openai/whisper-medium.en", device=device)
+pipe_distil_whisper_large = pipeline("automatic-speech-recognition", model="distil-whisper/distil-large-v3.5", device=device)
 pipe_wav2vec2_base_960h = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=device)
+pipe_wav2vec2_large_960h = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-960h", device=device)
 pipe_hubert_large_ls960_ft = pipeline("automatic-speech-recognition", model="facebook/hubert-large-ls960-ft", device=device)
 # Functions
     outputs = {}
     models = {
         "openai/whisper-tiny": pipe_whisper_tiny,
+        "openai/whisper-tiny.en": pipe_whisper_tiny_en,
+        "openai/whisper-base": pipe_whisper_base,
+        "openai/whisper-base.en": pipe_whisper_base_en,
+        "openai/whisper-medium": pipe_whisper_medium,
+        "openai/whisper-medium.en": pipe_whisper_medium_en,
+        "distil-whisper/distil-large-v3.5": pipe_distil_whisper_large,
         "facebook/wav2vec2-base-960h": pipe_wav2vec2_base_960h,
+        "facebook/wav2vec2-large-960h": pipe_wav2vec2_large_960h,
         "facebook/hubert-large-ls960-ft": pipe_hubert_large_ls960_ft,
     }
     transcribe_btn = gr.Button("Transcribe with All Models")
     gold_text = gr.Textbox(label="Reference (Gold Standard)")
     whisper_tiny_html = gr.HTML(label="Whisper Tiny")
+    whisper_tiny_en_html = gr.HTML(label="Whisper Tiny English")
+    whisper_base_html = gr.HTML(label="Whisper Base")
+    whisper_base_en_html = gr.HTML(label="Whisper Base English")
+    whisper_medium_html = gr.HTML(label="Whisper Medium")
+    whisper_medium_en_html = gr.HTML(label="Whisper Medium English")
+    distil_html = gr.HTML(label="Distil-Whisper Large")
+    wav2vec_base_html = gr.HTML(label="Wav2Vec2 Base")
+    wav2vec_large_html = gr.HTML(label="Wav2Vec2 Large")
     hubert_html = gr.HTML(label="HuBERT Large")
     transcribe_btn.click(
         inputs=[file_path_output],
         outputs=[
             gold_text,
             whisper_tiny_html,
+            whisper_tiny_en_html,
+            whisper_base_html,
+            whisper_base_en_html,
+            whisper_medium_html,
+            whisper_medium_en_html,
+            distil_html,
+            wav2vec_base_html,
+            wav2vec_large_html,
             hubert_html,
         ],
     )