Spaces:

KDM999
/

asr-multimodel-comparison

Running

App Files Community

KDM999 committed about 1 month ago

Commit

1f97be9

verified ·

1 Parent(s): da0f868

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -42

app.py CHANGED Viewed

@@ -1,15 +1,13 @@
 import gradio as gr
 import spaces
-import torch
-from accelerate import init_empty_weights
 import random
 import json
 from difflib import SequenceMatcher
 from jiwer import wer
 import torchaudio
 from transformers import pipeline
-import os
-import string
 # Load metadata
 with open("common_voice_en_validated_249_hf_ready.json") as f:
@@ -20,19 +18,7 @@ ages = sorted(set(entry["age"] for entry in data))
 genders = sorted(set(entry["gender"] for entry in data))
 accents = sorted(set(entry["accent"] for entry in data))
-# Load ASR pipelines
-pipe_whisper_tiny = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
-pipe_whisper_tiny_en = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")
-pipe_whisper_base = pipeline("automatic-speech-recognition", model="openai/whisper-base")
-pipe_whisper_base_en = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
-pipe_whisper_medium = pipeline("automatic-speech-recognition", model="openai/whisper-medium")
-pipe_whisper_medium_en = pipeline("automatic-speech-recognition", model="openai/whisper-medium.en")
-pipe_distil_whisper_large = pipeline("automatic-speech-recognition", model="distil-whisper/distil-large-v3.5")
-pipe_wav2vec2_base_960h = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
-pipe_wav2vec2_large_960h = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-960h")
-pipe_hubert_large_ls960_ft = pipeline("automatic-speech-recognition", model="facebook/hubert-large-ls960-ft")
-# Functions
 def convert_to_wav(file_path):
     wav_path = file_path.replace(".mp3", ".wav")
     if not os.path.exists(wav_path):
@@ -41,10 +27,6 @@ def convert_to_wav(file_path):
         torchaudio.save(wav_path, waveform, sample_rate)
     return wav_path
-def transcribe(pipe, file_path):
-    result = pipe(file_path)
-    return result["text"].strip().lower()
 def highlight_differences(ref, hyp):
     sm = SequenceMatcher(None, ref.split(), hyp.split())
     result = []
@@ -74,7 +56,7 @@ def generate_audio(age, gender, accent):
     wav_file_path = convert_to_wav(file_path)
     return wav_file_path, wav_file_path
-# Transcribe & Compare
 @spaces.GPU
 def transcribe_audio(file_path):
     if not file_path:
@@ -89,29 +71,33 @@ def transcribe_audio(file_path):
     if not gold:
         return "Reference not found.", "", "", "", "", "", ""
     outputs = {}
-    models = {
-        "openai/whisper-tiny": pipe_whisper_tiny,
-        "openai/whisper-tiny.en": pipe_whisper_tiny_en,
-        "openai/whisper-base": pipe_whisper_base,
-        "openai/whisper-base.en": pipe_whisper_base_en,
-        "openai/whisper-medium": pipe_whisper_medium,
-        "openai/whisper-medium.en": pipe_whisper_medium_en,
-        "distil-whisper/distil-large-v3.5": pipe_distil_whisper_large,
-        "facebook/wav2vec2-base-960h": pipe_wav2vec2_base_960h,
-        "facebook/wav2vec2-large-960h": pipe_wav2vec2_large_960h,
-        "facebook/hubert-large-ls960-ft": pipe_hubert_large_ls960_ft,
-    }
-    for name, model in models.items():
-        text = transcribe(model, file_path)
-        clean = normalize(text)
-        wer_score = wer(gold, clean)
-        outputs[name] = f"<b>{name} (WER: {wer_score:.2f}):</b><br>{highlight_differences(gold, clean)}"
     return (gold, *outputs.values())
-# Gradio Interface
 with gr.Blocks() as demo:
     gr.Markdown("# Comparing ASR Models on Diverse English Speech Samples")
     gr.Markdown("""
@@ -119,7 +105,7 @@ with gr.Blocks() as demo:
         Users can select age, gender, and accent to generate diverse English audio samples.
         The models are evaluated on their ability to transcribe those samples.
         Data is sourced from 249 validated entries in the Common Voice English Delta Segment 21.0 release.
-        """)
     with gr.Row():
         age = gr.Dropdown(choices=ages, label="Age")

 import gradio as gr
 import spaces
 import random
 import json
+import os
+import string
 from difflib import SequenceMatcher
 from jiwer import wer
 import torchaudio
 from transformers import pipeline
 # Load metadata
 with open("common_voice_en_validated_249_hf_ready.json") as f:
 genders = sorted(set(entry["gender"] for entry in data))
 accents = sorted(set(entry["accent"] for entry in data))
+# Utility functions
 def convert_to_wav(file_path):
     wav_path = file_path.replace(".mp3", ".wav")
     if not os.path.exists(wav_path):
         torchaudio.save(wav_path, waveform, sample_rate)
     return wav_path
 def highlight_differences(ref, hyp):
     sm = SequenceMatcher(None, ref.split(), hyp.split())
     result = []
     wav_file_path = convert_to_wav(file_path)
     return wav_file_path, wav_file_path
+# Transcribe & Compare (GPU Decorated)
 @spaces.GPU
 def transcribe_audio(file_path):
     if not file_path:
     if not gold:
         return "Reference not found.", "", "", "", "", "", ""
+    model_ids = [
+        "openai/whisper-tiny",
+        "openai/whisper-tiny.en",
+        "openai/whisper-base",
+        "openai/whisper-base.en",
+        "openai/whisper-medium",
+        "openai/whisper-medium.en",
+        "distil-whisper/distil-large-v3.5",
+        "facebook/wav2vec2-base-960h",
+        "facebook/wav2vec2-large-960h",
+        "facebook/hubert-large-ls960-ft",
+    ]
     outputs = {}
+    for model_id in model_ids:
+        try:
+            pipe = pipeline("automatic-speech-recognition", model=model_id)
+            text = pipe(file_path)["text"].strip().lower()
+            clean = normalize(text)
+            wer_score = wer(gold, clean)
+            outputs[model_id] = f"<b>{model_id} (WER: {wer_score:.2f}):</b><br>{highlight_differences(gold, clean)}"
+        except Exception as e:
+            outputs[model_id] = f"<b>{model_id}:</b><br><span style='color:red'>Error: {str(e)}</span>"
     return (gold, *outputs.values())
+# Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("# Comparing ASR Models on Diverse English Speech Samples")
     gr.Markdown("""
         Users can select age, gender, and accent to generate diverse English audio samples.
         The models are evaluated on their ability to transcribe those samples.
         Data is sourced from 249 validated entries in the Common Voice English Delta Segment 21.0 release.
+    """)
     with gr.Row():
         age = gr.Dropdown(choices=ages, label="Age")