sanchit-gandhi committed
Commit 9f7a693 · 1 Parent(s): 9e87dbe

Update app.py

Files changed (1)
  1. app.py +11 -4
app.py CHANGED
@@ -1,9 +1,11 @@
 import torch
+import torch.nn.functional as F
 
 from transformers import WhisperForConditionalGeneration, WhisperProcessor
 from transformers.models.whisper.tokenization_whisper import LANGUAGES
 from transformers.pipelines.audio_utils import ffmpeg_read
 
+import librosa
 import gradio as gr
 
 
@@ -43,18 +45,20 @@ def transcribe(Microphone, File_Upload):
 
     audio_data = process_audio_file(file)
 
-    input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
+    input_features = processor(audio_data, return_tensors="pt").input_features
 
     with torch.no_grad():
        logits = model.forward(input_features, decoder_input_ids=decoder_input_ids).logits
 
     pred_ids = torch.argmax(logits, dim=-1)
+    probability = F.softmax(logits, dim=-1).max()
+
     lang_ids = processor.decode(pred_ids[0])
 
     lang_ids = lang_ids.lstrip("<|").rstrip("|>")
-    language = LANGUAGES[lang_ids]
+    language = LANGUAGES.get(lang_ids, "not detected")
 
-    return language
+    return language.capitalize(), probability.cpu().numpy()
 
 
 iface = gr.Interface(
@@ -63,7 +67,10 @@ iface = gr.Interface(
         gr.inputs.Audio(source="microphone", type='filepath', optional=True),
         gr.inputs.Audio(source="upload", type='filepath', optional=True),
     ],
-    outputs="text",
+    outputs=[
+        gr.outputs.Textbox(label="Language"),
+        gr.Number(label="Probability"),
+    ],
     layout="horizontal",
     theme="huggingface",
     title="Whisper Language Identification",