Spaces:

ankandrew
/

whisper

Running on Zero

App Files Files Community

ankandrew committed on 22 days ago

Commit

bef4e11

verified ·

1 Parent(s): b8e512a

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -1

app.py CHANGED Viewed

@@ -15,6 +15,7 @@ import whisper
 YT_AUDIO_FORMAT = "bestaudio[ext=m4a]"
 MODEL_SIZES = ["tiny", "base", "small", "medium", "large", "turbo"]
 for size in MODEL_SIZES:
     whisper.load_model(size, device="cpu")
@@ -73,6 +74,9 @@ def transcribe_audio(
     youtube_url: str,
     return_timestamps: bool,
     temperature: float,
 ):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     results = []
@@ -84,6 +88,9 @@ def transcribe_audio(
             word_timestamps=return_timestamps,
             temperature=temperature,
             verbose=False,
         )
         text = out["text"].strip()
         segments = out["segments"] if return_timestamps else []
@@ -129,6 +136,28 @@ def build_demo() -> gr.Blocks:
                 step=0.01,
             )
         audio_input = gr.Audio(
             label="Upload or record audio",
             sources=["upload"],
@@ -151,7 +180,16 @@ def build_demo() -> gr.Blocks:
         transcribe_btn.click(
             transcribe_audio,
-            inputs=[model_choices, audio_input, yt_input, ts_checkbox, temp_slider],
             outputs=[out_table],
         )

 YT_AUDIO_FORMAT = "bestaudio[ext=m4a]"
 MODEL_SIZES = ["tiny", "base", "small", "medium", "large", "turbo"]
 for size in MODEL_SIZES:
     whisper.load_model(size, device="cpu")
     youtube_url: str,
     return_timestamps: bool,
     temperature: float,
+    logprob_threshold: float = -1.0,
+    no_speech_threshold: float = 0.6,
+    compression_ratio_threshold: float = 2.4,
 ):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     results = []
             word_timestamps=return_timestamps,
             temperature=temperature,
             verbose=False,
+            logprob_threshold=logprob_threshold,
+            no_speech_threshold=no_speech_threshold,
+            compression_ratio_threshold=compression_ratio_threshold,
         )
         text = out["text"].strip()
         segments = out["segments"] if return_timestamps else []
                 step=0.01,
             )
+        logprob_slider = gr.Slider(
+            label="Average log-probability threshold",
+            minimum=-10.0,
+            maximum=0.0,
+            value=-1.0,
+            step=0.1,
+        )
+        no_speech_slider = gr.Slider(
+            label="No-speech probability threshold",
+            minimum=0.0,
+            maximum=1.0,
+            value=0.6,
+            step=0.01,
+        )
+        compression_slider = gr.Slider(
+            label="Compression ratio threshold",
+            minimum=1.0,
+            maximum=5.0,
+            value=2.4,
+            step=0.1,
+        )
         audio_input = gr.Audio(
             label="Upload or record audio",
             sources=["upload"],
         transcribe_btn.click(
             transcribe_audio,
+            inputs=[
+                model_choices,
+                audio_input,
+                yt_input,
+                ts_checkbox,
+                temp_slider,
+                logprob_slider,
+                no_speech_slider,
+                compression_slider,
+            ],
             outputs=[out_table],
         )