Create app.py
app.py
ADDED
@@ -0,0 +1,159 @@
"""
Demo to run OpenAI Whisper using Hugging Face ZeroGPU.

This lets us test the default Whisper models provided by OpenAI, for later comparison with fine-tuned ones.
"""

import subprocess
import tempfile
from pathlib import Path

import gradio as gr
import spaces
import torch
import whisper

YT_AUDIO_FORMAT = "bestaudio[ext=m4a]"


def download_youtube(url: str, tmp_dir: Path) -> Path:
    """Download the audio track from a YouTube video and return the local path."""
    out_path = tmp_dir / "%(id)s.%(ext)s"  # yt-dlp output template
    cmd = [
        "yt-dlp",
        "--quiet",
        "--no-warnings",
        "--extract-audio",
        "--audio-format",
        "m4a",
        "--audio-quality",
        "0",
        "-f",
        YT_AUDIO_FORMAT,
        "-o",
        str(out_path),
        url,
    ]
    # check=False so we can raise our own error with yt-dlp's stderr attached.
    result = subprocess.run(cmd, capture_output=True, check=False)
    if result.returncode != 0:
        raise RuntimeError(f"yt-dlp failed: {result.stderr.decode()}")

    files = list(tmp_dir.glob("*.m4a"))
    if not files:
        raise FileNotFoundError("Could not locate downloaded audio.")
    return files[0]


def _get_input_path(audio, youtube_url):
    if youtube_url and youtube_url.strip():
        # Use a directory that outlives this function so the file still exists
        # when Whisper reads it; a `with TemporaryDirectory()` block would
        # delete the download before transcription.
        tmp_dir = Path(tempfile.mkdtemp())
        return download_youtube(youtube_url, tmp_dir)
    elif audio is not None:
        return audio
    else:
        raise gr.Error("Provide audio or a YouTube URL")


def make_results_table(results):
    rows = []
    for r in results:
        row = [r["model"], r["language"], r["text"]]
        rows.append(row)
    return rows


@spaces.GPU
def transcribe_audio(
    model_sizes: list[str],
    audio: str,
    youtube_url: str,
    return_timestamps: bool,
    temperature: float,
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Resolve (and, for YouTube, download) the input once, then reuse it for every model.
    inp = _get_input_path(audio, youtube_url)
    results = []
    for size in model_sizes:
        model = whisper.load_model(size, device=device)
        out = model.transcribe(
            str(inp),
            word_timestamps=return_timestamps,
            temperature=temperature,
            verbose=False,
        )
        text = out["text"].strip()
        segments = out["segments"] if return_timestamps else []
        results.append(
            {
                "model": size,
                "language": out["language"],
                "text": text,
                "segments": segments,
            }
        )
    df_results = make_results_table(results)
    return df_results


def build_demo() -> gr.Blocks:
    with gr.Blocks(title="🗣️ Whisper Transcription Demo (HF Spaces Zero-GPU)") as whisper_demo:
        gr.Markdown("""
        # Whisper Transcription Demo

        Run Whisper transcription on uploaded audio or a YouTube video. Whisper is a general-purpose
        speech recognition model trained on a large dataset of diverse audio.
        """)

        with gr.Row():
            model_choices = gr.Dropdown(
                label="Model size(s)",
                choices=["tiny", "base", "small", "medium", "large", "turbo"],
                value=["turbo"],
                multiselect=True,
                allow_custom_value=False,
            )
            ts_checkbox = gr.Checkbox(
                label="Return word timestamps",
                interactive=False,
                value=False,
            )
            temp_slider = gr.Slider(
                label="Decoding temperature",
                minimum=0.0,
                maximum=1.0,
                value=0.0,
                step=0.01,
            )

        audio_input = gr.Audio(
            label="Upload or record audio",
            sources=["upload"],
            type="filepath",
        )

        yt_input = gr.Textbox(
            label="... or paste a YouTube URL (audio only)",
            placeholder="https://youtu.be/XYZ",
        )

        with gr.Row():
            transcribe_btn = gr.Button("Transcribe 🏁")

        out_table = gr.Dataframe(
            headers=["Model", "Language", "Transcript"],
            datatype=["str", "str", "str"],
            label="Transcription Results",
        )

        transcribe_btn.click(
            transcribe_audio,
            inputs=[model_choices, audio_input, yt_input, ts_checkbox, temp_slider],
            outputs=[out_table],
        )

    return whisper_demo


if __name__ == "__main__":
    demo = build_demo()
    demo.launch()
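
For quick local sanity-checking outside the Space, the call that transcribe_audio wraps can be exercised directly with the openai-whisper package. The snippet below is a minimal sketch, not part of app.py: it assumes openai-whisper is installed and that a local file named audio.m4a exists (a placeholder name), and it skips the Gradio and ZeroGPU layers entirely.

# Minimal sketch (not part of app.py): the bare openai-whisper call that
# transcribe_audio() performs for each selected model size.
# Assumes `pip install openai-whisper` and a local placeholder file audio.m4a.
import whisper

model = whisper.load_model("tiny")  # smallest checkpoint; runs on CPU
out = model.transcribe("audio.m4a", temperature=0.0, verbose=False)
print(out["language"], out["text"].strip())

Running the full app additionally assumes gradio, spaces, and torch are installed and that the yt-dlp and ffmpeg binaries are available on PATH, matching the imports and the subprocess call in download_youtube.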