"""
Demo to run OpenAI Whisper using HuggingFace ZeroGPU.

This way we can test default Whisper models provided by OpenAI, for later comparison with fine-tuned ones.
"""

import subprocess
import tempfile
from pathlib import Path

import gradio as gr
import spaces
import torch
import whisper

YT_AUDIO_FORMAT = "bestaudio[ext=m4a]"


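# Pre-load every model size on CPU at startup so the checkpoints are downloaded
# and cached before any GPU-backed transcription call needs them.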
MODEL_SIZES = ["tiny", "base", "small", "medium", "large", "turbo"]
for size in MODEL_SIZES:
    whisper.load_model(size, device="cpu")


def download_youtube(url: str, tmp_dir: Path) -> Path:
    """Download the audio track from a YouTube video and return the local path."""
    # yt-dlp output template: the video id and extension are filled in by yt-dlp.
    out_path = tmp_dir / "%(id)s.%(ext)s"
    cmd = [
        "yt-dlp",
        "--quiet",
        "--no-warnings",
        "--extract-audio",
        "--audio-format",
        "m4a",
        "--audio-quality",
        "0",
        "-f",
        YT_AUDIO_FORMAT,
        "-o",
        str(out_path),
        url,
    ]
    result = subprocess.run(cmd, capture_output=True, check=False)
    if result.returncode != 0:
        raise RuntimeError(f"yt-dlp failed: {result.stderr.decode()}")

    files = list(tmp_dir.glob("*.m4a"))
    if not files:
        raise FileNotFoundError("Could not locate downloaded audio.")
    return files[0]


def _get_input_path(audio, youtube_url):
    if youtube_url and youtube_url.strip():
        # Use a persistent temporary directory: a `with TemporaryDirectory()` block
        # would delete the downloaded file before Whisper could read it.
        tmp_dir = Path(tempfile.mkdtemp())
        return download_youtube(youtube_url, tmp_dir)
    elif audio is not None:
        return audio
    else:
        raise gr.Error("Provide audio or a YouTube URL")


def make_results_table(results):
    rows = []
    for r in results:
        row = [r["model"], r["language"], r["text"]]
        rows.append(row)
    return rows


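# The spaces.GPU decorator requests a ZeroGPU device for the duration of this call.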
@spaces.GPU
def transcribe_audio(
    model_sizes: list[str],
    audio: str,
    youtube_url: str,
    return_timestamps: bool,
    temperature: float,
    logprob_threshold: float = -1.0,
    no_speech_threshold: float = 0.6,
    compression_ratio_threshold: float = 2.4,
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    results = []
    # Resolve the input once so a YouTube URL is only downloaded a single time.
    inp = _get_input_path(audio, youtube_url)
    for size in model_sizes:
        model = whisper.load_model(size, device=device)
        out = model.transcribe(
            str(inp),
            word_timestamps=return_timestamps,
            temperature=temperature,
            verbose=False,
            logprob_threshold=logprob_threshold,
            no_speech_threshold=no_speech_threshold,
            compression_ratio_threshold=compression_ratio_threshold,
        )
        text = out["text"].strip()
        segments = out["segments"] if return_timestamps else []
        results.append(
            {
                "model": size,
                "language": out["language"],
                "text": text,
                "segments": segments,
            }
        )
    df_results = make_results_table(results)
    return df_results


def build_demo() -> gr.Blocks:
    with gr.Blocks(title="🗣️ Whisper Transcription Demo (HF Spaces Zero-GPU)") as whisper_demo:
        gr.Markdown("""
        # Whisper Transcription Demo

        Run Whisper transcription on uploaded audio or a YouTube video. Whisper is a general-purpose
        speech recognition model trained on a large dataset of diverse audio.
        """)

        with gr.Row():
            model_choices = gr.Dropdown(
                label="Model size(s)",
                choices=MODEL_SIZES,
                value=["turbo"],
                multiselect=True,
                allow_custom_value=False,
            )
            ts_checkbox = gr.Checkbox(
                label="Return word timestamps",
                interactive=False,
                value=False,
            )
            temp_slider = gr.Slider(
                label="Decoding temperature",
                minimum=0.0,
                maximum=1.0,
                value=0.0,
                step=0.01,
            )

        logprob_slider = gr.Slider(
            label="Average log-probability threshold",
            minimum=-10.0,
            maximum=0.0,
            value=-1.0,
            step=0.1,
        )
        no_speech_slider = gr.Slider(
            label="No-speech probability threshold",
            minimum=0.0,
            maximum=1.0,
            value=0.6,
            step=0.01,
        )
        compression_slider = gr.Slider(
            label="Compression ratio threshold",
            minimum=1.0,
            maximum=5.0,
            value=2.4,
            step=0.1,
        )

        audio_input = gr.Audio(
            label="Upload or record audio",
            sources=["upload"],
            type="filepath",
        )

        yt_input = gr.Textbox(
            label="... or paste a YouTube URL (audio only)",
            placeholder="https://youtu.be/XYZ",
        )

        with gr.Row():
            transcribe_btn = gr.Button("Transcribe 🏁")

        out_table = gr.Dataframe(
            headers=["Model", "Language", "Transcript"],
            datatype=["str", "str", "str"],
            label="Transcription Results",
        )

        transcribe_btn.click(
            transcribe_audio,
            inputs=[
                model_choices,
                audio_input,
                yt_input,
                ts_checkbox,
                temp_slider,
                logprob_slider,
                no_speech_slider,
                compression_slider,
            ],
            outputs=[out_table],
        )

    return whisper_demo


if __name__ == "__main__":
    demo = build_demo()
    demo.launch()