"""
Demo that runs OpenAI Whisper on Hugging Face ZeroGPU.

It lets us test the stock Whisper models released by OpenAI, for later comparison with fine-tuned ones.
"""

import subprocess
import tempfile
from pathlib import Path

import gradio as gr
import spaces
import torch
import whisper
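
# The imports above are assumed to be provided by the Space (usual package names,
# not pinned here): gradio, spaces, torch, openai-whisper; yt-dlp and ffmpeg are
# also assumed to be on PATH for the YouTube download and audio decoding paths.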

# yt-dlp format selector: best audio-only stream available in m4a.
YT_AUDIO_FORMAT = "bestaudio[ext=m4a]"


def download_youtube(url: str, tmp_dir: Path) -> Path:
    """Download the audio track from a YouTube video and return the local path."""
    out_path = tmp_dir / "%(id)s.%(ext)s"
    cmd = [
        "yt-dlp",
        "--quiet",
        "--no-warnings",
        "--extract-audio",
        "--audio-format",
        "m4a",
        "--audio-quality",
        "0",
        "-f",
        YT_AUDIO_FORMAT,
        "-o",
        str(out_path),
        url,
    ]
    # check=False so a failure surfaces yt-dlp's stderr instead of a bare CalledProcessError.
    result = subprocess.run(cmd, capture_output=True, check=False)
    if result.returncode != 0:
        raise RuntimeError(f"yt-dlp failed: {result.stderr.decode()}")

    files = list(tmp_dir.glob("*.m4a"))
    if not files:
        raise FileNotFoundError("Could not locate downloaded audio.")
    return files[0]


def _get_input_path(audio, youtube_url):
    """Return a local audio path, downloading from YouTube if a URL was given."""
    if youtube_url and youtube_url.strip():
        # Use mkdtemp rather than a TemporaryDirectory context manager: the latter
        # would delete the downloaded file before Whisper gets to read it.
        return download_youtube(youtube_url, Path(tempfile.mkdtemp()))
    if audio is not None:
        return audio
    raise gr.Error("Provide an audio file or a YouTube URL.")


def make_results_table(results):
    """Flatten the per-model result dicts into rows for the results dataframe."""
    return [[r["model"], r["language"], r["text"]] for r in results]


# On ZeroGPU Spaces, @spaces.GPU requests a GPU only for the duration of this call.
@spaces.GPU
def transcribe_audio(
    model_sizes: list[str],
    audio: str,
    youtube_url: str,
    return_timestamps: bool,
    temperature: float,
):
    """Transcribe the input with each selected Whisper model size and return table rows."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Resolve the input once so a YouTube URL is not re-downloaded for every model size.
    inp = _get_input_path(audio, youtube_url)
    results = []
    for size in model_sizes:
        model = whisper.load_model(size, device=device)
        out = model.transcribe(
            str(inp),
            word_timestamps=return_timestamps,
            temperature=temperature,
            verbose=False,
        )
        text = out["text"].strip()
        segments = out["segments"] if return_timestamps else []
        results.append(
            {
                "model": size,
                "language": out["language"],
                "text": text,
                "segments": segments,
            }
        )
    df_results = make_results_table(results)
    return df_results


def build_demo() -> gr.Blocks:
    with gr.Blocks(title="🗣️ Whisper Transcription Demo (HF Spaces Zero-GPU)") as whisper_demo:
        gr.Markdown("""
        # Whisper Transcription Demo

        Run Whisper transcription on an uploaded audio file or a YouTube video. Whisper is a
        general-purpose speech recognition model trained on a large, diverse dataset of audio.
        """)

        with gr.Row():
            model_choices = gr.Dropdown(
                label="Model size(s)",
                choices=["tiny", "base", "small", "medium", "large", "turbo"],
                value=["turbo"],
                multiselect=True,
                allow_custom_value=False,
            )
            # Word timestamps are collected by transcribe_audio but not yet shown in
            # the results table, so the checkbox stays disabled for now.
            ts_checkbox = gr.Checkbox(
                label="Return word timestamps",
                interactive=False,
                value=False,
            )
            temp_slider = gr.Slider(
                label="Decoding temperature",
                minimum=0.0,
                maximum=1.0,
                value=0.0,
                step=0.01,
            )

        audio_input = gr.Audio(
            label="Upload an audio file",
            sources=["upload"],
            type="filepath",
        )

        yt_input = gr.Textbox(
            label="... or paste a YouTube URL (audio only)",
            placeholder="https://youtu.be/XYZ",
        )

        with gr.Row():
            transcribe_btn = gr.Button("Transcribe 🏁")

        out_table = gr.Dataframe(
            headers=["Model", "Language", "Transcript"],
            datatype=["str", "str", "str"],
            label="Transcription Results",
        )

        transcribe_btn.click(
            transcribe_audio,
            inputs=[model_choices, audio_input, yt_input, ts_checkbox, temp_slider],
            outputs=[out_table],
        )

    return whisper_demo


if __name__ == "__main__":
    demo = build_demo()
    demo.launch()
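

# Local run sketch (assumptions: this file is saved as app.py and the packages listed
# near the imports are installed; outside a ZeroGPU Space the spaces.GPU decorator
# should behave as a no-op, so the demo falls back to whatever device torch detects):
#
#     pip install gradio spaces torch openai-whisper yt-dlp
#     python app.py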