"""
Demo to run OpenAI Whisper using HuggingFace ZeroGPU.
This way we can test default Whisper models provided by OpenAI, for later comparison with fine-tuned ones.
"""
import subprocess
import tempfile
from pathlib import Path

import gradio as gr
import spaces
import torch
import whisper

YT_AUDIO_FORMAT = "bestaudio[ext=m4a]"
MODEL_SIZES = ["tiny", "base", "small", "medium", "large", "turbo"]

# Pre-load every checkpoint on CPU at startup so the weights are downloaded
# and cached before the first GPU request arrives.
for size in MODEL_SIZES:
    whisper.load_model(size, device="cpu")


def download_youtube(url: str, tmp_dir: Path) -> Path:
    """Download the audio track from a YouTube video and return the local path."""
    out_path = tmp_dir / "%(id)s.%(ext)s"  # yt-dlp output template
    cmd = [
        "yt-dlp",
        "--quiet",
        "--no-warnings",
        "--extract-audio",
        "--audio-format",
        "m4a",
        "--audio-quality",
        "0",
        "-f",
        YT_AUDIO_FORMAT,
        "-o",
        str(out_path),
        url,
    ]
    # check=False so we can surface yt-dlp's stderr in our own error message;
    # with check=True the returncode test below would be dead code.
    result = subprocess.run(cmd, capture_output=True, check=False)
    if result.returncode != 0:
        raise RuntimeError(f"yt-dlp failed: {result.stderr.decode()}")
    files = list(tmp_dir.glob("*.m4a"))
    if not files:
        raise FileNotFoundError("Could not locate downloaded audio.")
    return files[0]
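

# For reference, the command list above is equivalent to this shell invocation
# (the URL is a placeholder):
#   yt-dlp --quiet --no-warnings --extract-audio --audio-format m4a \
#       --audio-quality 0 -f "bestaudio[ext=m4a]" -o "<tmp_dir>/%(id)s.%(ext)s" <URL>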


def _get_input_path(audio, youtube_url):
    """Resolve the transcription input, preferring a YouTube URL over uploaded audio."""
    if youtube_url and youtube_url.strip():
        # mkdtemp (rather than a TemporaryDirectory context manager) keeps the
        # downloaded file alive after this function returns; a context manager
        # would delete the directory before Whisper could read the file.
        tmp_dir = Path(tempfile.mkdtemp())
        return download_youtube(youtube_url, tmp_dir)
    if audio is not None:
        return audio
    raise gr.Error("Provide audio or a YouTube URL")


def make_results_table(results: list[dict]) -> list[list[str]]:
    """Flatten per-model results into rows for the output Dataframe."""
    return [[r["model"], r["language"], r["text"]] for r in results]
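
# Example (hypothetical values):
#   make_results_table([{"model": "tiny", "language": "en", "text": "hi", "segments": []}])
#   # -> [["tiny", "en", "hi"]]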


@spaces.GPU
def transcribe_audio(
    model_sizes: list[str],
    audio: str,
    youtube_url: str,
    return_timestamps: bool,
    temperature: float,
    logprob_threshold: float = -1.0,
    no_speech_threshold: float = 0.6,
    compression_ratio_threshold: float = 2.4,
):
    """Transcribe the input with every selected model size and return table rows."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Resolve the input once, outside the loop, so a YouTube URL is downloaded
    # a single time even when several model sizes are selected.
    inp = _get_input_path(audio, youtube_url)
    results = []
    for size in model_sizes:
        model = whisper.load_model(size, device=device)
        out = model.transcribe(
            str(inp),
            word_timestamps=return_timestamps,
            temperature=temperature,
            verbose=False,
            logprob_threshold=logprob_threshold,
            no_speech_threshold=no_speech_threshold,
            compression_ratio_threshold=compression_ratio_threshold,
        )
        results.append(
            {
                "model": size,
                "language": out["language"],
                "text": out["text"].strip(),
                "segments": out["segments"] if return_timestamps else [],
            }
        )
    return make_results_table(results)
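

# Standalone usage sketch, outside Gradio ("sample.m4a" and the output row are
# hypothetical):
#   rows = transcribe_audio(["tiny"], "sample.m4a", "", False, 0.0)
#   # -> [["tiny", "en", "transcribed text ..."]]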


def build_demo() -> gr.Blocks:
    with gr.Blocks(title="🗣️ Whisper Transcription Demo (HF Spaces Zero-GPU)") as whisper_demo:
        gr.Markdown(
            """
            # Whisper Transcription Demo

            Run Whisper transcription on uploaded audio or a YouTube video. Whisper is a
            general-purpose speech recognition model trained on a large dataset of diverse audio.
            """
        )
        with gr.Row():
            model_choices = gr.Dropdown(
                label="Model size(s)",
                choices=MODEL_SIZES,
                value=["turbo"],
                multiselect=True,
                allow_custom_value=False,
            )
            # Left non-interactive: segments are collected but the results
            # table only displays the plain transcript.
            ts_checkbox = gr.Checkbox(
                label="Return word timestamps",
                interactive=False,
                value=False,
            )
            temp_slider = gr.Slider(
                label="Decoding temperature",
                minimum=0.0,
                maximum=1.0,
                value=0.0,
                step=0.01,
            )
            logprob_slider = gr.Slider(
                label="Average log-probability threshold",
                minimum=-10.0,
                maximum=0.0,
                value=-1.0,
                step=0.1,
            )
            no_speech_slider = gr.Slider(
                label="No-speech probability threshold",
                minimum=0.0,
                maximum=1.0,
                value=0.6,
                step=0.01,
            )
            compression_slider = gr.Slider(
                label="Compression ratio threshold",
                minimum=1.0,
                maximum=5.0,
                value=2.4,
                step=0.1,
            )
        audio_input = gr.Audio(
            label="Upload audio",
            sources=["upload"],
            type="filepath",
        )
        yt_input = gr.Textbox(
            label="... or paste a YouTube URL (audio only)",
            placeholder="https://youtu.be/XYZ",
        )
        with gr.Row():
            transcribe_btn = gr.Button("Transcribe 🏁")
        out_table = gr.Dataframe(
            headers=["Model", "Language", "Transcript"],
            datatype=["str", "str", "str"],
            label="Transcription Results",
        )
        transcribe_btn.click(
            transcribe_audio,
            inputs=[
                model_choices,
                audio_input,
                yt_input,
                ts_checkbox,
                temp_slider,
                logprob_slider,
                no_speech_slider,
                compression_slider,
            ],
            outputs=[out_table],
        )
    return whisper_demo
if __name__ == "__main__":
demo = build_demo()
demo.launch()