File size: 3,076 Bytes
973bb27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import gradio as gr
import whisper
import torch
import json
import spaces
from datetime import timedelta
import os
import zipfile
from pathlib import Path

def format_timestamp(seconds):
    """Convert a duration in seconds to an SRT timestamp (HH:MM:SS,mmm).

    Works for durations of 24 hours or more: the previous implementation
    read ``timedelta.seconds``, which only covers the part of the duration
    below one day, so anything past 24h silently wrapped around.

    Args:
        seconds: Non-negative duration in seconds (int or float).

    Returns:
        Timestamp string in SRT format, e.g. ``"01:01:01,500"``.
    """
    # Work in integer milliseconds; int() truncates the sub-millisecond
    # remainder, matching the old timedelta.microseconds // 1000 behavior.
    total_ms = int(seconds * 1000)
    hours, total_ms = divmod(total_ms, 3_600_000)
    minutes, total_ms = divmod(total_ms, 60_000)
    secs, milliseconds = divmod(total_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"

def save_files(text, srt, json_data, base_name):
    """Save the transcription as .txt/.srt/.json plus a ZIP bundling all three.

    Files are written into a local ``transcriptions/`` directory, created
    on first use.

    Args:
        text: Plain-text transcription.
        srt: SRT-formatted subtitle text.
        json_data: JSON string of the full transcription result.
        base_name: Name of the source audio file; its stem names the outputs.

    Returns:
        Tuple of string paths: (txt_path, srt_path, json_path, zip_path).
    """
    # Create output directory if it doesn't exist
    output_dir = Path("transcriptions")
    output_dir.mkdir(exist_ok=True)

    # Generate filenames from the audio file's stem
    base_name = Path(base_name).stem
    txt_path = output_dir / f"{base_name}.txt"
    srt_path = output_dir / f"{base_name}.srt"
    json_path = output_dir / f"{base_name}.json"
    zip_path = output_dir / f"{base_name}_all.zip"

    # Explicit UTF-8: write_text defaults to the locale encoding, which can
    # raise UnicodeEncodeError for non-ASCII transcripts (e.g. on Windows).
    txt_path.write_text(text, encoding="utf-8")
    srt_path.write_text(srt, encoding="utf-8")
    json_path.write_text(json_data, encoding="utf-8")

    # Bundle the three outputs into one downloadable ZIP
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        zipf.write(txt_path, txt_path.name)
        zipf.write(srt_path, srt_path.name)
        zipf.write(json_path, json_path.name)

    return str(txt_path), str(srt_path), str(json_path), str(zip_path)

@spaces.GPU
def transcribe(audio_file):
    """Transcribe an audio file with Whisper and persist the results.

    Args:
        audio_file: Path to the uploaded audio file.

    Returns:
        A 7-tuple: (txt path, srt path, json path, zip path,
        plain-text transcript, SRT text, JSON text).
    """
    # Load the model and run the transcription.
    model = whisper.load_model("large-v3-turbo")
    result = model.transcribe(audio_file)

    # Plain-text and JSON views of the result.
    text_output = result["text"]
    json_output = json.dumps(result, indent=2)

    # Assemble the SRT body: "index\nstart --> end\ntext\n\n" per segment.
    srt_blocks = []
    for index, segment in enumerate(result["segments"], start=1):
        start = format_timestamp(segment["start"])
        end = format_timestamp(segment["end"])
        caption = segment["text"].strip()
        srt_blocks.append(f"{index}\n{start} --> {end}\n{caption}\n\n")
    srt_output = "".join(srt_blocks)

    # Write all three formats to disk plus a ZIP bundle.
    txt_file, srt_file, json_file, zip_file = save_files(
        text_output,
        srt_output,
        json_output,
        os.path.basename(audio_file),
    )

    return (
        txt_file,
        srt_file,
        json_file,
        zip_file,
        text_output,
        srt_output,
        json_output,
    )

# Build the Gradio app: one audio upload in, four downloadable files plus
# three inline previews out.
transcription_outputs = [
    gr.File(label="Download TXT"),
    gr.File(label="Download SRT"),
    gr.File(label="Download JSON"),
    gr.File(label="Download All (ZIP)"),
    gr.Textbox(label="Transcription", lines=5),
    gr.Textbox(label="SRT Format"),
    gr.JSON(label="JSON Output"),
]

demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=transcription_outputs,
    title="Audio Transcription with Whisper",
    description=(
        "Upload an audio file to transcribe it into text, SRT, and JSON "
        "formats using OpenAI's Whisper model. You can download the results "
        "in different formats or get everything in a ZIP file."
    ),
)

if __name__ == "__main__":
    # Launch the Gradio server only when run as a script (not on import).
    demo.launch()