import tempfile

import gradio as gr
import torch
import torchaudio
from pydub import AudioSegment, silence
from transformers import WhisperProcessor, WhisperForConditionalGeneration
MODEL_NAME = "dataprizma/whisper-large-v3-turbo"

processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
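# Optional optimization (an assumption, not part of the original setup):
# on CUDA, model.half() together with .half() on the input features cuts
# memory use and speeds up generation, at a small cost in precision.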
def split_on_silence_with_duration_control(audio, min_len, max_len, silence_thresh=-40):
    """Split `audio` into chunks of min_len..max_len ms, cutting at silences when possible."""
    # Midpoints of detected silent stretches become candidate split points.
    silences = silence.detect_silence(audio, min_silence_len=500, silence_thresh=silence_thresh)
    silences = [(start + end) // 2 for start, end in silences]

    chunks = []
    start = 0
    while start < len(audio):
        end = min(start + max_len, len(audio))
        # Prefer the latest silence midpoint that still leaves the chunk
        # at least min_len long; otherwise hard-cut at max_len (or at the
        # end of the audio).
        candidates = [s for s in silences if start + min_len <= s <= end]
        split_point = candidates[-1] if candidates else end
        chunks.append(audio[start:split_point])
        start = split_point
    return chunks
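# Illustrative example (numbers assumed, not from the source): with
# min_len=15000 and max_len=25000 ms, a 60 s recording becomes roughly
# three 15-25 s chunks, each ending at the midpoint of a detected silence
# when one falls inside the allowed window, otherwise hard-cut at 25 s.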
def transcribe(audio_file):
    # Load audio using pydub (ffmpeg is required for non-WAV formats).
    audio = AudioSegment.from_file(audio_file)

    # Convert to mono 16 kHz, the input format Whisper expects.
    if audio.channels > 1:
        audio = audio.set_channels(1)
    if audio.frame_rate != 16000:
        audio = audio.set_frame_rate(16000)

    # Split at silences into 15-25 s chunks, so each chunk fits inside
    # Whisper's 30 s log-mel input window.
    chunks = split_on_silence_with_duration_control(
        audio, min_len=15000, max_len=25000, silence_thresh=-40
    )

    # Transcribe each chunk independently and concatenate the results.
    results = []
    for chunk in chunks:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmpfile:
            chunk.export(tmpfile.name, format="wav")
            waveform, _ = torchaudio.load(tmpfile.name)
            input_features = processor(
                waveform.squeeze().numpy(),
                sampling_rate=16000,
                return_tensors="pt",
            ).input_features.to(device)
            with torch.no_grad():
                # The language hint belongs in generate(); the feature
                # extractor does not take a `language` argument.
                predicted_ids = model.generate(input_features, language="uz")
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            results.append(transcription)
    return " ".join(results)
demo = gr.Blocks()

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Audio file"),
    outputs="text",
    title="Whisper Large V3 Turbo: Transcribe Audio",
    description="Whisper Large V3 Turbo fine-tuned for the Uzbek language by Dataprizma",
)

with demo:
    gr.TabbedInterface([file_transcribe], ["Audio file"])

demo.launch()
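# To try this locally (a sketch, assuming ffmpeg is installed for pydub):
#   pip install torch torchaudio transformers gradio pydub
#   python app.py
# then open the local URL that Gradio prints to the console.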