# whisper-asr-uz/app.py
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import gradio as gr
from pydub import AudioSegment, silence
import tempfile
import torch
import torchaudio
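
# Load the Uzbek fine-tuned Whisper checkpoint and move it to GPU when available.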
MODEL_NAME = "dataprizma/whisper-large-v3-turbo"
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
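
# Whisper encodes at most ~30 s of audio per forward pass, so long recordings are
# split at silences into 15-25 s chunks and transcribed chunk by chunk.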
def split_on_silence_with_duration_control(audio, min_len, max_len, silence_thresh=-40):
    """Split `audio` into chunks of min_len..max_len ms, cutting at silences when possible."""
    # Midpoints (in ms) of every silent stretch of at least 500 ms.
    silences = silence.detect_silence(audio, min_silence_len=500, silence_thresh=silence_thresh)
    silences = [(start + end) // 2 for start, end in silences]
    chunks = []
    start = 0
    while start < len(audio):
        end = min(start + max_len, len(audio))
        # Prefer the latest silence midpoint that keeps the chunk within bounds;
        # otherwise cut hard at max_len.
        candidates = [s for s in silences if start + min_len <= s <= end]
        split_point = candidates[-1] if candidates else end
        chunks.append(audio[start:split_point])
        start = split_point
    return chunks
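
# Example: a 60 s recording with natural pauses yields roughly three 15-25 s chunks:
# split_on_silence_with_duration_control(audio, min_len=15000, max_len=25000)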

def transcribe(audio_file):
    # Load audio using pydub.
    audio = AudioSegment.from_file(audio_file)

    # Convert to mono 16 kHz, the input format Whisper expects.
    if audio.channels > 1:
        audio = audio.set_channels(1)
    if audio.frame_rate != 16000:
        audio = audio.set_frame_rate(16000)

    # Split at silences into 15-25 s chunks.
    chunks = split_on_silence_with_duration_control(
        audio, min_len=15000, max_len=25000, silence_thresh=-40
    )

    # Transcribe each chunk and join the results.
    results = []
    for chunk in chunks:
        # Round-trip through a named temp file; re-opening it by name works on the
        # POSIX hosts this app runs on.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmpfile:
            chunk.export(tmpfile.name, format="wav")
            waveform, _ = torchaudio.load(tmpfile.name)
        input_features = processor(
            waveform.squeeze().numpy(),
            sampling_rate=16000,
            return_tensors="pt",
        ).input_features.to(device)
        with torch.no_grad():
            # `language` is a generation argument, not a feature-extraction one,
            # so it is passed to generate() rather than to the processor.
            predicted_ids = model.generate(input_features, language="uz")
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        results.append(transcription)
    return " ".join(results)

demo = gr.Blocks()

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Audio file"),
    outputs="text",
    title="Whisper Large V3 Turbo: Transcribe Audio",
    description="Whisper Large V3 Turbo fine-tuned for the Uzbek language by Dataprizma",
)

with demo:
    gr.TabbedInterface([file_transcribe], ["Audio file"])

demo.launch()