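"""Gradio demo: Haitian Creole speech-to-text with a fine-tuned Whisper model.

Loads the jsbeaudry/whisper-medium-oswald checkpoint from the Hugging Face Hub,
preprocesses uploaded or recorded audio (mono, 16 kHz float), and returns the
transcription through a small Gradio Blocks interface.
"""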
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import librosa
import numpy as np
import torch
import gradio as gr

# Load the fine-tuned Whisper model and its processor
print("Loading model...")
processor = AutoProcessor.from_pretrained("jsbeaudry/whisper-medium-oswald")
model = AutoModelForSpeechSeq2Seq.from_pretrained("jsbeaudry/whisper-medium-oswald")
model.eval()

# Set device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Model loaded successfully.")
# Transcription function
def transcribe(audio):
    if audio is None:
        return "Please upload or record an audio file first."

    # Gradio's numpy audio component provides a (sample_rate, data) tuple
    sr, data = audio

    # Gradio delivers integer PCM (e.g. int16); librosa and the Whisper
    # processor expect float audio in [-1.0, 1.0]
    if np.issubdtype(data.dtype, np.integer):
        data = data.astype(np.float32) / np.iinfo(data.dtype).max

    # If stereo, convert to mono
    if data.ndim == 2:
        data = librosa.to_mono(data.T)

    # Resample to the 16 kHz rate Whisper expects, if needed
    if sr != 16000:
        data = librosa.resample(data, orig_sr=sr, target_sr=16000)
        sr = 16000

    # Extract log-mel input features
    input_features = processor(data, sampling_rate=sr, return_tensors="pt").input_features.to(device)

    # Generate token IDs
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    # Decode token IDs to text
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription
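# Quick local sanity check (a sketch, not part of the app): the UI can be
# bypassed by feeding transcribe() a (sample_rate, array) tuple directly,
# here from a hypothetical local file "sample.wav":
#     data, sr = librosa.load("sample.wav", sr=16000)
#     print(transcribe((sr, data)))
#
# If the fine-tuned checkpoint does not already pin decoding to Haitian
# Creole, it can be steered explicitly (Whisper's code for it is "ht"):
#     forced_ids = processor.get_decoder_prompt_ids(language="ht", task="transcribe")
#     predicted_ids = model.generate(input_features, forced_decoder_ids=forced_ids)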
# Gradio UI
def create_interface():
    with gr.Blocks(title="Whisper Medium - Haitian Creole") as demo:
        gr.Markdown("# 🎙️ Whisper Medium Creole ASR")
        gr.Markdown(
            "Upload or record your voice in Haitian Creole. Then click **Transcribe** to get the text."
        )
        with gr.Row():
            audio_input = gr.Audio(label="🎧 Upload or Record Audio", type="numpy", format="wav")
        transcribe_button = gr.Button("Transcribe")
        output_text = gr.Textbox(label="📝 Transcribed Text", lines=4)
        transcribe_button.click(fn=transcribe, inputs=audio_input, outputs=output_text)
    return demo
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()
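# Deployment note (sketch): launch() serves the app locally by default;
# for quick remote testing Gradio can also open a temporary public link:
#     interface.launch(share=True)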