# Source: Hugging Face Space "jsbeaudry/whisper-medium-creole-oswald"
# (Space status when captured: Sleeping; file size: 2,055 bytes).
# Revisions shown in the page's blame gutter: ad65f9d, 0493d0d, 5bf7330,
# 1ee5c4a, bfbc50c.

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Fetch the fine-tuned Whisper checkpoint once at import time; the processor
# and the model are both loaded from the same Hub repository.
_CHECKPOINT = "jsbeaudry/whisper-medium-oswald"

print("Loading model...")
processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = AutoModelForSpeechSeq2Seq.from_pretrained(_CHECKPOINT)
model.eval()  # inference only: switch off training-mode layers

# Run on the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
print("Model loaded successfully.")

# Transcription function
def transcribe(audio):
    """Transcribe one Gradio audio clip with the module-level Whisper model.

    Args:
        audio: ``None`` when nothing was uploaded or recorded, otherwise the
            ``(sample_rate, samples)`` tuple delivered by
            ``gr.Audio(type="numpy")``. ``samples`` may be 1-D (mono) or 2-D
            laid out as ``(samples, channels)``, and may hold integer PCM
            (Gradio typically sends int16) or floating-point values.

    Returns:
        The decoded transcription string, or a prompt asking for audio when
        no audio (or an empty clip) was supplied.
    """
    no_audio_message = "Please upload or record an audio file first."
    if audio is None:
        return no_audio_message

    # Gradio provides a tuple (sr, data)
    sr, data = audio
    data = np.asarray(data)
    if data.size == 0:
        # An empty recording has nothing to resample or feed to the model.
        return no_audio_message

    # librosa rejects non-floating-point audio, and the Whisper feature
    # extractor expects amplitudes around [-1, 1]; scale integer PCM by the
    # full-scale value of its dtype before any further processing.
    if np.issubdtype(data.dtype, np.integer):
        limits = np.iinfo(data.dtype)
        full_scale = float(max(abs(limits.min), limits.max))
        data = data.astype(np.float32) / full_scale
    else:
        data = data.astype(np.float32)

    # If stereo, convert to mono (librosa wants channels on the first axis)
    if data.ndim == 2:
        data = librosa.to_mono(data.T)

    # Resample to 16kHz if needed
    if sr != 16000:
        data = librosa.resample(data, orig_sr=sr, target_sr=16000)
        sr = 16000

    # Turn the waveform into model input features on the model's device
    input_features = processor(data, sampling_rate=sr, return_tensors="pt").input_features.to(device)

    # Predict without tracking gradients
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    # Decode the single item of the batch back to text
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription

# Gradio UI
def create_interface():
    """Assemble the Gradio Blocks page for the transcription demo.

    The page shows a title and a line of instructions, followed by one row
    holding the audio input, the trigger button and the output textbox.
    Clicking the button calls ``transcribe`` on the audio input and writes
    the returned text into the textbox.

    Returns:
        The constructed ``gr.Blocks`` object; launching it is left to the
        caller.
    """
    instructions = "Upload or record your voice in Haitian Creole. Then click **Transcribe** to get the text."

    app = gr.Blocks(title="Whisper Medium - Haitian Creole")
    with app:
        gr.Markdown("# 🎙️ Whisper Medium Creole ASR")
        gr.Markdown(instructions)

        # Creation order inside the row fixes the left-to-right layout.
        with gr.Row():
            recorder = gr.Audio(
                label="🎧 Upload or Record Audio",
                type="numpy",
                format="wav",
            )
            run_button = gr.Button("🔍 Transcribe")
            result_box = gr.Textbox(label="📝 Transcribed Text", lines=4)

        run_button.click(fn=transcribe, inputs=recorder, outputs=result_box)

    return app

# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    create_interface().launch()