Spaces:

tclin
/

Whisper-ATC-Transcriber

Running

App Files Files Community

tclin committed on Apr 16

Commit

722c2f4

verified ·

1 Parent(s): a0004b6

Create app.py

Browse files

Files changed (1) hide show

app.py +71 -0

app.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import gradio as gr
+import torch
+import torchaudio
+import numpy as np
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+# Model loading function with caching
+def load_model():
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+    model = WhisperForConditionalGeneration.from_pretrained("tclin/whisper-large-v3-turbo-atcosim-finetune")
+    model = model.to(device=device, dtype=torch_dtype)
+    processor = WhisperProcessor.from_pretrained("tclin/whisper-large-v3-turbo-atcosim-finetune")
+    return model, processor, device, torch_dtype
+# Load the model and processor a single time at import; transcribe_audio reads these module-level names on every call
+model, processor, device, torch_dtype = load_model()
+# Define the transcription function
+def transcribe_audio(audio_file):
+    # Check if audio file exists
+    if audio_file is None:
+        return "Please upload an audio file"
+    try:
+        # Load and preprocess audio
+        waveform, sample_rate = torchaudio.load(audio_file)
+        # Resample to 16kHz (required for Whisper models)
+        if sample_rate != 16000:
+            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+            waveform = resampler(waveform)
+        # Convert stereo to mono if needed
+        if waveform.shape[0] > 1:
+            waveform = waveform.mean(dim=0, keepdim=True)
+        # Convert to numpy array
+        waveform_np = waveform.squeeze().cpu().numpy()
+        # Process with model
+        input_features = processor(waveform_np, sampling_rate=16000, return_tensors="pt").input_features
+        input_features = input_features.to(device=device, dtype=torch_dtype)
+        generated_ids = model.generate(input_features, max_new_tokens=128)
+        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        return transcription
+    except Exception as e:
+        return f"Error processing audio: {str(e)}"
+# Create Gradio interface
+demo = gr.Interface(
+    fn=transcribe_audio,
+    inputs=gr.Audio(type="filepath"),
+    outputs="text",
+    title="ATC Speech Transcription",
+    description="Upload an air traffic control audio file and get an accurate transcription using a Whisper model fine-tuned on the ATCOSIM dataset.",
+    examples=[
+        ["example1.wav"],
+        ["example2.wav"]
+    ],
+    article="This model is fine-tuned on the ATCOSIM dataset to accurately transcribe air traffic control communications with a Word Error Rate (WER) of 3.73%."
+)
+# Launch the interface only when this file is executed as a script, not when it is imported
+if __name__ == "__main__":
+    demo.launch()