"""
Speech Recognition Module using Whisper Large-v3
Handles audio preprocessing and transcription
"""
import logging
logger = logging.getLogger(__name__)
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
from pydub import AudioSegment
import soundfile as sf # Add this import


def transcribe_audio(audio_path):
    """
    Convert an audio file to text using the Whisper ASR model.

    Args:
        audio_path: Path to the input audio file.

    Returns:
        Transcribed English text.
    """
logger.info(f"Starting transcription for: {audio_path}")
try:
# Audio conversion
logger.info("Converting audio format")
audio = AudioSegment.from_file(audio_path)
processed_audio = audio.set_frame_rate(16000).set_channels(1)
wav_path = audio_path.replace(".mp3", ".wav")
processed_audio.export(wav_path, format="wav")
logger.info(f"Audio converted to: {wav_path}")
        # Model initialization
        logger.info("Loading Whisper model")
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {device}")
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            "openai/whisper-large-v3",
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            use_safetensors=True
        ).to(device)
        processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")
        logger.info("Model loaded successfully")
        # Processing
        logger.info("Processing audio input")
        logger.debug("Loading audio data")
        audio_data, sample_rate = sf.read(wav_path)
        audio_data = audio_data.astype(np.float32)
        inputs = processor(
            audio_data,  # pass the raw waveform array, not the file path
            sampling_rate=16000,
            return_tensors="pt",
            # chunk_length_s / stride_length_s belong to the ASR pipeline
            # API, not the processor; they were silently ignored here, so
            # they are dropped. The feature extractor pads or truncates
            # the input to a single 30-second window.
            truncation=True
        ).to(device)
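
        # `inputs` now holds `input_features`, the log-mel spectrogram
        # tensor the Whisper encoder consumes. Anything past the 30-second
        # window is cut off; long recordings need chunked inference (see
        # the note at the bottom of this file).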
        # Transcription
        logger.info("Generating transcription")
        with torch.no_grad():
            outputs = model.generate(**inputs, language="en", task="transcribe")
        result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        logger.info("Transcription completed successfully")
        return result
    except Exception as e:
        logger.error(f"Transcription failed: {str(e)}", exc_info=True)
        raise
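

# Example usage: a minimal sketch for running this module directly.
# "sample.mp3" is a hypothetical placeholder; substitute any audio file
# that pydub/ffmpeg can decode. For recordings longer than 30 seconds,
# the transformers pipeline("automatic-speech-recognition", ...) API with
# chunk_length_s=30 is the usual route, since the generate() call above
# only sees a single 30-second window.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    text = transcribe_audio("sample.mp3")  # hypothetical input path
    print(text)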