"""
Speech Recognition Module using Whisper Large-v3
Handles audio preprocessing and transcription
"""
import logging
import os

import numpy as np
import soundfile as sf
import torch
from pydub import AudioSegment
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

logger = logging.getLogger(__name__)

def transcribe_audio(audio_path):
    """
    Convert audio file to text using Whisper ASR model

    Args:
        audio_path: Path to input audio file

    Returns:
        Transcribed English text
    """
    logger.info(f"Starting transcription for: {audio_path}")
    try:
        # Audio conversion: Whisper expects 16 kHz mono input
        logger.info("Converting audio format")
        audio = AudioSegment.from_file(audio_path)
        processed_audio = audio.set_frame_rate(16000).set_channels(1)
        # os.path.splitext handles any input extension, not just .mp3
        wav_path = os.path.splitext(audio_path)[0] + ".wav"
        processed_audio.export(wav_path, format="wav")
        logger.info(f"Audio converted to: {wav_path}")
        # Model initialization
        logger.info("Loading Whisper model")
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {device}")
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            "openai/whisper-large-v3",
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        ).to(device)
        processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")
        logger.info("Model loaded successfully")
        # Processing
        logger.info("Processing audio input")
        logger.debug("Loading audio data")
        audio_data, sample_rate = sf.read(wav_path)
        audio_data = audio_data.astype(np.float32)
        # The Whisper feature extractor pads or truncates its input to the
        # model's fixed 30-second window; chunk_length_s/stride_length_s are
        # pipeline arguments and are ignored in this call, so audio longer
        # than 30 s needs chunked decoding (see the pipeline sketch below).
        inputs = processor(
            audio_data,
            sampling_rate=16000,
            return_tensors="pt",
        ).to(device)
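        # A sketch of chunked long-form decoding via the transformers pipeline
        # API, as an alternative to the manual processor/generate path used
        # here (chunk_length_s and stride_length_s are real pipeline
        # parameters; the 30 s / 5 s values are illustrative, not tuned):
        #
        #     from transformers import pipeline
        #     asr = pipeline(
        #         "automatic-speech-recognition",
        #         model="openai/whisper-large-v3",
        #         chunk_length_s=30,
        #         stride_length_s=5,
        #     )
        #     result = asr(wav_path, generate_kwargs={"language": "en"})["text"]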
        # Transcription
        logger.info("Generating transcription")
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                language="en",
                task="transcribe",
                max_length=448,          # explicit upper bound on output tokens
                no_repeat_ngram_size=3,  # prevent repetition in output
            )
        result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        logger.info(f"Transcription: {result}")
        logger.info("Transcription completed successfully")
        return result
    except Exception as e:
        logger.error(f"Transcription failed: {e}", exc_info=True)
        raise
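

if __name__ == "__main__":
    # Minimal usage sketch (an assumed entry point, not part of the module's
    # documented interface); "sample.mp3" is a placeholder path.
    logging.basicConfig(level=logging.INFO)
    print(transcribe_audio("sample.mp3"))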