"""
Speech Recognition Module using Whisper Large-v3
Handles audio preprocessing and transcription
"""

import logging
import os

import numpy as np
import soundfile as sf
import torch
from pydub import AudioSegment
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

logger = logging.getLogger(__name__)


def transcribe_audio(audio_path):
    """
    Convert audio file to text using Whisper ASR model
    Args:
        audio_path: Path to input audio file
    Returns:
        Transcribed English text
    """
    logger.info(f"Starting transcription for: {audio_path}")
    
    try:
        # Audio conversion
        logger.info("Converting audio format")
        audio = AudioSegment.from_file(audio_path)
        processed_audio = audio.set_frame_rate(16000).set_channels(1)
        # Derive the output path from the original extension so non-MP3 inputs
        # are not silently overwritten in place
        wav_path = os.path.splitext(audio_path)[0] + "_16k.wav"
        processed_audio.export(wav_path, format="wav")
        logger.info(f"Audio converted to: {wav_path}")

        # Model initialization
        logger.info("Loading Whisper model")
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {device}")

        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            "openai/whisper-large-v3",
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            use_safetensors=True
        ).to(device)
        
        processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")
        logger.info("Model loaded successfully")

        # Processing
        logger.info("Processing audio input")
        logger.debug("Loading audio data")
        audio_data, sample_rate = sf.read(wav_path)
        audio_data = audio_data.astype(np.float32)
        
        # The Whisper feature extractor pads or truncates input to a single
        # 30-second window; chunk_length_s / stride_length_s are arguments to
        # the transformers ASR pipeline, not to the processor (see the
        # long-form sketch at the end of this file)
        inputs = processor(
            audio_data,
            sampling_rate=16000,
            return_tensors="pt"
        ).to(device)

        # Transcription
        logger.info("Generating transcription")
        with torch.no_grad():
            # Add max_length parameter to allow for longer outputs
            outputs = model.generate(
                **inputs, 
                language="en", 
                task="transcribe",
                max_length=448,  # Explicitly set max output length
                no_repeat_ngram_size=3  # Prevent repetition in output
            )
        
        result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        logger.info(f"transcription: %s" % result)
        logger.info(f"Transcription completed successfully")
        return result

    except Exception as e:
        logger.error(f"Transcription failed: {str(e)}", exc_info=True)
        raise
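

def transcribe_long_audio(audio_path):
    """
    Illustrative sketch, not part of the original module: chunked long-form
    transcription with the transformers ASR pipeline, which is where
    chunk_length_s / stride_length_s actually apply. Assumes the same
    openai/whisper-large-v3 checkpoint used by transcribe_audio above.
    """
    from transformers import pipeline

    asr = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-large-v3",
        chunk_length_s=30,
        stride_length_s=5,
        device=0 if torch.cuda.is_available() else -1,
    )
    # The pipeline reads the file itself (ffmpeg required) and stitches the
    # per-chunk outputs into a single transcript
    output = asr(audio_path, generate_kwargs={"language": "en", "task": "transcribe"})
    return output["text"]


if __name__ == "__main__":
    # Minimal usage example; "sample.mp3" is a placeholder path
    print(transcribe_audio("sample.mp3"))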