import os
import subprocess
import tempfile
from typing import Tuple

import librosa
import numpy as np
import torch

from ..configs.config import AudioConfig


class AudioProcessor:
    """Audio processor: handles audio preprocessing and feature extraction."""

    def __init__(self, config: AudioConfig):
        """Store the configuration and the fixed clip length.

        Args:
            config: Audio settings. This class reads ``sample_rate``,
                ``n_fft``, ``hop_length``, ``win_length``, ``n_mels``,
                ``mel_fmin`` and ``mel_fmax`` from it.
        """
        self.config = config
        # Fixed clip length in samples (1 second at a 16 kHz sample rate).
        self.target_length = 16000

    def _read_waveform(self, file_path: str) -> np.ndarray:
        """Decode ``file_path`` at the configured sample rate.

        ``.m4a`` files are first converted to a temporary mono 16-bit PCM wav
        with ffmpeg; every other file is handed straight to ``librosa.load``.
        Exceptions are propagated unchanged so each caller decides how to
        report them.
        """
        sample_rate = self.config.sample_rate

        if not file_path.lower().endswith('.m4a'):
            audio, _ = librosa.load(file_path, sr=sample_rate)
            return audio

        # Reserve a temp path only; the handle is closed right away so that
        # ffmpeg can write to the file.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
            temp_wav_path = temp_wav.name

        try:
            command = [
                'ffmpeg',
                '-i', file_path,
                '-acodec', 'pcm_s16le',
                '-ar', str(sample_rate),
                '-ac', '1',  # downmix to mono
                '-y',  # overwrite the (empty) temp file
                temp_wav_path,
            ]
            subprocess.run(command, check=True, capture_output=True)
            audio, _ = librosa.load(temp_wav_path, sr=sample_rate)
            return audio
        finally:
            # Always remove the temp file, whether or not conversion worked.
            if os.path.exists(temp_wav_path):
                os.unlink(temp_wav_path)

    def load_audio(self, file_path: str) -> np.ndarray:
        """Load an audio file; supports multiple formats including m4a.

        Args:
            file_path: Path of the audio file to load.

        Returns:
            The waveform at ``config.sample_rate``.

        Raises:
            RuntimeError: If conversion or decoding fails for any reason; the
                original exception is chained as ``__cause__``.
        """
        try:
            return self._read_waveform(file_path)
        except Exception as e:
            raise RuntimeError(f"Error loading audio file {file_path}: {e}") from e

    def preprocess_audio(self, audio_path: str) -> Tuple[np.ndarray, np.ndarray]:
        """Load a file and turn it into a fixed-size waveform and mel spectrogram.

        Steps:
            1. Load the audio at the configured sample rate (m4a goes through
               the same ffmpeg conversion as ``load_audio``).
            2. Randomly crop, or zero-pad, the waveform to ``target_length``.
            3. Compute the mel power spectrogram and convert it to decibels.
            4. Crop or pad the time axis to 32 frames.

        Args:
            audio_path: Path of the audio file to process.

        Returns:
            ``(audio, mel_spec)``: the waveform with ``target_length`` samples
            and the dB mel spectrogram with 32 time frames.
        """
        audio = self._read_waveform(audio_path)

        # Bring the waveform to the fixed length.
        if len(audio) > self.target_length:
            # randint's upper bound is exclusive, so add 1 to make the last
            # valid start offset reachable.
            start = np.random.randint(0, len(audio) - self.target_length + 1)
            audio = audio[start:start + self.target_length]
        else:
            padding = self.target_length - len(audio)
            audio = np.pad(audio, (0, padding), mode='constant')

        # Mel power spectrogram.
        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=self.config.sample_rate,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length,
            win_length=self.config.win_length,
            n_mels=self.config.n_mels,
            fmin=self.config.mel_fmin,
            fmax=self.config.mel_fmax,
        )

        # Convert to decibels relative to the spectrogram's own maximum.
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Force the time axis (axis 1) to a fixed number of frames.
        target_time_steps = 32
        if mel_spec.shape[1] > target_time_steps:
            mel_spec = mel_spec[:, :target_time_steps]
        else:
            # NOTE(review): the pad value 0 equals the peak level on this dB
            # scale (ref=np.max); kept as-is for backward compatibility.
            pad_width = ((0, 0), (0, target_time_steps - mel_spec.shape[1]))
            mel_spec = np.pad(mel_spec, pad_width, mode='constant')

        return audio, mel_spec

    def audio_to_mel_spec(self, audio: np.ndarray) -> np.ndarray:
        """Convert a waveform to a natural-log mel spectrogram.

        Unlike ``preprocess_audio``, this applies the mel filter bank to the
        STFT magnitude (not power) and uses ``np.log`` instead of decibels.

        Args:
            audio: Waveform samples.

        Returns:
            Log-mel spectrogram; values are floored at ``log(1e-5)``.
        """
        # Short-time Fourier transform.
        stft = librosa.stft(
            y=audio,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length,
            win_length=self.config.win_length,
        )

        # Magnitude spectrum.
        magnitude = np.abs(stft)

        # Mel filter bank.
        mel_basis = librosa.filters.mel(
            sr=self.config.sample_rate,
            n_fft=self.config.n_fft,
            n_mels=self.config.n_mels,
            fmin=self.config.mel_fmin,
            fmax=self.config.mel_fmax,
        )

        # Project the magnitude spectrum onto the mel bands.
        mel_spec = np.dot(mel_basis, magnitude)

        # Log scale; clip first so that log(0) never occurs.
        mel_spec = np.log(np.clip(mel_spec, a_min=1e-5, a_max=None))

        return mel_spec

    def normalize_audio(self, audio: np.ndarray) -> np.ndarray:
        """Scale a waveform so that its peak absolute amplitude is 1.

        Args:
            audio: Waveform samples.

        Returns:
            A new array holding the scaled waveform. Empty or all-zero input
            is returned as an unchanged copy instead of raising or producing
            NaN values through division by zero.
        """
        if audio.size == 0:
            return audio.copy()

        peak = np.max(np.abs(audio))
        if peak == 0:
            return audio.copy()

        return audio / peak

    def pad_or_trim(self, audio: np.ndarray, target_length: int) -> np.ndarray:
        """Pad with trailing zeros, or truncate, to ``target_length`` samples.

        Args:
            audio: Waveform samples.
            target_length: Desired number of samples.

        Returns:
            The first ``target_length`` samples if the input is longer,
            otherwise the input followed by zeros.
        """
        if len(audio) > target_length:
            return audio[:target_length]
        return np.pad(audio, (0, target_length - len(audio)), mode='constant')