Spaces:
Sleeping
Sleeping
import librosa | |
import numpy as np | |
import torch | |
from typing import Tuple | |
import os | |
import subprocess | |
import tempfile | |
from ..configs.config import AudioConfig | |
class AudioProcessor:
    """Audio processor: handles audio preprocessing and feature extraction.

    All signal parameters (sample rate, FFT size, hop/window length, mel band
    count and frequency range) are read from the ``config`` object passed to
    the constructor.
    """

    def __init__(self, config: "AudioConfig"):
        """Store the configuration and the fixed clip length.

        Args:
            config: Object exposing ``sample_rate``, ``n_fft``, ``hop_length``,
                ``win_length``, ``n_mels``, ``mel_fmin`` and ``mel_fmax``.
        """
        self.config = config
        # Fixed clip length in samples. This corresponds to one second of
        # audio only when config.sample_rate is 16000.
        self.target_length = 16000

    def _read_waveform(self, file_path: str) -> np.ndarray:
        """Load ``file_path`` at the configured sample rate without wrapping errors.

        ``.m4a`` files are first converted to a temporary mono 16-bit PCM wav
        with ffmpeg; every other format is handed straight to ``librosa.load``.
        Exceptions from ffmpeg / librosa propagate unchanged.
        """
        if not file_path.lower().endswith('.m4a'):
            # Load other audio formats directly.
            audio, _ = librosa.load(file_path, sr=self.config.sample_rate)
            return audio

        # Reserve a temporary wav path; the handle is closed immediately so
        # ffmpeg can write to the file (delete=False keeps it on disk).
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
            temp_wav_path = temp_wav.name
        try:
            # Convert the m4a file to wav with ffmpeg.
            command = [
                'ffmpeg',
                '-i', file_path,
                '-acodec', 'pcm_s16le',
                '-ar', str(self.config.sample_rate),
                '-ac', '1',  # downmix to mono
                '-y',  # overwrite the (empty) temporary file
                temp_wav_path,
            ]
            subprocess.run(command, check=True, capture_output=True)
            # Load the converted wav file.
            audio, _ = librosa.load(temp_wav_path, sr=self.config.sample_rate)
        finally:
            # Always remove the temporary file, even if conversion failed.
            if os.path.exists(temp_wav_path):
                os.unlink(temp_wav_path)
        return audio

    def load_audio(self, file_path: str) -> np.ndarray:
        """Load an audio file; supports multiple formats including m4a.

        Args:
            file_path: Path of the audio file to read.

        Returns:
            The waveform resampled to ``config.sample_rate``.

        Raises:
            RuntimeError: If loading or the ffmpeg conversion fails; the
                original exception is attached as the cause.
        """
        try:
            return self._read_waveform(file_path)
        except Exception as e:
            raise RuntimeError(f"Error loading audio file {file_path}: {str(e)}") from e

    def preprocess_audio(self, audio_path: str) -> Tuple[np.ndarray, np.ndarray]:
        """Preprocess an audio file into a fixed-length clip and mel spectrogram.

        Steps:
            1. Load the audio (m4a goes through the same ffmpeg path as
               ``load_audio``) at the configured sample rate.
            2. Randomly crop, or zero-pad at the end, to ``self.target_length``
               samples.
            3. Compute a mel power spectrogram and convert it to dB relative
               to its maximum.
            4. Trim or pad the time axis to 32 frames.

        Args:
            audio_path: Path of the audio file to process.

        Returns:
            ``(audio, mel_spec)`` where ``audio`` has ``self.target_length``
            samples and ``mel_spec`` has shape ``(n_mels, 32)``.

        Note:
            The crop position is drawn from NumPy's global random state, so
            results for clips longer than ``target_length`` are not
            deterministic unless that state is seeded.
        """
        # Load the audio.
        audio = self._read_waveform(audio_path)

        # Bring the waveform to the fixed length.
        if len(audio) > self.target_length:
            # Random crop. randint's upper bound is exclusive, so add 1 to
            # make the final window (start == len - target_length) reachable.
            start = np.random.randint(0, len(audio) - self.target_length + 1)
            audio = audio[start:start + self.target_length]
        else:
            # Zero-pad at the end up to the target length.
            padding = self.target_length - len(audio)
            audio = np.pad(audio, (0, padding), mode='constant')

        # Compute the mel spectrogram.
        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=self.config.sample_rate,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length,
            win_length=self.config.win_length,
            n_mels=self.config.n_mels,
            fmin=self.config.mel_fmin,
            fmax=self.config.mel_fmax,
        )
        # Convert to decibels, referenced to the spectrogram's maximum.
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Ensure the layout is (n_mels, time) with a fixed number of frames.
        target_time_steps = 32  # fixed number of time steps
        if mel_spec.shape[1] > target_time_steps:
            mel_spec = mel_spec[:, :target_time_steps]
        else:
            # NOTE(review): the pad value is 0, which on this dB scale is the
            # peak level rather than silence. Kept as-is because downstream
            # consumers may depend on it -- confirm whether that is intended.
            pad_width = ((0, 0), (0, target_time_steps - mel_spec.shape[1]))
            mel_spec = np.pad(mel_spec, pad_width, mode='constant')

        return audio, mel_spec

    def audio_to_mel_spec(self, audio: np.ndarray) -> np.ndarray:
        """Convert a waveform to a log-mel spectrogram.

        Unlike ``preprocess_audio`` this applies the mel filterbank to the
        STFT magnitude and takes a natural log (floored at 1e-5); it does not
        fix the number of frames.

        Args:
            audio: Waveform samples.

        Returns:
            Array of shape ``(n_mels, frames)``.
        """
        # Short-time Fourier transform.
        stft = librosa.stft(
            audio,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length,
            win_length=self.config.win_length,
        )
        # Magnitude spectrum.
        magnitude = np.abs(stft)
        # Build the mel filterbank.
        mel_basis = librosa.filters.mel(
            sr=self.config.sample_rate,
            n_fft=self.config.n_fft,
            n_mels=self.config.n_mels,
            fmin=self.config.mel_fmin,
            fmax=self.config.mel_fmax,
        )
        # Apply the mel filters.
        mel_spec = np.dot(mel_basis, magnitude)
        # Log scale; clip first so that log(0) never occurs.
        mel_spec = np.log(np.clip(mel_spec, a_min=1e-5, a_max=None))
        return mel_spec

    def normalize_audio(self, audio: np.ndarray) -> np.ndarray:
        """Peak-normalize audio so that its largest absolute sample is 1.

        Silent (all-zero) and empty inputs are returned with unchanged values
        instead of producing NaNs or raising.

        Args:
            audio: Waveform samples.

        Returns:
            A new array holding ``audio`` divided by its peak absolute value.
        """
        samples = np.asarray(audio)
        # np.max raises on an empty array, so treat "no samples" as peak 0.
        peak = np.max(np.abs(samples)) if samples.size else 0.0
        # Dividing by a zero peak would turn silence into NaNs.
        scale = 1.0 if peak == 0 else peak
        return samples / scale

    def pad_or_trim(self, audio: np.ndarray, target_length: int) -> np.ndarray:
        """Pad with trailing zeros or truncate audio to ``target_length`` samples.

        Args:
            audio: One-dimensional waveform.
            target_length: Desired number of samples.

        Returns:
            The first ``target_length`` samples when ``audio`` is longer,
            otherwise ``audio`` followed by zeros up to ``target_length``.
        """
        if len(audio) > target_length:
            return audio[:target_length]
        return np.pad(audio, (0, target_length - len(audio)), mode='constant')