File size: 5,290 Bytes
9580089
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import librosa
import numpy as np
import torch
from typing import Tuple
import os
import subprocess
import tempfile
from ..configs.config import AudioConfig

class AudioProcessor:
    """Audio processor: loads audio files and extracts mel-spectrogram features.

    Attributes:
        config: Audio configuration object. This class reads the attributes
            ``sample_rate``, ``n_fft``, ``hop_length``, ``win_length``,
            ``n_mels``, ``mel_fmin`` and ``mel_fmax`` from it.
        target_length: Fixed waveform length, in samples, produced by
            :meth:`preprocess_audio`.
        target_time_steps: Fixed number of mel-spectrogram frames produced by
            :meth:`preprocess_audio`.
    """

    def __init__(self, config: "AudioConfig", target_length: int = 16000,
                 target_time_steps: int = 32):
        """Store the configuration and the fixed output sizes.

        Args:
            config: Audio configuration (see the class docstring for the
                attributes that are read).
            target_length: Waveform length in samples. The default of 16000
                corresponds to one second only at a 16 kHz sample rate.
            target_time_steps: Number of mel frames kept by
                :meth:`preprocess_audio`.
        """
        self.config = config
        self.target_length = target_length
        self.target_time_steps = target_time_steps

    def load_audio(self, file_path: str) -> np.ndarray:
        """Load an audio file, resampled to ``config.sample_rate``.

        Files whose name ends in ``.m4a`` (case-insensitive) are first
        converted to a temporary mono 16-bit PCM WAV with ``ffmpeg``; every
        other file is handed directly to ``librosa.load``.

        Args:
            file_path: Path of the audio file to load.

        Returns:
            The waveform returned by ``librosa.load``.

        Raises:
            RuntimeError: If conversion or loading fails for any reason. The
                original exception is attached as ``__cause__``.
        """
        try:
            if file_path.lower().endswith('.m4a'):
                return self._load_m4a(file_path)
            audio, _ = librosa.load(file_path, sr=self.config.sample_rate)
            return audio
        except Exception as e:
            # Chain the cause so the underlying traceback (missing file,
            # ffmpeg failure, decoder error, ...) is not lost.
            raise RuntimeError(f"Error loading audio file {file_path}: {str(e)}") from e

    def _load_m4a(self, file_path: str) -> np.ndarray:
        """Convert an m4a file to a temporary WAV with ffmpeg and load it."""
        # delete=False: only the path is needed here; ffmpeg writes the file
        # itself and the ``finally`` clause below removes it.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
            temp_wav_path = temp_wav.name

        try:
            command = [
                'ffmpeg',
                '-i', file_path,
                '-acodec', 'pcm_s16le',
                '-ar', str(self.config.sample_rate),
                '-ac', '1',  # down-mix to mono
                '-y',  # overwrite the (already created) temp file
                temp_wav_path
            ]
            subprocess.run(command, check=True, capture_output=True)

            audio, _ = librosa.load(temp_wav_path, sr=self.config.sample_rate)
            return audio
        finally:
            # Always clean up the temporary file, even if conversion failed.
            if os.path.exists(temp_wav_path):
                os.unlink(temp_wav_path)

    def preprocess_audio(self, audio_path: str) -> Tuple[np.ndarray, np.ndarray]:
        """Load a file and turn it into a fixed-size waveform and mel spectrogram.

        Steps:
            1. Load the audio at ``config.sample_rate`` via :meth:`load_audio`
               (so ``.m4a`` files are supported here as well).
            2. Randomly crop, or zero-pad at the end, to ``target_length``
               samples.
            3. Compute a mel spectrogram and convert it to decibels relative
               to its maximum.
            4. Truncate or zero-pad the time axis to ``target_time_steps``.

        Args:
            audio_path: Path of the audio file to process.

        Returns:
            A tuple ``(audio, mel_spec)``: the fixed-length waveform and the
            dB-scaled mel spectrogram with ``target_time_steps`` columns.

        Raises:
            RuntimeError: If the file cannot be loaded.
        """
        audio = self.load_audio(audio_path)

        if len(audio) > self.target_length:
            # randint's upper bound is exclusive; the +1 makes the last valid
            # start offset (len - target_length) reachable.
            start = np.random.randint(0, len(audio) - self.target_length + 1)
            audio = audio[start:start + self.target_length]
        else:
            audio = self.pad_or_trim(audio, self.target_length)

        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=self.config.sample_rate,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length,
            win_length=self.config.win_length,
            n_mels=self.config.n_mels,
            fmin=self.config.mel_fmin,
            fmax=self.config.mel_fmax
        )

        # Convert to decibels, referenced to the spectrogram's own maximum.
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Force the time axis (axis 1) to a fixed number of frames.
        n_frames = mel_spec.shape[1]
        if n_frames > self.target_time_steps:
            mel_spec = mel_spec[:, :self.target_time_steps]
        else:
            # NOTE(review): padding uses the constant 0, which on a
            # max-referenced dB scale looks like the peak level rather than
            # silence -- kept as-is for compatibility; confirm this is intended.
            pad_width = ((0, 0), (0, self.target_time_steps - n_frames))
            mel_spec = np.pad(mel_spec, pad_width, mode='constant')

        return audio, mel_spec

    def audio_to_mel_spec(self, audio: np.ndarray) -> np.ndarray:
        """Convert a waveform to a natural-log mel spectrogram.

        Unlike :meth:`preprocess_audio`, this works on the STFT magnitude and
        applies ``np.log`` (values clipped below at 1e-5) instead of a dB
        conversion, and does not change the number of frames.

        Args:
            audio: Input waveform.

        Returns:
            ``log(clip(mel_basis @ |STFT(audio)|, 1e-5))``.
        """
        # Short-time Fourier transform.
        stft = librosa.stft(
            audio,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length,
            win_length=self.config.win_length
        )

        # Magnitude spectrum.
        magnitude = np.abs(stft)

        # Mel filter bank.
        mel_basis = librosa.filters.mel(
            sr=self.config.sample_rate,
            n_fft=self.config.n_fft,
            n_mels=self.config.n_mels,
            fmin=self.config.mel_fmin,
            fmax=self.config.mel_fmax
        )

        # Project onto the mel bands.
        mel_spec = np.dot(mel_basis, magnitude)

        # Log scale; the clip avoids log(0).
        mel_spec = np.log(np.clip(mel_spec, a_min=1e-5, a_max=None))

        return mel_spec

    def normalize_audio(self, audio: np.ndarray) -> np.ndarray:
        """Peak-normalize audio so that its largest absolute value is 1.

        Args:
            audio: Input waveform.

        Returns:
            ``audio / max(|audio|)``. Empty or all-zero input has no peak to
            scale by and is returned as an unchanged copy instead of raising
            or producing NaN values.
        """
        audio = np.asarray(audio)
        if audio.size == 0:
            return audio.copy()
        peak = np.max(np.abs(audio))
        if peak == 0:
            return audio.copy()
        return audio / peak

    def pad_or_trim(self, audio: np.ndarray, target_length: int) -> np.ndarray:
        """Trim audio from the end, or zero-pad it at the end, to ``target_length``.

        Args:
            audio: Input waveform.
            target_length: Desired length in samples.

        Returns:
            The first ``target_length`` samples when the input is longer;
            otherwise the input followed by zeros up to ``target_length``.
        """
        if len(audio) > target_length:
            return audio[:target_length]
        return np.pad(audio, (0, target_length - len(audio)), mode='constant')