# Source: Hugging Face Spaces — pupunpu/voice-clone-app (Space status: Sleeping; file size: 5,290 bytes)
import librosa
import numpy as np
import torch
from typing import Tuple
import os
import subprocess
import tempfile
from ..configs.config import AudioConfig

class AudioProcessor:
    """Audio processor: handles audio preprocessing and feature extraction.

    Every signal parameter (sample rate, FFT size, hop/window length, number
    of mel bands and mel frequency range) is read from the config object
    passed to the constructor.
    """

    def __init__(self, config: AudioConfig):
        """Store the configuration and the fixed clip length.

        Args:
            config: Object providing ``sample_rate``, ``n_fft``,
                ``hop_length``, ``win_length``, ``n_mels``, ``mel_fmin`` and
                ``mel_fmax``.
        """
        self.config = config
        # Fixed clip length in samples.
        # NOTE(review): this is one second only when config.sample_rate is
        # 16000 — confirm against the config actually in use.
        self.target_length = 16000

    def load_audio(self, file_path: str) -> np.ndarray:
        """Load an audio file; ``.m4a`` files are first converted with ffmpeg.

        Args:
            file_path: Path of the audio file to load.

        Returns:
            The samples returned by ``librosa.load`` at
            ``config.sample_rate``.

        Raises:
            RuntimeError: If loading or conversion fails. The original
                exception is chained as ``__cause__``.
        """
        try:
            return self._read_audio(file_path)
        except Exception as e:
            # Chain the cause so the underlying traceback is not lost.
            raise RuntimeError(f"Error loading audio file {file_path}: {str(e)}") from e

    def _read_audio(self, file_path: str) -> np.ndarray:
        """Load *file_path* at the configured sample rate without wrapping errors."""
        if file_path.lower().endswith('.m4a'):
            return self._load_m4a(file_path)
        # Every other format is handed straight to librosa.
        audio, _ = librosa.load(file_path, sr=self.config.sample_rate)
        return audio

    def _load_m4a(self, file_path: str) -> np.ndarray:
        """Convert an m4a file to a temporary mono WAV with ffmpeg and load it."""
        # Only the path is needed: the handle is closed on leaving the `with`,
        # and delete=False keeps the file so ffmpeg can overwrite it.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
            temp_wav_path = temp_wav.name

        try:
            command = [
                'ffmpeg',
                '-i', file_path,
                '-acodec', 'pcm_s16le',
                '-ar', str(self.config.sample_rate),
                '-ac', '1',  # downmix to mono
                '-y',  # overwrite the (already existing) temporary file
                temp_wav_path,
            ]
            try:
                subprocess.run(command, check=True, capture_output=True)
            except subprocess.CalledProcessError as err:
                # The captured stderr is the only place ffmpeg explains the
                # failure; surface it instead of just the exit status.
                stderr = err.stderr.decode(errors='replace').strip() if err.stderr else ''
                raise RuntimeError(f"ffmpeg failed to convert {file_path}: {stderr}") from err

            audio, _ = librosa.load(temp_wav_path, sr=self.config.sample_rate)
            return audio
        finally:
            # Always remove the temporary WAV, even when conversion failed.
            if os.path.exists(temp_wav_path):
                os.unlink(temp_wav_path)

    def preprocess_audio(self, audio_path: str, target_time_steps: int = 32) -> Tuple[np.ndarray, np.ndarray]:
        """Load a file and return a fixed-length clip with its mel spectrogram.

        Steps:
        1. Load the audio (resampled to ``config.sample_rate``; ``.m4a`` goes
           through the same ffmpeg conversion as ``load_audio``).
        2. Randomly crop, or zero-pad, the clip to ``self.target_length``
           samples.
        3. Compute the mel power spectrogram and convert it to dB relative to
           its own maximum.
        4. Truncate or pad the time axis to ``target_time_steps`` frames.

        Args:
            audio_path: Path of the audio file to process.
            target_time_steps: Number of spectrogram frames to return.
                Defaults to 32.

        Returns:
            ``(audio, mel_spec)`` where ``audio`` has ``self.target_length``
            samples and ``mel_spec`` has ``target_time_steps`` columns.
        """
        audio = self._read_audio(audio_path)
        audio = self._random_crop_or_pad(audio)

        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=self.config.sample_rate,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length,
            win_length=self.config.win_length,
            n_mels=self.config.n_mels,
            fmin=self.config.mel_fmin,
            fmax=self.config.mel_fmax
        )

        # Convert power to decibels relative to the spectrogram's peak.
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # NOTE(review): short spectrograms are padded with 0, which on this
        # dB scale is the peak level rather than silence — confirm intended.
        mel_spec = self._fit_time_steps(mel_spec, target_time_steps)

        return audio, mel_spec

    def _random_crop_or_pad(self, audio: np.ndarray) -> np.ndarray:
        """Return a random ``target_length`` window of *audio*, or zero-pad it."""
        excess = len(audio) - self.target_length
        if excess > 0:
            # randint's upper bound is exclusive; +1 makes the last valid
            # window (start == excess) selectable as well.
            start = np.random.randint(0, excess + 1)
            return audio[start:start + self.target_length]
        return self.pad_or_trim(audio, self.target_length)

    @staticmethod
    def _fit_time_steps(mel_spec: np.ndarray, target_time_steps: int) -> np.ndarray:
        """Truncate or zero-pad axis 1 of *mel_spec* to *target_time_steps*."""
        frames = mel_spec.shape[1]
        if frames > target_time_steps:
            return mel_spec[:, :target_time_steps]
        pad_width = ((0, 0), (0, target_time_steps - frames))
        return np.pad(mel_spec, pad_width, mode='constant')

    def audio_to_mel_spec(self, audio: np.ndarray) -> np.ndarray:
        """Convert audio samples to a natural-log mel spectrogram.

        Unlike ``preprocess_audio`` this works on the magnitude (not power)
        spectrum, uses the natural logarithm rather than decibels, and does
        not change the number of frames.

        Args:
            audio: Audio samples to transform.

        Returns:
            ``log(max(mel_basis @ |STFT|, 1e-5))``.
        """
        # Short-time Fourier transform.
        stft = librosa.stft(
            audio,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length,
            win_length=self.config.win_length
        )

        # Magnitude spectrum.
        magnitude = np.abs(stft)

        # Mel filter bank.
        mel_basis = librosa.filters.mel(
            sr=self.config.sample_rate,
            n_fft=self.config.n_fft,
            n_mels=self.config.n_mels,
            fmin=self.config.mel_fmin,
            fmax=self.config.mel_fmax
        )

        # Project the magnitude spectrum onto the mel bands.
        mel_spec = np.dot(mel_basis, magnitude)

        # Log scale; the floor of 1e-5 avoids log(0).
        mel_spec = np.log(np.clip(mel_spec, a_min=1e-5, a_max=None))

        return mel_spec

    def normalize_audio(self, audio: np.ndarray) -> np.ndarray:
        """Scale *audio* so that its largest absolute sample is 1.

        Empty or all-zero input is returned as an unchanged copy instead of
        raising or producing NaNs from a division by zero.

        Args:
            audio: Audio samples as a NumPy array.

        Returns:
            A new array with the peak-normalized samples.
        """
        if audio.size == 0:
            return audio.copy()
        peak = np.max(np.abs(audio))
        if peak == 0:
            # Silent clip: nothing to scale, and dividing would yield NaN.
            return audio.copy()
        return audio / peak

    def pad_or_trim(self, audio: np.ndarray, target_length: int) -> np.ndarray:
        """Trim *audio* to, or zero-pad it at the end up to, *target_length*.

        Args:
            audio: One-dimensional array of audio samples.
            target_length: Desired number of samples.

        Returns:
            An array with exactly ``target_length`` samples.
        """
        if len(audio) > target_length:
            return audio[:target_length]
        return np.pad(audio, (0, target_length - len(audio)), mode='constant')