# voice-clone-app/src/data/audio_processor.py
# hengjie yang
# Initial commit: Voice Clone App with Gradio interface
# 9580089
import librosa
import numpy as np
import torch
from typing import Tuple
import os
import subprocess
import tempfile
from ..configs.config import AudioConfig
class AudioProcessor:
    """Audio processor: handles audio preprocessing and feature extraction.

    Attributes:
        config: Audio settings object. This class reads ``sample_rate``,
            ``n_fft``, ``hop_length``, ``win_length``, ``n_mels``,
            ``mel_fmin`` and ``mel_fmax`` from it.
        target_length: Number of samples every clip is cropped or padded to
            by ``preprocess_audio``.
    """

    # Number of mel frames (time steps) returned by ``preprocess_audio``.
    TARGET_TIME_STEPS = 32

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, config: "AudioConfig"):
        self.config = config
        # Fixed clip length in samples; this is one second only when the
        # configured sample rate is 16 kHz.
        self.target_length = 16000

    def load_audio(self, file_path: str) -> np.ndarray:
        """Load an audio file, including ``.m4a`` files.

        Files whose name ends in ``.m4a`` (case-insensitive) are first
        converted to a temporary mono 16-bit PCM wav with ``ffmpeg``; every
        other file is passed straight to ``librosa.load``.

        Args:
            file_path: Path of the audio file to read.

        Returns:
            The samples loaded at ``config.sample_rate``.

        Raises:
            RuntimeError: If conversion or loading fails. The original
                exception is chained as ``__cause__``.
        """
        try:
            return self._read_audio(file_path)
        except Exception as e:
            raise RuntimeError(
                f"Error loading audio file {file_path}: {str(e)}"
            ) from e

    def _read_audio(self, file_path: str) -> np.ndarray:
        """Read ``file_path`` at the configured sample rate without wrapping errors."""
        if not os.fspath(file_path).lower().endswith('.m4a'):
            # Load every non-m4a format directly.
            audio, _ = librosa.load(file_path, sr=self.config.sample_rate)
            return audio

        # Reserve a temporary wav path; the handle is closed right away so
        # that ffmpeg can write to the path itself.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
            temp_wav_path = temp_wav.name
        try:
            # Convert the m4a file to wav with ffmpeg.
            command = [
                'ffmpeg',
                '-i', file_path,
                '-acodec', 'pcm_s16le',
                '-ar', str(self.config.sample_rate),
                '-ac', '1',  # downmix to mono
                '-y',  # overwrite the (empty) temporary file
                temp_wav_path,
            ]
            subprocess.run(command, check=True, capture_output=True)
            # Load the converted wav file.
            audio, _ = librosa.load(temp_wav_path, sr=self.config.sample_rate)
            return audio
        finally:
            # Always remove the temporary file, even when conversion failed.
            try:
                os.unlink(temp_wav_path)
            except FileNotFoundError:
                pass

    def preprocess_audio(self, audio_path: str) -> Tuple[np.ndarray, np.ndarray]:
        """Turn an audio file into a fixed-length clip and its mel spectrogram.

        Steps:
            1. Load the audio at ``config.sample_rate`` (m4a files go through
               the same ffmpeg conversion as ``load_audio``).
            2. Randomly crop, or zero-pad at the end, to ``target_length``
               samples.
            3. Compute the mel spectrogram and convert it to decibels
               relative to its maximum.
            4. Truncate or pad the time axis to ``TARGET_TIME_STEPS`` frames.

        Args:
            audio_path: Path of the audio file to process.

        Returns:
            ``(audio, mel_spec)``: the fixed-length samples and a spectrogram
            of shape ``(n_mels, TARGET_TIME_STEPS)``.
        """
        audio = self._random_crop_or_pad(self._read_audio(audio_path))

        # Compute the mel spectrogram.
        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=self.config.sample_rate,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length,
            win_length=self.config.win_length,
            n_mels=self.config.n_mels,
            fmin=self.config.mel_fmin,
            fmax=self.config.mel_fmax,
        )
        # Convert to decibels.
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Force the time axis (axis 1) to a fixed number of frames.
        frames = self.TARGET_TIME_STEPS
        if mel_spec.shape[1] > frames:
            mel_spec = mel_spec[:, :frames]
        else:
            # NOTE(review): the padding value is 0.0, which after
            # power_to_db(ref=np.max) appears to be the loudest level rather
            # than silence; confirm this is intended before changing it.
            pad_width = ((0, 0), (0, frames - mel_spec.shape[1]))
            mel_spec = np.pad(mel_spec, pad_width, mode='constant')
        return audio, mel_spec

    def _random_crop_or_pad(self, audio: np.ndarray) -> np.ndarray:
        """Randomly crop, or zero-pad at the end, to ``self.target_length`` samples."""
        excess = len(audio) - self.target_length
        if excess > 0:
            # randint's upper bound is exclusive; the +1 lets the final
            # window (ending on the last sample) be selected too.
            start = np.random.randint(0, excess + 1)
            return audio[start:start + self.target_length]
        return self.pad_or_trim(audio, self.target_length)

    def audio_to_mel_spec(self, audio: np.ndarray) -> np.ndarray:
        """Convert audio samples to a log-mel spectrogram.

        This applies the mel filter bank to the STFT magnitude and takes the
        natural log, whereas ``preprocess_audio`` uses
        ``librosa.feature.melspectrogram`` followed by ``power_to_db``; the
        two outputs are not interchangeable.

        Args:
            audio: Audio samples.

        Returns:
            The natural log of the mel-filtered magnitudes, with values
            below ``1e-5`` clipped up to ``1e-5`` before the log.
        """
        # Short-time Fourier transform.
        stft = librosa.stft(
            audio,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length,
            win_length=self.config.win_length,
        )
        # Magnitude spectrum.
        magnitude = np.abs(stft)
        # Build the mel filter bank.
        mel_basis = librosa.filters.mel(
            sr=self.config.sample_rate,
            n_fft=self.config.n_fft,
            n_mels=self.config.n_mels,
            fmin=self.config.mel_fmin,
            fmax=self.config.mel_fmax,
        )
        # Apply the mel filters.
        mel_spec = np.dot(mel_basis, magnitude)
        # Log scale; the clip keeps log() away from zero.
        return np.log(np.clip(mel_spec, a_min=1e-5, a_max=None))

    def normalize_audio(self, audio: np.ndarray) -> np.ndarray:
        """Scale audio so that its largest absolute sample becomes 1.

        Empty or all-zero input is returned as an unchanged copy instead of
        raising or producing NaN from a division by zero.

        Args:
            audio: Audio samples.

        Returns:
            The peak-normalized samples as a new array.
        """
        samples = np.asarray(audio)
        if samples.size == 0:
            return samples.copy()
        peak = np.max(np.abs(samples))
        if peak == 0:
            return samples.copy()
        return samples / peak

    def pad_or_trim(self, audio: np.ndarray, target_length: int) -> np.ndarray:
        """Pad with trailing zeros, or cut from the end, to ``target_length``.

        Args:
            audio: One-dimensional audio samples.
            target_length: Desired number of samples.

        Returns:
            An array of exactly ``target_length`` samples.
        """
        if len(audio) > target_length:
            return audio[:target_length]
        return np.pad(audio, (0, target_length - len(audio)), mode='constant')