Spaces:

pupunpu
/

voice-clone-app

Sleeping

voice-clone-app / src /data /audio_processor.py

hengjie yang

Initial commit: Voice Clone App with Gradio interface

9580089 3 months ago

5.29 kB

	import librosa
	import numpy as np
	import torch
	from typing import Tuple
	import os
	import subprocess
	import tempfile
	from ..configs.config import AudioConfig

	class AudioProcessor:
	    """Audio processor: handles audio preprocessing and feature extraction.

	    Signal parameters (``sample_rate``, ``n_fft``, ``hop_length``,
	    ``win_length``, ``n_mels``, ``mel_fmin``, ``mel_fmax``) are read from the
	    config object passed to the constructor.
	    """

	    def __init__(self, config: "AudioConfig"):
	        """Store the configuration and the fixed clip length.

	        Args:
	            config: Audio settings object providing the attributes listed in
	                the class docstring.
	        """
	        self.config = config
	        # Fixed clip length in samples. This equals one second only when
	        # config.sample_rate is 16000 -- NOTE(review): confirm against config.
	        self.target_length = 16000

	    def _read_audio(self, file_path: str) -> np.ndarray:
	        """Decode ``file_path`` at ``config.sample_rate`` without wrapping errors.

	        ``.m4a`` files are first converted to a temporary 16-bit PCM mono WAV
	        with ffmpeg; every other extension is handed straight to
	        ``librosa.load``. Exceptions propagate unchanged so each public
	        caller can decide how to report them.
	        """
	        if not file_path.lower().endswith('.m4a'):
	            # Load every non-m4a format directly.
	            audio, _ = librosa.load(file_path, sr=self.config.sample_rate)
	            return audio

	        # Reserve a temporary wav path; delete=False keeps the file around
	        # after the handle closes so ffmpeg can write to it.
	        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
	            temp_wav_path = temp_wav.name

	        try:
	            # Convert m4a to wav with ffmpeg.
	            command = [
	                'ffmpeg',
	                '-i', file_path,
	                '-acodec', 'pcm_s16le',
	                '-ar', str(self.config.sample_rate),
	                '-ac', '1',  # downmix to mono
	                '-y',  # overwrite the (empty) temp file
	                temp_wav_path,
	            ]
	            subprocess.run(command, check=True, capture_output=True)

	            # Load the converted wav file.
	            audio, _ = librosa.load(temp_wav_path, sr=self.config.sample_rate)
	            return audio
	        finally:
	            # Always remove the temporary file, even when conversion fails.
	            if os.path.exists(temp_wav_path):
	                os.unlink(temp_wav_path)

	    def load_audio(self, file_path: str) -> np.ndarray:
	        """Load an audio file; supports multiple formats including m4a.

	        Args:
	            file_path: Path to the audio file.

	        Returns:
	            The decoded waveform at ``config.sample_rate``.

	        Raises:
	            RuntimeError: If decoding or the ffmpeg conversion fails. The
	                original exception is chained as ``__cause__`` (for ffmpeg
	                failures that is a ``CalledProcessError`` carrying the
	                captured stderr).
	        """
	        try:
	            return self._read_audio(file_path)
	        except Exception as e:
	            raise RuntimeError(f"Error loading audio file {file_path}: {str(e)}") from e

	    def preprocess_audio(self, audio_path: str) -> Tuple[np.ndarray, np.ndarray]:
	        """Turn an audio file into a fixed-length clip and its mel spectrogram.

	        Steps:
	            1. Load the audio at ``config.sample_rate`` (m4a goes through the
	               same ffmpeg conversion as ``load_audio``).
	            2. Randomly crop, or zero-pad, to ``self.target_length`` samples.
	            3. Compute the mel power spectrogram and convert it to dB
	               relative to its own maximum.
	            4. Crop or pad the time axis to 32 frames.

	        Args:
	            audio_path: Path to the audio file.

	        Returns:
	            ``(audio, mel_spec)`` where ``audio`` has ``self.target_length``
	            samples and ``mel_spec`` has 32 columns (time frames).
	        """
	        audio = self._read_audio(audio_path)

	        # Bring the clip to the fixed length.
	        if len(audio) > self.target_length:
	            # Random crop. randint's upper bound is exclusive, so add 1 to
	            # make the last valid start offset reachable.
	            start = np.random.randint(0, len(audio) - self.target_length + 1)
	            audio = audio[start:start + self.target_length]
	        else:
	            audio = self.pad_or_trim(audio, self.target_length)

	        # Mel power spectrogram.
	        mel_spec = librosa.feature.melspectrogram(
	            y=audio,
	            sr=self.config.sample_rate,
	            n_fft=self.config.n_fft,
	            hop_length=self.config.hop_length,
	            win_length=self.config.win_length,
	            n_mels=self.config.n_mels,
	            fmin=self.config.mel_fmin,
	            fmax=self.config.mel_fmax,
	        )

	        # Convert to decibels; with ref=np.max the loudest bin becomes 0 dB.
	        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

	        # Force the time axis (axis 1) to a fixed number of frames.
	        target_time_steps = 32
	        frames = mel_spec.shape[1]
	        if frames > target_time_steps:
	            mel_spec = mel_spec[:, :target_time_steps]
	        else:
	            # Pad with the quietest value present: in this dB scale 0 is the
	            # peak, so zero-padding would mark the padding as maximally loud.
	            floor = mel_spec.min() if mel_spec.size else 0.0
	            pad_width = ((0, 0), (0, target_time_steps - frames))
	            mel_spec = np.pad(
	                mel_spec, pad_width, mode='constant', constant_values=floor
	            )

	        return audio, mel_spec

	    def audio_to_mel_spec(self, audio: np.ndarray) -> np.ndarray:
	        """Convert a waveform to a log-mel spectrogram.

	        Unlike ``preprocess_audio`` this works on the STFT magnitude (not
	        power) and uses the natural log with values clipped at 1e-5, and it
	        does not fix the number of time frames.

	        Args:
	            audio: Waveform samples.

	        Returns:
	            The natural-log mel spectrogram.
	        """
	        # Short-time Fourier transform.
	        stft = librosa.stft(
	            audio,
	            n_fft=self.config.n_fft,
	            hop_length=self.config.hop_length,
	            win_length=self.config.win_length,
	        )

	        # Magnitude spectrum.
	        magnitude = np.abs(stft)

	        # Mel filter bank.
	        mel_basis = librosa.filters.mel(
	            sr=self.config.sample_rate,
	            n_fft=self.config.n_fft,
	            n_mels=self.config.n_mels,
	            fmin=self.config.mel_fmin,
	            fmax=self.config.mel_fmax,
	        )

	        # Apply the filter bank, then move to a log scale; the clip keeps
	        # log() away from zero.
	        mel_spec = np.dot(mel_basis, magnitude)
	        mel_spec = np.log(np.clip(mel_spec, a_min=1e-5, a_max=None))

	        return mel_spec

	    def normalize_audio(self, audio: np.ndarray) -> np.ndarray:
	        """Peak-normalize audio so its largest absolute sample is 1.

	        Empty or completely silent input has no peak to scale by; it is
	        returned as an unchanged copy instead of raising or producing NaNs.

	        Args:
	            audio: Waveform samples.

	        Returns:
	            A new array with the normalized samples.
	        """
	        if audio.size == 0:
	            return audio.copy()
	        peak = np.max(np.abs(audio))
	        if peak == 0:
	            return audio.copy()
	        return audio / peak

	    def pad_or_trim(self, audio: np.ndarray, target_length: int) -> np.ndarray:
	        """Zero-pad at the end, or truncate, audio to ``target_length`` samples.

	        Args:
	            audio: Waveform samples (indexed along its first axis).
	            target_length: Desired number of samples.

	        Returns:
	            The first ``target_length`` samples when the input is longer,
	            otherwise the input followed by trailing zeros.
	        """
	        if len(audio) > target_length:
	            return audio[:target_length]
	        return np.pad(audio, (0, target_length - len(audio)), mode='constant')