# Spaces: Sleeping  (page-status text that appears to have been captured with this file; not part of the module)
import os
import tempfile
from pathlib import Path
from typing import List, Union

import numpy as np
import torch
import torchaudio
from speechbrain.pretrained import EncoderClassifier
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan


class VoiceCloneSystem:
    """Voice-cloning pipeline: speak input text in the voice of a reference speaker.

    The pipeline has three stages, each backed by a pretrained model loaded in
    ``__init__``:

    1. a SpeechBrain x-vector speaker encoder that turns reference audio into
       a speaker embedding,
    2. the SpeechT5 text-to-speech model, conditioned on that embedding,
    3. the SpeechT5 HiFi-GAN vocoder that produces the final waveform.
    """

    # Sample rate (Hz) that reference audio is resampled to before encoding.
    SAMPLE_RATE = 16000
    # Every reference clip is cropped / zero-padded to this many seconds.
    REFERENCE_SECONDS = 3
    # Size of the speaker embedding accepted by ``generate_speech``.
    EMBEDDING_DIM = 512

    def __init__(self, device: str = "cpu"):
        """Load the speaker encoder, the TTS model and the vocoder.

        Args:
            device: Device the models are placed on, 'cpu' or 'cuda'.
        """
        self.device = device
        print("正在加载模型...")

        # Speaker encoder (x-vector model); checkpoints are cached under
        # the relative directory ``tmp/spkrec-xvect-voxceleb``.
        self.speaker_encoder = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-xvect-voxceleb",
            savedir="tmp/spkrec-xvect-voxceleb",
            run_opts={"device": device},
        )

        # Text-to-speech model and its text processor.
        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
            "microsoft/speecht5_tts"
        ).to(device)

        # Vocoder.
        self.vocoder = SpeechT5HifiGan.from_pretrained(
            "microsoft/speecht5_hifigan"
        ).to(device)
        print("模型加载完成!")

    def process_audio(self, waveform: torch.Tensor, sr: int) -> torch.Tensor:
        """Resample, mix down to mono and fix the length of a reference clip.

        Args:
            waveform: Audio of shape ``[channels, time]``; a 1-D ``[time]``
                tensor is also accepted and treated as a single channel.
            sr: Sample rate of ``waveform`` in Hz.

        Returns:
            A ``[1, SAMPLE_RATE * REFERENCE_SECONDS]`` tensor on the same
            device and with the same dtype as the (resampled) input.
        """
        # Accept bare 1-D audio instead of failing on ``shape[1]`` below.
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)

        # Resample to the rate the rest of the pipeline works at.
        if sr != self.SAMPLE_RATE:
            waveform = torchaudio.functional.resample(waveform, sr, self.SAMPLE_RATE)

        # Mix multi-channel audio down to mono.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        target_length = self.SAMPLE_RATE * self.REFERENCE_SECONDS
        current_length = waveform.shape[1]
        if current_length > target_length:
            # Too long: keep the centered window.
            start = (current_length - target_length) // 2
            waveform = waveform[:, start:start + target_length]
        elif current_length < target_length:
            # Too short: zero-pad on the right. ``pad`` allocates on the
            # waveform's own device/dtype, unlike a fresh ``torch.zeros``.
            waveform = torch.nn.functional.pad(
                waveform, (0, target_length - current_length)
            )
        return waveform

    def extract_speaker_embedding(
        self,
        audio_paths: List[Union[str, Path]]
    ) -> torch.Tensor:
        """Compute one speaker embedding from one or more reference recordings.

        Each file is loaded, passed through ``process_audio`` and encoded;
        the per-file embeddings are L2-normalized, averaged, and the average
        is L2-normalized again so the result has unit length.

        Args:
            audio_paths: Paths of the reference audio files. A single path
                (``str`` or ``Path``) is accepted as a one-element list.

        Returns:
            The averaged speaker embedding, with a leading batch dimension
            added when the average is 1-D (expected shape ``[1, 512]``).

        Raises:
            ValueError: If ``audio_paths`` is empty.
            Exception: Any error raised while loading or encoding a file is
                printed and re-raised unchanged.
        """
        # A bare path would otherwise be iterated character by character.
        if isinstance(audio_paths, (str, Path)):
            audio_paths = [audio_paths]
        if not audio_paths:
            raise ValueError("audio_paths must contain at least one reference audio file")

        embeddings = []
        for audio_path in audio_paths:
            try:
                waveform, sr = torchaudio.load(str(audio_path))
                waveform = self.process_audio(waveform, sr)

                with torch.no_grad():
                    # The encoder is fed a [batch, time] tensor with batch == 1.
                    batch = waveform.reshape(1, -1).to(self.device)
                    embedding = self.speaker_encoder.encode_batch(batch)
                    # The reference SpeechT5 x-vector recipe L2-normalizes the
                    # encoder output before it is used as a speaker embedding.
                    embedding = torch.nn.functional.normalize(embedding, dim=-1)
                    # Drop all singleton dimensions.
                    embedding = embedding.squeeze()

                print(f"Raw embedding shape: {embedding.shape}")
                embeddings.append(embedding)
            except Exception as e:
                print(f"Error processing audio {audio_path}: {str(e)}")
                raise

        # Average over the reference clips, then restore unit length (the
        # mean of unit vectors is generally shorter than 1).
        mean_embedding = torch.stack(embeddings).mean(dim=0)
        mean_embedding = torch.nn.functional.normalize(mean_embedding, dim=-1)

        # Make sure the result carries a batch dimension: [1, 512].
        if mean_embedding.dim() == 1:
            mean_embedding = mean_embedding.unsqueeze(0)

        print(f"Final embedding shape: {mean_embedding.shape}")
        return mean_embedding

    def generate_speech(
        self,
        text: str,
        speaker_embedding: torch.Tensor
    ) -> torch.Tensor:
        """Synthesize ``text`` conditioned on a speaker embedding.

        Args:
            text: Text to speak.
            speaker_embedding: Speaker embedding of shape ``[1, 512]``.

        Returns:
            The waveform returned by the TTS model's ``generate_speech``
            when called with the vocoder.

        Raises:
            ValueError: If ``speaker_embedding`` is not 2-D with a second
                dimension of 512.
            Exception: Any other error is printed and re-raised unchanged.
        """
        try:
            # Validate the cheap precondition before tokenizing the text.
            if (
                speaker_embedding.dim() != 2
                or speaker_embedding.size(1) != self.EMBEDDING_DIM
            ):
                raise ValueError(f"Speaker embedding should have shape [1, 512], but got {speaker_embedding.shape}")

            inputs = self.processor(text=text, return_tensors="pt")

            speech = self.tts_model.generate_speech(
                inputs["input_ids"].to(self.device),
                speaker_embedding.to(self.device),
                vocoder=self.vocoder,
            )
            return speech
        except Exception as e:
            print(f"Error in generate_speech: {str(e)}")
            raise

    def clone_voice(
        self,
        text: str,
        reference_audio_paths: List[Union[str, Path]]
    ) -> torch.Tensor:
        """Main entry point: speak ``text`` in the voice of the reference audio.

        Args:
            text: Text to convert to speech.
            reference_audio_paths: Paths of the reference audio files.

        Returns:
            The generated speech waveform.

        Raises:
            Exception: Any error from embedding extraction or synthesis is
                printed and re-raised unchanged.
        """
        try:
            # 1. Extract the speaker embedding.
            speaker_embedding = self.extract_speaker_embedding(reference_audio_paths)
            # 2. Synthesize speech with it.
            return self.generate_speech(text, speaker_embedding)
        except Exception as e:
            print(f"Error in clone_voice: {str(e)}")
            raise

    def save_audio(
        self,
        waveform: torch.Tensor,
        output_path: Union[str, Path],
        sample_rate: int = 16000
    ):
        """Write a waveform to an audio file, creating parent directories.

        Args:
            waveform: Audio to save, either 1-D ``[time]`` or already
                shaped ``[channels, time]``.
            output_path: Destination file path.
            sample_rate: Sample rate in Hz written to the file.

        Raises:
            Exception: Any error while creating the directory or saving is
                printed and re-raised unchanged.
        """
        try:
            # Make sure the output directory exists.
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)

            waveform = waveform.detach().cpu()
            # Only add a channel dimension when it is missing; unsqueezing
            # a [channels, time] tensor would produce a 3-D tensor.
            if waveform.dim() == 1:
                waveform = waveform.unsqueeze(0)

            torchaudio.save(str(output_path), waveform, sample_rate)
        except Exception as e:
            print(f"Error saving audio: {str(e)}")
            raise