import torch
import torchaudio
import numpy as np
from pathlib import Path
from typing import List, Union
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import tempfile
import os


class VoiceCloneSystem:
    """Voice cloning system: converts input text to speech in a target speaker's voice."""

    def __init__(self, device: str = "cpu"):
        """
        Initialize the voice cloning system.

        Args:
            device: Device to run on, 'cpu' or 'cuda'.
        """
        self.device = device
        print("Loading models...")

        # Load the speaker encoder
        self.speaker_encoder = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-xvect-voxceleb",
            savedir="tmp/spkrec-xvect-voxceleb",
            run_opts={"device": device}
        )

        # Load the text-to-speech model
        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
            "microsoft/speecht5_tts"
        ).to(device)

        # Load the vocoder
        self.vocoder = SpeechT5HifiGan.from_pretrained(
            "microsoft/speecht5_hifigan"
        ).to(device)

        print("Models loaded!")

    def process_audio(self, waveform: torch.Tensor, sr: int) -> torch.Tensor:
        """
        Preprocess audio: resample and convert to mono.

        Args:
            waveform: Input audio waveform.
            sr: Sample rate of the input.

        Returns:
            The processed audio waveform.
        """
        # Resample to 16 kHz
        if sr != 16000:
            waveform = torchaudio.functional.resample(waveform, sr, 16000)

        # Ensure the audio is mono
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Normalize the audio length to 3 seconds
        target_length = 16000 * 3
        current_length = waveform.shape[1]
        if current_length > target_length:
            # Too long: keep the middle segment
            start = (current_length - target_length) // 2
            waveform = waveform[:, start:start + target_length]
        elif current_length < target_length:
            # Too short: pad with zeros
            padding = torch.zeros(1, target_length - current_length)
            waveform = torch.cat([waveform, padding], dim=1)

        return waveform

    def extract_speaker_embedding(
        self,
        audio_paths: List[Union[str, Path]]
    ) -> torch.Tensor:
        """
        Extract a speaker embedding from one or more reference audio files.

        Args:
            audio_paths: List of reference audio file paths.

        Returns:
            The speaker embedding vector.
        """
        embeddings = []
        for audio_path in audio_paths:
            try:
                # Load the audio
                waveform, sr = torchaudio.load(str(audio_path))
                # Preprocess it
                waveform = self.process_audio(waveform, sr)
                # Extract the embedding
                with torch.no_grad():
                    # Ensure the input has shape [batch, time]
                    if waveform.dim() == 2:
                        waveform = waveform.squeeze(0)
                    # Encode and clean up the dimensions
                    embedding = self.speaker_encoder.encode_batch(
                        waveform.unsqueeze(0).to(self.device)
                    )
                    embedding = embedding.squeeze()  # Remove all singleton dimensions
                    print(f"Raw embedding shape: {embedding.shape}")
                    embeddings.append(embedding)
            except Exception as e:
                print(f"Error processing audio {audio_path}: {str(e)}")
                raise

        # Average the embeddings across all reference clips
        mean_embedding = torch.stack(embeddings).mean(dim=0)

        # Ensure the final shape is [1, 512]
        if mean_embedding.dim() == 1:
            mean_embedding = mean_embedding.unsqueeze(0)

        print(f"Final embedding shape: {mean_embedding.shape}")
        return mean_embedding

    def generate_speech(
        self,
        text: str,
        speaker_embedding: torch.Tensor
    ) -> torch.Tensor:
        """
        Generate speech from text.

        Args:
            text: Input text.
            speaker_embedding: Speaker embedding vector.

        Returns:
            The generated speech waveform.
        """
        try:
            # Tokenize the input text
            inputs = self.processor(text=text, return_tensors="pt")

            # Validate the speaker embedding shape
            if speaker_embedding.dim() != 2 or speaker_embedding.size(1) != 512:
                raise ValueError(
                    f"Speaker embedding should have shape [1, 512], "
                    f"but got {speaker_embedding.shape}"
                )

            # Generate the speech waveform
            speech = self.tts_model.generate_speech(
                inputs["input_ids"].to(self.device),
                speaker_embedding.to(self.device),
                vocoder=self.vocoder
            )
            return speech
        except Exception as e:
            print(f"Error in generate_speech: {str(e)}")
            raise

    def clone_voice(
        self,
        text: str,
        reference_audio_paths: List[Union[str, Path]]
    ) -> torch.Tensor:
        """
        Main entry point: clone a voice.

        Args:
            text: Text to convert to speech.
            reference_audio_paths: List of reference audio file paths.

        Returns:
            The generated speech waveform.
        """
        try:
            # 1. Extract the speaker embedding
            speaker_embedding = self.extract_speaker_embedding(reference_audio_paths)
            # 2. Generate the speech
            speech = self.generate_speech(text, speaker_embedding)
            return speech
        except Exception as e:
            print(f"Error in clone_voice: {str(e)}")
            raise

    def save_audio(
        self,
        waveform: torch.Tensor,
        output_path: Union[str, Path],
        sample_rate: int = 16000
    ):
        """
        Save an audio waveform to a file.

        Args:
            waveform: Audio waveform.
            output_path: Output file path.
            sample_rate: Sample rate.
        """
        try:
            # Make sure the output directory exists
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)

            # Save the audio (add a channel dimension and move to CPU)
            torchaudio.save(
                str(output_path),
                waveform.unsqueeze(0).cpu(),
                sample_rate
            )
        except Exception as e:
            print(f"Error saving audio: {str(e)}")
            raise
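

# Minimal usage sketch (not part of the original file): the reference clip
# paths and the output path below are hypothetical placeholders. SpeechT5
# is trained on English text, and the reference clips should be short
# recordings of the target speaker.
if __name__ == "__main__":
    system = VoiceCloneSystem(device="cuda" if torch.cuda.is_available() else "cpu")
    speech = system.clone_voice(
        text="Hello, this is a cloned voice.",
        reference_audio_paths=["reference_1.wav", "reference_2.wav"],  # hypothetical paths
    )
    system.save_audio(speech, "output/cloned_voice.wav")  # hypothetical output path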