import os

import torch
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

from src.sbv2.synthesizer_trn import SynthesizerTrn
from src.sbv2.text import text_to_sequence
from src.sbv2.commons import get_hparams_from_file
# Read configuration from environment variables
MODEL_REPO = os.getenv("MODEL_REPO")
HF_TOKEN = os.getenv("HF_TOKEN")
CACHE_DIR = "/tmp/hf_cache"
# Model and device are kept as module-level globals
model = None
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
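
# hf_hub_download raises a validation error with an unhelpful message when
# repo_id is None, so failing fast makes a misconfigured deployment easier to
# diagnose. A minimal sketch, not part of the original app (HF_TOKEN may
# legitimately be unset for public repos, so only MODEL_REPO is checked):
if MODEL_REPO is None:
    raise RuntimeError("MODEL_REPO environment variable is not set")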

def load_model():
    global model
    # Download the model files from Hugging Face Hub
    config_path = hf_hub_download(repo_id=MODEL_REPO, filename="config.json", token=HF_TOKEN, cache_dir=CACHE_DIR)
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename="model.safetensors", token=HF_TOKEN, cache_dir=CACHE_DIR)
    # style_vectors.npy is downloaded alongside the weights but not used below
    style_path = hf_hub_download(repo_id=MODEL_REPO, filename="style_vectors.npy", token=HF_TOKEN, cache_dir=CACHE_DIR)
    # Load the hyperparameters from config.json
    hps = get_hparams_from_file(config_path)
    # Initialize the model
    model = SynthesizerTrn(
        n_vocab=70,  # tentative value (no symbols table available, so a typical Japanese TTS vocabulary is assumed)
        spec_channels=hps["model"].get("spec_channels", 80),
        segment_size=None,
        inter_channels=hps["model"]["hidden_channels"],
        hidden_channels=hps["model"]["hidden_channels"],
        filter_channels=hps["model"]["filter_channels"],
        n_heads=hps["model"]["n_heads"],
        n_layers=int(hps["model"]["encoder_n_layers"]),
        kernel_size=hps["model"]["encoder_kernel_size"],
        p_dropout=hps["model"]["dropout"],
        resblock=str(hps["model"].get("resblock", 2)),
        resblock_kernel_sizes=hps["model"]["resblock_kernel_sizes"],
        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5]],
        upsample_rates=hps["model"]["upsample_rates"],
        upsample_initial_channel=512,  # conventionally fixed at 512
        upsample_kernel_sizes=hps["model"]["upsample_kernel_sizes"],
        gin_channels=hps["model"]["gin_channels"],
        out_channels=hps["model"].get("spec_channels", 80),
        dec_kernel_size=hps["model"]["encoder_kernel_size"],
        enc_channels=hps["model"]["encoder_hidden"],
        enc_out_channels=hps["model"]["encoder_hidden"] * 2,
        enc_kernel_size=hps["model"]["encoder_kernel_size"],
        enc_dilation_rate=hps["model"].get("enc_dilation_rate", 1),
        enc_n_layers=int(hps["model"]["encoder_n_layers"]),
        flow_hidden_channels=hps["model"]["hidden_channels"],
        flow_kernel_size=hps["model"]["flow_kernel_size"],
        flow_n_layers=int(hps["model"]["flow_n_layers"]),
        flow_n_flows=int(hps["model"]["flow_n_flows"]),
        sdp_hidden_channels=hps["model"]["sdp_filter_channels"],
        sdp_kernel_size=hps["model"]["sdp_kernel_size"],
        sdp_n_layers=int(hps["model"]["sdp_n_layers"]),
        sdp_dropout=hps["model"]["sdp_dropout"],
        sampling_rate=hps["data"]["sampling_rate"],
        filter_length=1024,
        hop_length=256,
        win_length=1024,
    ).to(device)
    # Load the weights from the safetensors checkpoint
    model_sd = load_file(model_path)
    model.load_state_dict(model_sd, strict=True)
    model.eval()
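
# strict=True above raises if the checkpoint keys do not exactly match the
# modules built from the constructor arguments, several of which are guesses
# (e.g. n_vocab=70). When debugging such mismatches, a tolerant loader can
# report which keys differ. A sketch; load_weights_tolerant is a hypothetical
# helper, not part of the original app:
def load_weights_tolerant(path):
    sd = load_file(path)
    missing, unexpected = model.load_state_dict(sd, strict=False)
    if missing:
        print(f"Missing keys: {missing}")
    if unexpected:
        print(f"Unexpected keys: {unexpected}")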

def synthesize_voice(text):
    # Convert the text to an id sequence and add a batch dimension
    x = torch.LongTensor(text_to_sequence(text, ["basic_cleaners"])).unsqueeze(0).to(device)
    x_lengths = torch.LongTensor([x.size(1)]).to(device)
    # Speaker id 0 (single-speaker assumption)
    sid = torch.LongTensor([0]).to(device)
    # Run inference
    with torch.no_grad():
        audio = model.infer(x, x_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1.0)[0][0, 0].cpu().numpy()
    return audio
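
# Example usage, a sketch that is not part of the original app: load the model
# once, synthesize a clip, and write it out as WAV. Assumes the soundfile
# package is installed; the 44100 Hz rate is typical for Style-Bert-VITS2
# configs but should be read from hps["data"]["sampling_rate"] in practice.
if __name__ == "__main__":
    import soundfile as sf

    load_model()
    # "Hello, this is a test voice."
    audio = synthesize_voice("こんにちは、これはテスト音声です。")
    sf.write("/tmp/output.wav", audio, 44100)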