import os

import torch
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

from src.sbv2.commons import get_hparams_from_file
from src.sbv2.synthesizer_trn import SynthesizerTrn
from src.sbv2.text import text_to_sequence

# Read settings from environment variables
MODEL_REPO = os.getenv("MODEL_REPO")
HF_TOKEN = os.getenv("HF_TOKEN")
CACHE_DIR = "/tmp/hf_cache"

# Model and device live at module level so both functions can share them
model = None
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def load_model():
    global model

    # Download the model files from the Hugging Face Hub
    config_path = hf_hub_download(repo_id=MODEL_REPO, filename="config.json", token=HF_TOKEN, cache_dir=CACHE_DIR)
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename="model.safetensors", token=HF_TOKEN, cache_dir=CACHE_DIR)
    # style_path is downloaded but not consumed below (see the note after load_model)
    style_path = hf_hub_download(repo_id=MODEL_REPO, filename="style_vectors.npy", token=HF_TOKEN, cache_dir=CACHE_DIR)

    # Load hyperparameters from config.json
    hps = get_hparams_from_file(config_path)

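    # Keys the config is assumed to provide, inferred from the accesses below
    # (actual Style-Bert-VITS2 configs may differ):
    #   hps["model"]: hidden_channels, filter_channels, n_heads, encoder_n_layers,
    #                 encoder_kernel_size, dropout, resblock_kernel_sizes,
    #                 upsample_rates, upsample_kernel_sizes, gin_channels,
    #                 encoder_hidden, flow_kernel_size, flow_n_layers, flow_n_flows,
    #                 sdp_filter_channels, sdp_kernel_size, sdp_n_layers, sdp_dropout
    #   hps["data"]:  sampling_rate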
    # Initialize the model
    model = SynthesizerTrn(
        n_vocab=70,  # provisional: no symbols module is available, so a typical Japanese TTS vocabulary size is assumed
        spec_channels=hps["model"].get("spec_channels", 80),
        segment_size=None,
        inter_channels=hps["model"]["hidden_channels"],
        hidden_channels=hps["model"]["hidden_channels"],
        filter_channels=hps["model"]["filter_channels"],
        n_heads=hps["model"]["n_heads"],
        n_layers=int(hps["model"]["encoder_n_layers"]),
        kernel_size=hps["model"]["encoder_kernel_size"],
        p_dropout=hps["model"]["dropout"],
        resblock=str(hps["model"].get("resblock", 2)),
        resblock_kernel_sizes=hps["model"]["resblock_kernel_sizes"],
        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5]],
        upsample_rates=hps["model"]["upsample_rates"],
        upsample_initial_channel=512,  # conventionally fixed at 512
        upsample_kernel_sizes=hps["model"]["upsample_kernel_sizes"],
        gin_channels=hps["model"]["gin_channels"],
        out_channels=hps["model"].get("spec_channels", 80),
        dec_kernel_size=hps["model"]["encoder_kernel_size"],
        enc_channels=hps["model"]["encoder_hidden"],
        enc_out_channels=hps["model"]["encoder_hidden"] * 2,
        enc_kernel_size=hps["model"]["encoder_kernel_size"],
        enc_dilation_rate=hps["model"].get("enc_dilation_rate", 1),
        enc_n_layers=int(hps["model"]["encoder_n_layers"]),
        flow_hidden_channels=hps["model"]["hidden_channels"],
        flow_kernel_size=hps["model"]["flow_kernel_size"],
        flow_n_layers=int(hps["model"]["flow_n_layers"]),
        flow_n_flows=int(hps["model"]["flow_n_flows"]),
        sdp_hidden_channels=hps["model"]["sdp_filter_channels"],
        sdp_kernel_size=hps["model"]["sdp_kernel_size"],
        sdp_n_layers=int(hps["model"]["sdp_n_layers"]),
        sdp_dropout=hps["model"]["sdp_dropout"],
        sampling_rate=hps["data"]["sampling_rate"],
        filter_length=1024,
        hop_length=256,
        win_length=1024,
    ).to(device)

    # Load the weights from the safetensors checkpoint
    # (strict=True raises on any missing or unexpected keys; pass strict=False
    # to inspect mismatches instead of failing)
    model_sd = load_file(model_path)
    model.load_state_dict(model_sd, strict=True)
    model.eval()
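

# Note: style_vectors.npy is fetched in load_model() but never used there. A
# hedged sketch of how it could be read (np.load is standard NumPy; whether and
# how SynthesizerTrn would consume a style embedding is an assumption):
#   import numpy as np
#   style_vectors = np.load(style_path)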


def synthesize_voice(text):
    # Run inference: convert the text to a phoneme-ID sequence, then synthesize
    x = torch.LongTensor(text_to_sequence(text, ['basic_cleaners'])).unsqueeze(0).to(device)
    x_lengths = torch.LongTensor([x.size(1)]).to(device)
    sid = torch.LongTensor([0]).to(device)  # speaker ID 0
    with torch.no_grad():
        audio = model.infer(x, x_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1.0)[0][0, 0].cpu().numpy()
    return audio
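

# Minimal usage sketch (an assumption, not part of the original Space; the app
# presumably wires these functions into a web UI). `soundfile` is illustrative,
# and 44100 Hz is a placeholder for hps["data"]["sampling_rate"].
if __name__ == "__main__":
    import soundfile as sf

    load_model()
    audio = synthesize_voice("こんにちは")
    sf.write("output.wav", audio, 44100)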