import os

import torch
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

from src.sbv2.synthesizer_trn import SynthesizerTrn
from src.sbv2.text import text_to_sequence
from src.sbv2.commons import get_hparams_from_file

# Read settings from environment variables
MODEL_REPO = os.getenv("MODEL_REPO")
HF_TOKEN = os.getenv("HF_TOKEN")
CACHE_DIR = "/tmp/hf_cache"

# Hold the model and device as module-level globals
model = None
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_model():
    global model
    # Download the model files from the Hugging Face Hub
    config_path = hf_hub_download(repo_id=MODEL_REPO, filename="config.json", token=HF_TOKEN, cache_dir=CACHE_DIR)
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename="model.safetensors", token=HF_TOKEN, cache_dir=CACHE_DIR)
    style_path = hf_hub_download(repo_id=MODEL_REPO, filename="style_vectors.npy", token=HF_TOKEN, cache_dir=CACHE_DIR)

    # Load hyperparameters from the config file
    hps = get_hparams_from_file(config_path)

    # Initialize the model
    model = SynthesizerTrn(
        n_vocab=70,  # provisional value: the symbols list is unavailable, so assume a typical Japanese TTS vocabulary size
        spec_channels=hps["model"].get("spec_channels", 80),
        segment_size=None,
        inter_channels=hps["model"]["hidden_channels"],
        hidden_channels=hps["model"]["hidden_channels"],
        filter_channels=hps["model"]["filter_channels"],
        n_heads=hps["model"]["n_heads"],
        n_layers=int(hps["model"]["encoder_n_layers"]),
        kernel_size=hps["model"]["encoder_kernel_size"],
        p_dropout=hps["model"]["dropout"],
        resblock=str(hps["model"].get("resblock", 2)),
        resblock_kernel_sizes=hps["model"]["resblock_kernel_sizes"],
        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5]],
        upsample_rates=hps["model"]["upsample_rates"],
        upsample_initial_channel=512,  # conventionally fixed at 512
        upsample_kernel_sizes=hps["model"]["upsample_kernel_sizes"],
        gin_channels=hps["model"]["gin_channels"],
        out_channels=hps["model"].get("spec_channels", 80),
        dec_kernel_size=hps["model"]["encoder_kernel_size"],
        enc_channels=hps["model"]["encoder_hidden"],
        enc_out_channels=hps["model"]["encoder_hidden"] * 2,
        enc_kernel_size=hps["model"]["encoder_kernel_size"],
        enc_dilation_rate=hps["model"].get("enc_dilation_rate", 1),
        enc_n_layers=int(hps["model"]["encoder_n_layers"]),
        flow_hidden_channels=hps["model"]["hidden_channels"],
        flow_kernel_size=hps["model"]["flow_kernel_size"],
        flow_n_layers=int(hps["model"]["flow_n_layers"]),
        flow_n_flows=int(hps["model"]["flow_n_flows"]),
        sdp_hidden_channels=hps["model"]["sdp_filter_channels"],
        sdp_kernel_size=hps["model"]["sdp_kernel_size"],
        sdp_n_layers=int(hps["model"]["sdp_n_layers"]),
        sdp_dropout=hps["model"]["sdp_dropout"],
        sampling_rate=hps["data"]["sampling_rate"],
        filter_length=1024,
        hop_length=256,
        win_length=1024,
    ).to(device)

    # Load the weights from the safetensors checkpoint
    model_sd = load_file(model_path)
    model.load_state_dict(model_sd, strict=True)
    model.eval()

def synthesize_voice(text):
    # Convert the text to a symbol-ID tensor, then build the length and speaker-ID tensors
    x = torch.LongTensor(text_to_sequence(text, ['basic_cleaners'])).unsqueeze(0).to(device)
    x_lengths = torch.LongTensor([x.size(1)]).to(device)
    sid = torch.LongTensor([0]).to(device)  # speaker 0 (first entry in the speaker embedding table)

    with torch.no_grad():
        # infer() returns a tuple whose first element is the waveform tensor
        # of shape (batch, 1, samples); strip the batch and channel dims
        audio = model.infer(
            x, x_lengths, sid=sid,
            noise_scale=0.667, noise_scale_w=0.8, length_scale=1.0,
        )[0][0, 0].cpu().numpy()
    return audio