import os

import torch
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

from src.sbv2.synthesizer_trn import SynthesizerTrn
from src.sbv2.text import text_to_sequence
from src.sbv2.commons import get_hparams_from_file

# Read settings from environment variables
MODEL_REPO = os.getenv("MODEL_REPO")
HF_TOKEN = os.getenv("HF_TOKEN")
CACHE_DIR = "/tmp/hf_cache"

# Keep the model and device as module-level globals
model = None
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def load_model():
    global model
    # Download the model files from Hugging Face Hub
    config_path = hf_hub_download(repo_id=MODEL_REPO, filename="config.json", token=HF_TOKEN, cache_dir=CACHE_DIR)
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename="model.safetensors", token=HF_TOKEN, cache_dir=CACHE_DIR)
    # style_vectors.npy is fetched alongside the weights but is not used in this snippet
    style_path = hf_hub_download(repo_id=MODEL_REPO, filename="style_vectors.npy", token=HF_TOKEN, cache_dir=CACHE_DIR)

    # Load the config
    hps = get_hparams_from_file(config_path)

    # Initialize the model
    model = SynthesizerTrn(
        n_vocab=70,  # provisional value (no symbols list available; assumes a typical Japanese TTS setup)
        spec_channels=hps["model"].get("spec_channels", 80),
        segment_size=None,
        inter_channels=hps["model"]["hidden_channels"],
        hidden_channels=hps["model"]["hidden_channels"],
        filter_channels=hps["model"]["filter_channels"],
        n_heads=hps["model"]["n_heads"],
        n_layers=int(hps["model"]["encoder_n_layers"]),
        kernel_size=hps["model"]["encoder_kernel_size"],
        p_dropout=hps["model"]["dropout"],
        resblock=str(hps["model"].get("resblock", 2)),
        resblock_kernel_sizes=hps["model"]["resblock_kernel_sizes"],
        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5]],
        upsample_rates=hps["model"]["upsample_rates"],
        upsample_initial_channel=512,  # usually fixed at 512
        upsample_kernel_sizes=hps["model"]["upsample_kernel_sizes"],
        gin_channels=hps["model"]["gin_channels"],
        out_channels=hps["model"].get("spec_channels", 80),
        dec_kernel_size=hps["model"]["encoder_kernel_size"],
        enc_channels=hps["model"]["encoder_hidden"],
        enc_out_channels=hps["model"]["encoder_hidden"] * 2,
        enc_kernel_size=hps["model"]["encoder_kernel_size"],
        enc_dilation_rate=hps["model"].get("enc_dilation_rate", 1),
        enc_n_layers=int(hps["model"]["encoder_n_layers"]),
        flow_hidden_channels=hps["model"]["hidden_channels"],
        flow_kernel_size=hps["model"]["flow_kernel_size"],
        flow_n_layers=int(hps["model"]["flow_n_layers"]),
        flow_n_flows=int(hps["model"]["flow_n_flows"]),
        sdp_hidden_channels=hps["model"]["sdp_filter_channels"],
        sdp_kernel_size=hps["model"]["sdp_kernel_size"],
        sdp_n_layers=int(hps["model"]["sdp_n_layers"]),
        sdp_dropout=hps["model"]["sdp_dropout"],
        sampling_rate=hps["data"]["sampling_rate"],
        filter_length=1024,
        hop_length=256,
        win_length=1024,
    ).to(device)

    # Load the weights from the safetensors file
    model_sd = load_file(model_path)
    model.load_state_dict(model_sd, strict=True)
    model.eval()


def synthesize_voice(text):
    # Convert the input text to a symbol ID sequence and run inference
    x = torch.LongTensor(text_to_sequence(text, ["basic_cleaners"])).unsqueeze(0).to(device)
    x_lengths = torch.LongTensor([x.size(1)]).to(device)
    sid = torch.LongTensor([0]).to(device)  # speaker ID 0
    with torch.no_grad():
        audio = model.infer(
            x,
            x_lengths,
            sid=sid,
            noise_scale=0.667,
            noise_scale_w=0.8,
            length_scale=1.0,
        )[0][0, 0].cpu().numpy()
    return audio
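

# --- Usage sketch (added illustration, not part of the original module) ---
# A minimal example of driving the two functions above and writing the result
# to a WAV file. The soundfile dependency, the output path, and the 44100 Hz
# rate are assumptions: the authoritative value is hps["data"]["sampling_rate"]
# from the downloaded config, which load_model() as written does not expose.
if __name__ == "__main__":
    import soundfile as sf

    load_model()
    audio = synthesize_voice("こんにちは、音声合成のテストです。")
    sf.write("output.wav", audio, 44100)  # assumed rate; match the model config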