import json
import os

import numpy as np
import soundfile as sf
import torch

from fastapi import FastAPI
from huggingface_hub import hf_hub_download

from src.sbv2 import utils
from src.sbv2.synthesizer_trn import SynthesizerTrn
from src.sbv2.text import text_to_sequence

MODEL_REPO = os.getenv("MODEL_REPO")
HF_TOKEN = os.getenv("HF_TOKEN")
CACHE_DIR = "/tmp/models"
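
# Optional guard (not strictly required): fail fast when the repo id is missing, so a
# misconfigured container errors at import time rather than on the first request.
# HF_TOKEN may legitimately be None for public repos, so only MODEL_REPO is checked.
if MODEL_REPO is None:
    raise RuntimeError("MODEL_REPO environment variable is not set")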

app = FastAPI()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Populated by load_model(); declared here so the module-level names are explicit
model = None
hps = None

def load_model():
    global model, hps

    # Download config.json, model.safetensors, and style_vectors.npy
    # (style_vectors.npy is fetched for style-based inference, even though
    # this minimal endpoint does not use it yet)
    config_path = hf_hub_download(repo_id=MODEL_REPO, filename="config.json", token=HF_TOKEN, cache_dir=CACHE_DIR)
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename="model.safetensors", token=HF_TOKEN, cache_dir=CACHE_DIR)
    style_path = hf_hub_download(repo_id=MODEL_REPO, filename="style_vectors.npy", token=HF_TOKEN, cache_dir=CACHE_DIR)

    # Load the hyperparameter config
    with open(config_path, "r", encoding="utf-8") as f:
        hps = json.load(f)

    n_vocab = 77  # number of symbols for the 小春音アミ (Koharune Ami) model
    segment_size = 8192  # usually a fixed value; recommended for Style-BERT-VITS2

    model = SynthesizerTrn(
        n_vocab,
        hps["model"]["p_dropout"],
        segment_size // 2,
        hps["model"]["inter_channels"],
        hps["model"]["out_channels"],
        hps["model"]["hidden_channels"],
        hps["model"]["filter_channels"],
        hps["model"]["dec_kernel_size"],
        hps["model"]["enc_channels"],
        hps["model"]["enc_out_channels"],
        hps["model"]["enc_kernel_size"],
        hps["model"]["enc_dilation_rate"],
        hps["model"]["enc_n_layers"],
        hps["model"]["flow_hidden_channels"],
        hps["model"]["flow_kernel_size"],
        hps["model"]["flow_n_layers"],
        hps["model"]["flow_n_flows"],
        hps["model"]["sdp_hidden_channels"],
        hps["model"]["sdp_kernel_size"],
        hps["model"]["sdp_n_layers"],
        hps["model"]["sdp_dropout"],
        hps["audio"]["sampling_rate"],
        hps["audio"]["filter_length"],
        hps["audio"]["hop_length"],
        hps["audio"]["win_length"],
        hps["model"]["resblock"],
        hps["model"]["resblock_kernel_sizes"],
        hps["model"]["resblock_dilation_sizes"],
        hps["model"]["upsample_rates"],
        hps["model"]["upsample_initial_channel"],
        hps["model"]["upsample_kernel_sizes"],
        hps["model"].get("gin_channels", 0)
    ).to(device)

    # Load the safetensors weights
    utils.load_checkpoint(model_path, model, strict=True)
    model.eval()

# Load the model once at import time so the /voice endpoint has it available
load_model()

@app.get("/voice")
def synthesize(text: str):
    # Convert the text to a phoneme id sequence
    sequence = np.array(text_to_sequence(text, hps["data"]["text_cleaners"]), dtype=np.int64)
    sequence = torch.LongTensor(sequence).unsqueeze(0).to(device)

    # Run inference
    with torch.no_grad():
        audio = model.infer(sequence, noise_scale=0.667, noise_scale_w=0.8, length_scale=1.0)[0][0, 0].data.cpu().numpy()

    # Save to a temporary WAV file and return its server-side path
    output_path = "/tmp/output.wav"
    sf.write(output_path, audio, hps["audio"]["sampling_rate"])

    return {"audio_path": output_path}