import os
import json
import torch
import numpy as np
import soundfile as sf
from fastapi import FastAPI
from huggingface_hub import hf_hub_download

from src.sbv2 import utils
from src.sbv2.synthesizer_trn import SynthesizerTrn
from src.sbv2.text import text_to_sequence
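
# The model repo ID and access token are read from environment variables
# (on a Hugging Face Space these are typically configured as Space secrets).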
MODEL_REPO = os.getenv("MODEL_REPO")
HF_TOKEN = os.getenv("HF_TOKEN")
CACHE_DIR = "/tmp/models"
app = FastAPI()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_model():
    global model, hps
    # Download config.json, model.safetensors, and style_vectors.npy
    config_path = hf_hub_download(repo_id=MODEL_REPO, filename="config.json", token=HF_TOKEN, cache_dir=CACHE_DIR)
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename="model.safetensors", token=HF_TOKEN, cache_dir=CACHE_DIR)
    style_path = hf_hub_download(repo_id=MODEL_REPO, filename="style_vectors.npy", token=HF_TOKEN, cache_dir=CACHE_DIR)
    # Load the config (hyperparameters)
    with open(config_path, "r", encoding="utf-8") as f:
        hps = json.load(f)
    n_vocab = 77  # number of text symbols for the Koharune Ami model
    segment_size = 8192  # usually a fixed value; the Style-BERT-VITS2 recommended setting
    model = SynthesizerTrn(
        n_vocab,
        hps["model"]["p_dropout"],
        segment_size // 2,
        hps["model"]["inter_channels"],
        hps["model"]["out_channels"],
        hps["model"]["hidden_channels"],
        hps["model"]["filter_channels"],
        hps["model"]["dec_kernel_size"],
        hps["model"]["enc_channels"],
        hps["model"]["enc_out_channels"],
        hps["model"]["enc_kernel_size"],
        hps["model"]["enc_dilation_rate"],
        hps["model"]["enc_n_layers"],
        hps["model"]["flow_hidden_channels"],
        hps["model"]["flow_kernel_size"],
        hps["model"]["flow_n_layers"],
        hps["model"]["flow_n_flows"],
        hps["model"]["sdp_hidden_channels"],
        hps["model"]["sdp_kernel_size"],
        hps["model"]["sdp_n_layers"],
        hps["model"]["sdp_dropout"],
        hps["audio"]["sampling_rate"],
        hps["audio"]["filter_length"],
        hps["audio"]["hop_length"],
        hps["audio"]["win_length"],
        hps["model"]["resblock"],
        hps["model"]["resblock_kernel_sizes"],
        hps["model"]["resblock_dilation_sizes"],
        hps["model"]["upsample_rates"],
        hps["model"]["upsample_initial_channel"],
        hps["model"]["upsample_kernel_sizes"],
        hps["model"].get("gin_channels", 0),
    ).to(device)
    # Load the safetensors checkpoint into the model
    utils.load_checkpoint(model_path, model, strict=True)
    model.eval()
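

# The original file defines load_model() but never calls it, so `model` and
# `hps` would be undefined when /voice is first hit. A minimal fix (assuming
# the model should load once at process start) is a FastAPI startup hook:
@app.on_event("startup")
def startup() -> None:
    load_model()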
@app.get("/voice")
def synthesize(text: str):
# テキストを音素に変換
sequence = np.array(text_to_sequence(text, hps["data"]["text_cleaners"]), dtype=np.int64)
sequence = torch.LongTensor(sequence).unsqueeze(0).to(device)
# 推論
with torch.no_grad():
audio = model.infer(sequence, noise_scale=0.667, noise_scale_w=0.8, length_scale=1.0)[0][0, 0].data.cpu().numpy()
# 一時WAVファイル保存
output_path = "/tmp/output.wav"
sf.write(output_path, audio, hps["audio"]["sampling_rate"])
return {"audio_path": output_path}