import json
import os

import numpy as np
import soundfile as sf
import torch

from fastapi import FastAPI
from huggingface_hub import hf_hub_download

from src.sbv2 import utils
from src.sbv2.synthesizer_trn import SynthesizerTrn
from src.sbv2.text import text_to_sequence

MODEL_REPO = os.getenv("MODEL_REPO")
HF_TOKEN = os.getenv("HF_TOKEN")
CACHE_DIR = "/tmp/models"
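
# Optional guard (not strictly required): fail fast when the repo id is missing, so a
# misconfigured container errors at import time rather than on the first request.
# HF_TOKEN may legitimately be None for public repos, so only MODEL_REPO is checked.
if MODEL_REPO is None:
    raise RuntimeError("MODEL_REPO environment variable is not set")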

app = FastAPI()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Populated by load_model(); declared here so the module-level names are explicit
model = None
hps = None

def load_model():
    global model, hps

    # Download config.json, model.safetensors, and style_vectors.npy
    # (style_vectors.npy is fetched for style-based inference, even though
    # this minimal endpoint does not use it yet)
    config_path = hf_hub_download(repo_id=MODEL_REPO, filename="config.json", token=HF_TOKEN, cache_dir=CACHE_DIR)
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename="model.safetensors", token=HF_TOKEN, cache_dir=CACHE_DIR)
    style_path = hf_hub_download(repo_id=MODEL_REPO, filename="style_vectors.npy", token=HF_TOKEN, cache_dir=CACHE_DIR)

    # Load the hyperparameter config
    with open(config_path, "r", encoding="utf-8") as f:
        hps = json.load(f)

    n_vocab = 77  # number of symbols for the 小春音アミ (Koharune Ami) model
    segment_size = 8192  # usually a fixed value; recommended for Style-BERT-VITS2

    model = SynthesizerTrn(
        n_vocab,
        hps["model"]["p_dropout"],
        segment_size // 2,
        hps["model"]["inter_channels"],
        hps["model"]["out_channels"],
        hps["model"]["hidden_channels"],
        hps["model"]["filter_channels"],
        hps["model"]["dec_kernel_size"],
        hps["model"]["enc_channels"],
        hps["model"]["enc_out_channels"],
        hps["model"]["enc_kernel_size"],
        hps["model"]["enc_dilation_rate"],
        hps["model"]["enc_n_layers"],
        hps["model"]["flow_hidden_channels"],
        hps["model"]["flow_kernel_size"],
        hps["model"]["flow_n_layers"],
        hps["model"]["flow_n_flows"],
        hps["model"]["sdp_hidden_channels"],
        hps["model"]["sdp_kernel_size"],
        hps["model"]["sdp_n_layers"],
        hps["model"]["sdp_dropout"],
        hps["audio"]["sampling_rate"],
        hps["audio"]["filter_length"],
        hps["audio"]["hop_length"],
        hps["audio"]["win_length"],
        hps["model"]["resblock"],
        hps["model"]["resblock_kernel_sizes"],
        hps["model"]["resblock_dilation_sizes"],
        hps["model"]["upsample_rates"],
        hps["model"]["upsample_initial_channel"],
        hps["model"]["upsample_kernel_sizes"],
        hps["model"].get("gin_channels", 0)
    ).to(device)

    # Load the safetensors weights
    utils.load_checkpoint(model_path, model, strict=True)
    model.eval()

# Load the model once at import time so the /voice endpoint has it available
load_model()

@app.get("/voice")
def synthesize(text: str):
    # Convert the text to a phoneme id sequence
    sequence = np.array(text_to_sequence(text, hps["data"]["text_cleaners"]), dtype=np.int64)
    sequence = torch.LongTensor(sequence).unsqueeze(0).to(device)

    # Run inference
    with torch.no_grad():
        audio = model.infer(sequence, noise_scale=0.667, noise_scale_w=0.8, length_scale=1.0)[0][0, 0].data.cpu().numpy()

    # Save to a temporary WAV file and return its server-side path
    output_path = "/tmp/output.wav"
    sf.write(output_path, audio, hps["audio"]["sampling_rate"])

    return {"audio_path": output_path}