import os

import torch
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

from src.sbv2.commons import get_hparams_from_file
from src.sbv2.synthesizer_trn import SynthesizerTrn
from src.sbv2.text import text_to_sequence

# Read settings from environment variables
MODEL_REPO = os.getenv("MODEL_REPO")
HF_TOKEN = os.getenv("HF_TOKEN")
CACHE_DIR = "/tmp/hf_cache"

# Model and device live at module level so both functions can share them
model = None
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def load_model():
    global model

    # Download the model files from the Hugging Face Hub
    config_path = hf_hub_download(repo_id=MODEL_REPO, filename="config.json", token=HF_TOKEN, cache_dir=CACHE_DIR)
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename="model.safetensors", token=HF_TOKEN, cache_dir=CACHE_DIR)
    # style_path is downloaded but not consumed below (see the note after load_model)
    style_path = hf_hub_download(repo_id=MODEL_REPO, filename="style_vectors.npy", token=HF_TOKEN, cache_dir=CACHE_DIR)

    # Load hyperparameters from config.json
    hps = get_hparams_from_file(config_path)

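    # Keys the config is assumed to provide, inferred from the accesses below
    # (actual Style-Bert-VITS2 configs may differ):
    #   hps["model"]: hidden_channels, filter_channels, n_heads, encoder_n_layers,
    #                 encoder_kernel_size, dropout, resblock_kernel_sizes,
    #                 upsample_rates, upsample_kernel_sizes, gin_channels,
    #                 encoder_hidden, flow_kernel_size, flow_n_layers, flow_n_flows,
    #                 sdp_filter_channels, sdp_kernel_size, sdp_n_layers, sdp_dropout
    #   hps["data"]:  sampling_rate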
    # Initialize the model
    model = SynthesizerTrn(
        n_vocab=70,  # provisional: no symbols module is available, so a typical Japanese TTS vocabulary size is assumed
        spec_channels=hps["model"].get("spec_channels", 80),
        segment_size=None,
        inter_channels=hps["model"]["hidden_channels"],
        hidden_channels=hps["model"]["hidden_channels"],
        filter_channels=hps["model"]["filter_channels"],
        n_heads=hps["model"]["n_heads"],
        n_layers=int(hps["model"]["encoder_n_layers"]),
        kernel_size=hps["model"]["encoder_kernel_size"],
        p_dropout=hps["model"]["dropout"],
        resblock=str(hps["model"].get("resblock", 2)),
        resblock_kernel_sizes=hps["model"]["resblock_kernel_sizes"],
        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5]],
        upsample_rates=hps["model"]["upsample_rates"],
        upsample_initial_channel=512,  # conventionally fixed at 512
        upsample_kernel_sizes=hps["model"]["upsample_kernel_sizes"],
        gin_channels=hps["model"]["gin_channels"],
        out_channels=hps["model"].get("spec_channels", 80),
        dec_kernel_size=hps["model"]["encoder_kernel_size"],
        enc_channels=hps["model"]["encoder_hidden"],
        enc_out_channels=hps["model"]["encoder_hidden"] * 2,
        enc_kernel_size=hps["model"]["encoder_kernel_size"],
        enc_dilation_rate=hps["model"].get("enc_dilation_rate", 1),
        enc_n_layers=int(hps["model"]["encoder_n_layers"]),
        flow_hidden_channels=hps["model"]["hidden_channels"],
        flow_kernel_size=hps["model"]["flow_kernel_size"],
        flow_n_layers=int(hps["model"]["flow_n_layers"]),
        flow_n_flows=int(hps["model"]["flow_n_flows"]),
        sdp_hidden_channels=hps["model"]["sdp_filter_channels"],
        sdp_kernel_size=hps["model"]["sdp_kernel_size"],
        sdp_n_layers=int(hps["model"]["sdp_n_layers"]),
        sdp_dropout=hps["model"]["sdp_dropout"],
        sampling_rate=hps["data"]["sampling_rate"],
        filter_length=1024,
        hop_length=256,
        win_length=1024,
    ).to(device)

    # Load the weights from the safetensors checkpoint
    # (strict=True raises on any missing or unexpected keys; pass strict=False
    # to inspect mismatches instead of failing)
    model_sd = load_file(model_path)
    model.load_state_dict(model_sd, strict=True)
    model.eval()
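

# Note: style_vectors.npy is fetched in load_model() but never used there. A
# hedged sketch of how it could be read (np.load is standard NumPy; whether and
# how SynthesizerTrn would consume a style embedding is an assumption):
#   import numpy as np
#   style_vectors = np.load(style_path)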


def synthesize_voice(text):
    # Run inference: convert the text to a phoneme-ID sequence, then synthesize
    x = torch.LongTensor(text_to_sequence(text, ['basic_cleaners'])).unsqueeze(0).to(device)
    x_lengths = torch.LongTensor([x.size(1)]).to(device)
    sid = torch.LongTensor([0]).to(device)  # speaker ID 0
    with torch.no_grad():
        audio = model.infer(x, x_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1.0)[0][0, 0].cpu().numpy()
    return audio
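

# Minimal usage sketch (an assumption, not part of the original Space; the app
# presumably wires these functions into a web UI). `soundfile` is illustrative,
# and 44100 Hz is a placeholder for hps["data"]["sampling_rate"].
if __name__ == "__main__":
    import soundfile as sf

    load_model()
    audio = synthesize_voice("こんにちは")
    sf.write("output.wav", audio, 44100)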