buchi-stdesign commited on
Commit
1ee91f8
·
verified ·
1 Parent(s): 4de7590

Upload 18 files

Browse files
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python 3.10 runtime image.
FROM python:3.10-slim

WORKDIR /app

# Make /app importable so the absolute `src.sbv2...` imports resolve.
ENV PYTHONPATH=/app

# Install dependencies before copying the sources: this layer is then reused
# from the build cache until requirements.txt itself changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code last so code edits do not invalidate the pip layer.
COPY . .

# Serve the FastAPI app on port 7860 (the port documented in the README).
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,11 +1,72 @@
1
  ---
2
- title: Style Bert Vits2 Fastapi
3
- emoji: 📈
4
- colorFrom: gray
5
- colorTo: indigo
6
  sdk: docker
7
- pinned: false
8
- license: mit
 
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: sbv2-verified-fixed6-real
3
+ emoji: 🗣️
4
+ colorFrom: indigo
5
+ colorTo: pink
6
  sdk: docker
7
+ sdk_version: "1.0"
8
+ app_file: app.py
9
+ pinned: true
10
  ---
11
 
12
+ # Style-Bert-VITS2 (koharune-ami) - FastAPI構成(Strict=True対応)
13
+
14
+ 本構成は、以下のモデルとstrict=Trueで完全一致する構造により、Hugging Face Spacesまたはローカル環境で音声生成APIを動作させるためのテンプレートです。
15
+
16
+ ## 🔗 使用モデル
17
+ - モデル名:`buchi-stdesign/sbv2-koharune-secret`
18
+ - ファイル:
19
+ - `koharune-ami.safetensors`
20
+ - `config.json`
21
+ - `style_vectors.npy`
22
+
23
+ ## ✅ 必要環境
24
+ ```bash
25
+ pip install -r requirements.txt
26
+ ```
27
+ 🚀 起動方法(ローカル / Hugging Face共通)
28
+
29
+ uvicorn app:app --host 0.0.0.0 --port 7860
30
+
31
+ 🎧 使用方法
32
+ 以下のエンドポイントにGETリクエストを送るとWAV音声が返却されます。
33
+
34
+ GET /voice?text=こんにちは
35
+
36
+ 環境変数(Spacesなどで使用する場合)
37
+ MODEL_REPO:Hugging Faceのモデルリポジトリ名(例:buchi-stdesign/sbv2-koharune-secret)
38
+
39
+ HF_TOKEN:アクセストークン(非公開モデル使用時)
40
+
41
+ 🛡 ライセンス・著作権
42
+ 本テンプレートは商用・非商用問わず自由に利用可能ですが、使用モデルの著作権は各モデル提供者に帰属します。
43
+
44
+ koharune-amiの音声モデルは、Style-Bert-VITS2公式ページから取得された学習モデルを基にしています。
45
+
46
+ ライセンス等の明示がある場合は、そちらに従ってください。
47
+
48
+ 📦 フォルダ構成
49
+
50
+ .
51
+ ├── app.py
52
+ ├── inference.py
53
+ ├── requirements.txt
54
+ ├── README.md
55
+ ├── text/
56
+ │ ├── cleaners.py
57
+ │ ├── symbols.py
58
+ │ └── text_to_sequence.py
59
+ ├── src/sbv2/
60
+ │ ├── commons.py
61
+ │ ├── generator.py
62
+ │ ├── monotonic_align.py
63
+ │ ├── synthesizer_trn.py
64
+ │ └── text_encoder.py
65
+
66
+ 📢 注意
67
+ 本構成は strict=True 完全一致 を前提としたテンプレートです。
68
+
69
+ モデルファイルが異なる構造を持つ場合、RuntimeError が発生します。
70
+
71
+
72
+
app.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.responses import StreamingResponse
3
+ from inference import synthesize_voice, load_model
4
+ import io
5
+
6
+ app = FastAPI()
7
+
8
# Load the model once when the server starts.
@app.on_event("startup")
async def startup_event():
    """Startup hook: call ``load_model()`` so the model is ready before requests arrive."""
    load_model()
12
+
13
@app.get("/voice")
async def voice_endpoint(text: str):
    """Synthesize ``text`` and return it as a WAV stream.

    ``synthesize_voice`` returns raw samples (a NumPy array), not an encoded
    file. Streaming those bytes under ``audio/wav`` produces a response with no
    RIFF/WAV header, so the samples are encoded as 16-bit PCM here. If
    ``synthesize_voice`` already returns bytes they are passed through as-is.

    Args:
        text: Text to synthesize (query parameter).

    Returns:
        StreamingResponse with media type ``audio/wav``.
    """
    # Local imports keep this fix self-contained within the handler.
    import wave

    import numpy as np

    import inference

    audio = synthesize_voice(text)

    if isinstance(audio, (bytes, bytearray)):
        wav_bytes = bytes(audio)
    else:
        # The loaded model stores the sampling rate it was built with.
        sample_rate = int(inference.model.sampling_rate)
        # NOTE(review): assumes mono float samples in [-1, 1] — confirm against
        # the model's infer() output once it is implemented.
        samples = np.clip(np.asarray(audio, dtype=np.float32).reshape(-1), -1.0, 1.0)
        pcm16 = (samples * 32767.0).astype("<i2")

        buffer = io.BytesIO()
        with wave.open(buffer, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)  # 16-bit PCM
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(pcm16.tobytes())
        wav_bytes = buffer.getvalue()

    return StreamingResponse(io.BytesIO(wav_bytes), media_type="audio/wav")
inference.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import os
3
+ from huggingface_hub import hf_hub_download
4
+ from src.sbv2.synthesizer_trn import SynthesizerTrn
5
+ from src.sbv2.text import text_to_sequence
6
+ from src.sbv2.commons import get_hparams_from_file
7
+
8
+ # 環境変数から取得
9
+ MODEL_REPO = os.getenv("MODEL_REPO")
10
+ HF_TOKEN = os.getenv("HF_TOKEN")
11
+ CACHE_DIR = "/tmp/hf_cache"
12
+
13
+ # モデルとデバイスをグローバル変数として用意
14
+ model = None
15
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16
+
17
def load_model():
    """Download the checkpoint from the Hugging Face Hub and build the global model.

    Reads ``MODEL_REPO`` / ``HF_TOKEN`` (module constants taken from the
    environment), downloads ``config.json``, ``model.safetensors`` and
    ``style_vectors.npy`` into ``CACHE_DIR``, instantiates ``SynthesizerTrn``
    from the config and loads the weights with ``strict=True``.

    The module-level ``model`` is assigned only after the weights have loaded,
    so a failed load never leaves a randomly initialised model published.

    Raises:
        RuntimeError: if ``MODEL_REPO`` is not set, or (from
            ``load_state_dict``) if the checkpoint keys do not match the model.
        KeyError: if a required key is missing from ``config.json``.
    """
    global model
    from safetensors.torch import load_file

    if not MODEL_REPO:
        raise RuntimeError(
            "MODEL_REPO environment variable is not set; cannot download the model."
        )

    def _fetch(filename):
        # Download one file of the model repo (or reuse the cached copy).
        return hf_hub_download(
            repo_id=MODEL_REPO, filename=filename, token=HF_TOKEN, cache_dir=CACHE_DIR
        )

    config_path = _fetch("config.json")
    model_path = _fetch("model.safetensors")
    # Downloaded for parity with the original behaviour; the style vectors are
    # not consumed anywhere in this function yet.
    _fetch("style_vectors.npy")

    hps = get_hparams_from_file(config_path)
    cfg = hps["model"]

    net = SynthesizerTrn(
        # NOTE(review): provisional hard-coded vocabulary size; it is not
        # derived from the symbol table — confirm against the checkpoint.
        n_vocab=70,
        spec_channels=cfg.get("spec_channels", 80),
        segment_size=None,
        inter_channels=cfg["hidden_channels"],
        hidden_channels=cfg["hidden_channels"],
        filter_channels=cfg["filter_channels"],
        n_heads=cfg["n_heads"],
        n_layers=int(cfg["encoder_n_layers"]),
        kernel_size=cfg["encoder_kernel_size"],
        p_dropout=cfg["dropout"],
        resblock=str(cfg.get("resblock", 2)),
        resblock_kernel_sizes=cfg["resblock_kernel_sizes"],
        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5]],
        upsample_rates=cfg["upsample_rates"],
        upsample_initial_channel=512,  # fixed value, not read from the config
        upsample_kernel_sizes=cfg["upsample_kernel_sizes"],
        gin_channels=cfg["gin_channels"],
        out_channels=cfg.get("spec_channels", 80),
        dec_kernel_size=cfg["encoder_kernel_size"],
        enc_channels=cfg["encoder_hidden"],
        enc_out_channels=cfg["encoder_hidden"] * 2,
        enc_kernel_size=cfg["encoder_kernel_size"],
        enc_dilation_rate=cfg.get("enc_dilation_rate", 1),
        enc_n_layers=int(cfg["encoder_n_layers"]),
        flow_hidden_channels=cfg["hidden_channels"],
        flow_kernel_size=cfg["flow_kernel_size"],
        flow_n_layers=int(cfg["flow_n_layers"]),
        flow_n_flows=int(cfg["flow_n_flows"]),
        sdp_hidden_channels=cfg["sdp_filter_channels"],
        sdp_kernel_size=cfg["sdp_kernel_size"],
        sdp_n_layers=int(cfg["sdp_n_layers"]),
        sdp_dropout=cfg["sdp_dropout"],
        sampling_rate=hps["data"]["sampling_rate"],
        filter_length=1024,
        hop_length=256,
        win_length=1024,
    ).to(device)

    # Load the safetensors weights; strict=True raises on any key mismatch.
    net.load_state_dict(load_file(model_path), strict=True)
    net.eval()

    # Publish the model only once it is fully loaded.
    model = net
72
+
73
def synthesize_voice(text):
    """Run inference for ``text`` and return the generated waveform.

    Args:
        text: Input text; it is cleaned and mapped to symbol ids by
            ``src.sbv2.text.text_to_sequence.text_to_sequence``.

    Returns:
        The ``[0][0, 0]`` slice of ``model.infer(...)`` as a NumPy array.

    Raises:
        RuntimeError: if ``load_model()`` has not populated the global model.
        ValueError: if no character of ``text`` maps to a known symbol.
    """
    # The module-level name ``text_to_sequence`` is bound to the *submodule*
    # (``from src.sbv2.text import text_to_sequence``), and the real function
    # takes a single argument, so import the function itself and call it with
    # the text only; it applies ``basic_cleaners`` internally.
    from src.sbv2.text.text_to_sequence import text_to_sequence as to_sequence

    if model is None:
        raise RuntimeError("Model is not loaded; call load_model() first.")

    token_ids = to_sequence(text)
    if not token_ids:
        raise ValueError("Text contains no symbols known to the model.")

    x = torch.LongTensor(token_ids).unsqueeze(0).to(device)
    x_lengths = torch.LongTensor([x.size(1)]).to(device)
    sid = torch.LongTensor([0]).to(device)  # speaker id 0

    with torch.no_grad():
        output = model.infer(
            x, x_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1.0
        )
        audio = output[0][0, 0].cpu().numpy()
    return audio
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.110.0
2
+ uvicorn==0.29.0
3
+ numpy==1.23.5
4
+ torch==2.0.1
5
+ librosa==0.10.1
6
+ scipy==1.10.1
7
+ soundfile==0.12.1
8
+ huggingface_hub==0.23.1
9
+ safetensors==0.4.2
src/sbv2/commons.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import json
5
+
6
def init_weights(m):
    """Initialise a Conv1d / ConvTranspose1d / Linear module in place.

    Weights get Xavier-uniform initialisation with the ReLU gain and an
    existing bias is zeroed. Any other module type is left untouched.
    """
    supported = (torch.nn.Conv1d, torch.nn.ConvTranspose1d, torch.nn.Linear)
    if not isinstance(m, supported):
        return

    relu_gain = torch.nn.init.calculate_gain('relu')
    torch.nn.init.xavier_uniform_(m.weight, gain=relu_gain)
    if m.bias is not None:
        torch.nn.init.zeros_(m.bias)
19
+
20
def kl_divergence(m_p, logs_p, m_q, logs_q):
    """Element-wise KL(P || Q) between diagonal Gaussians.

    ``logs_*`` are log standard deviations (the variance is ``exp(2 * logs)``).

        KL = logs_q - logs_p - 1/2
             + (exp(2*logs_p) + (m_p - m_q)^2) / (2 * exp(2*logs_q))

    The previous version also halved the ``logs_q - logs_p`` term, which
    yields negative values (e.g. for equal means with sigma_q = 2 * sigma_p).

    Args:
        m_p, logs_p: mean and log-std of P.
        m_q, logs_q: mean and log-std of Q.

    Returns:
        Tensor of element-wise divergences (broadcast shape of the inputs).
    """
    kl = logs_q - logs_p - 0.5
    kl = kl + 0.5 * (torch.exp(2.0 * logs_p) + (m_p - m_q) ** 2) * torch.exp(-2.0 * logs_q)
    return kl
23
+
24
def rand_gumbel(shape):
    """Sample standard Gumbel noise of the given shape (CPU tensor).

    Uses ``-log(-log(u))`` with ``u`` uniform and clamped to
    ``[1e-5, 1 - 1e-5]`` so both logarithms stay finite. ``torch.log`` is used
    instead of ``math.log``: the latter only accepts scalars and raised for
    any tensor with more than one element.

    Args:
        shape: output shape accepted by ``torch.rand``.

    Returns:
        Tensor of Gumbel(0, 1) samples.
    """
    u = torch.rand(shape, device="cpu").clamp(1e-5, 1 - 1e-5)
    return -torch.log(-torch.log(u))
27
+
28
def rand_uniform(shape):
    """Return a CPU tensor of the given shape filled with samples from ``torch.rand``."""
    samples = torch.rand(shape, device="cpu")
    return samples
31
+
32
def rand_logistic(shape):
    """Sample standard logistic noise of the given shape (CPU tensor).

    Uses inverse-CDF sampling, ``log(u) - log(1 - u)`` with ``u`` uniform and
    clamped away from 0 and 1. The previous implementation drew from
    ``RelaxedOneHotCategorical``, which returns points on the probability
    simplex (non-negative, summing to 1 along the last axis) rather than
    logistic samples.

    Args:
        shape: output shape accepted by ``torch.rand``.

    Returns:
        Tensor of Logistic(0, 1) samples.
    """
    u = torch.rand(shape, device="cpu").clamp(1e-5, 1 - 1e-5)
    return torch.log(u) - torch.log1p(-u)
35
+
36
def slice_segments(x, ids_str, segment_size=4):
    """Cut one segment per batch item out of ``x`` along its last axis.

    For item ``i`` the segment starts at ``ids_str[i] * segment_size`` and is
    ``segment_size`` long; the per-item slices ``x[i, :, start:end]`` are
    stacked along a new leading axis.
    """
    starts = [segment_index * segment_size for segment_index in ids_str]
    pieces = [
        x[row, :, begin: begin + segment_size]
        for row, begin in enumerate(starts)
    ]
    return torch.stack(pieces)
43
+
44
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Cut a random window of ``segment_size`` frames from every batch item.

    The previous version drew frame offsets and passed them to
    ``slice_segments``, which multiplies them by ``segment_size`` again; the
    slices then ran past the end of ``x`` and ``torch.stack`` failed on
    mismatched lengths. The random offset is now used directly as the start
    frame. The upper bound is also inclusive of the last valid start
    (``length - segment_size``), so an input exactly ``segment_size`` long works.

    Args:
        x: tensor with three dimensions; the last one is sliced.
        x_lengths: optional integer tensor of valid lengths per batch item;
            when omitted the full last dimension is used for every item.
        segment_size: window length in frames.

    Returns:
        Tensor of stacked windows, one per batch item.
    """
    batch, _, total = x.size()
    if x_lengths is None:
        # At least one valid start (0), even if the input is shorter than a window.
        n_starts = max(total - segment_size + 1, 1)
        ids_str = torch.randint(0, n_starts, (batch,), device=x.device)
    else:
        n_starts = torch.clamp(x_lengths.to(x.device) - segment_size + 1, min=1)
        ids_str = (torch.rand(batch, device=x.device) * n_starts).long()

    windows = [
        x[row, :, begin: begin + segment_size]
        for row, begin in enumerate(ids_str.tolist())
    ]
    return torch.stack(windows)
52
+
53
def get_hparams_from_file(config_path):
    """Parse the UTF-8 JSON file at ``config_path`` and return the decoded object."""
    with open(config_path, "r", encoding="utf-8") as handle:
        return json.load(handle)
src/sbv2/duration_predictor.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer over the channel axis.

    The input is split into two halves along dim 1. The first half passes
    through unchanged and drives a small conv stack that predicts a shift
    ``m`` and log-scale ``logs`` for the second half.
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers

        half_kernel = (kernel_size - 1) // 2
        dilations = [dilation_rate ** depth for depth in range(n_layers)]

        self.pre = nn.Conv1d(channels // 2, hidden_channels, 1)
        self.convs = nn.ModuleList([
            nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
                      padding=half_kernel * d, dilation=d)
            for d in dilations
        ])
        self.post = nn.Conv1d(hidden_channels, channels, 1)

    def forward(self, x, reverse=False):
        """Apply the coupling (or its inverse when ``reverse`` is true)."""
        x0, x1 = torch.chunk(x, 2, dim=1)

        hidden = self.pre(x0)
        for conv in self.convs:
            hidden = F.relu(conv(hidden))
        m, logs = torch.chunk(self.post(hidden), 2, dim=1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs)
        else:
            x1 = m + x1 * torch.exp(logs)
        return torch.cat([x0, x1], dim=1)
37
+
38
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` coupling layers, each followed by a ``Flip``."""

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows):
        super().__init__()
        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers)
            self.flows.extend([coupling, Flip()])

    def forward(self, x, reverse=False):
        """Run the flows in order, or in reverse order with ``reverse=True`` passed to each."""
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, reverse=True)
            return x

        for flow in self.flows:
            x = flow(x)
        return x
54
+
55
class Flip(nn.Module):
    """Parameter-free module that reverses the order of dim 1."""

    def __init__(self):
        super().__init__()

    def forward(self, x, reverse=False):
        """Return ``x`` flipped along dim 1; ``reverse`` does not change the result."""
        return torch.flip(x, dims=[1])
61
+
62
class PosteriorEncoder(nn.Module):
    """Conv stack that predicts ``m`` and ``logs`` and draws ``z = m + eps * exp(logs)``."""

    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers):
        super().__init__()
        half_kernel = (kernel_size - 1) // 2

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.convs = nn.ModuleList([
            nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
                      padding=half_kernel * (dilation_rate ** depth),
                      dilation=dilation_rate ** depth)
            for depth in range(n_layers)
        ])
        self.proj_mean = nn.Conv1d(hidden_channels, out_channels, 1)
        self.proj_logvar = nn.Conv1d(hidden_channels, out_channels, 1)

    def forward(self, x, x_lengths):
        """Return ``(z, m, logs)``; ``x_lengths`` is accepted but not used."""
        hidden = self.pre(x)
        for conv in self.convs:
            hidden = F.relu(conv(hidden))

        m = self.proj_mean(hidden)
        logs = self.proj_logvar(hidden)
        return m + torch.randn_like(m) * torch.exp(logs), m, logs

    def infer(self, z, z_lengths):
        """Return ``z`` unchanged."""
        return z
86
+
87
class DurationPredictor(nn.Module):
    """Two conv + ReLU + dropout stages followed by a 1x1 projection to one channel."""

    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout):
        super().__init__()
        same_pad = (kernel_size - 1) // 2
        self.conv1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=same_pad)
        self.conv2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=same_pad)
        self.proj = nn.Conv1d(filter_channels, 1, 1)
        self.dropout = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Return the masked projection with its channel axis (dim 1) squeezed out."""
        hidden = x
        for conv in (self.conv1, self.conv2):
            hidden = self.dropout(torch.relu(conv(hidden)))
        masked = self.proj(hidden) * x_mask
        return masked.squeeze(1)
src/sbv2/flow.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch.nn import Conv1d
5
+
6
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer over the channel axis.

    The first half of the channels is passed through and conditions a conv
    stack that outputs a shift ``m`` and log-scale ``logs`` applied to the
    second half.
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers

        dilations = [dilation_rate ** depth for depth in range(n_layers)]

        self.pre = nn.Conv1d(channels // 2, hidden_channels, 1)
        self.convs = nn.ModuleList([
            nn.Conv1d(
                hidden_channels,
                hidden_channels,
                kernel_size,
                padding=(kernel_size - 1) * d // 2,
                dilation=d,
            )
            for d in dilations
        ])
        self.proj = nn.Conv1d(hidden_channels, channels, 1)

    def forward(self, x, reverse=False):
        """Apply the coupling (or its inverse when ``reverse`` is true)."""
        x0, x1 = torch.chunk(x, 2, 1)

        hidden = self.pre(x0)
        for conv in self.convs:
            hidden = F.relu(conv(hidden))
        m, logs = torch.chunk(self.proj(hidden), 2, 1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs)
        else:
            x1 = m + x1 * torch.exp(logs)
        return torch.cat([x0, x1], 1)
44
+
45
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` identically configured coupling layers."""

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows):
        super().__init__()
        layer_config = dict(
            channels=channels,
            hidden_channels=hidden_channels,
            kernel_size=kernel_size,
            dilation_rate=dilation_rate,
            n_layers=n_layers,
        )
        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            self.flows.append(ResidualCouplingLayer(**layer_config))

    def forward(self, x, reverse=False):
        """Run the layers in order, or in reverse order with each layer inverted."""
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, reverse=True)
            return x

        for flow in self.flows:
            x = flow(x, reverse=False)
        return x
68
+
69
class Flip(nn.Module):
    """Parameter-free module that reverses the order of dim 1."""

    def __init__(self):
        super().__init__()

    def forward(self, x, reverse=False):
        """Return ``x`` flipped along dim 1.

        Flipping is its own inverse, so the same operation is applied for
        both values of ``reverse``.
        """
        return x.flip(1)
src/sbv2/generator.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+ from .modules import LayerNorm, ConvReluNorm
5
+
6
class Generator(nn.Module):
    """Conv1d -> three residual blocks -> Conv1d -> tanh, producing one output channel."""

    def __init__(self, channels):
        super().__init__()
        width = 512
        self.conv_pre = nn.Conv1d(channels, width, 7, 1, 3)
        self.resblocks = nn.ModuleList([ResBlock(width) for _ in range(3)])
        self.conv_post = nn.Conv1d(width, 1, 7, 1, 3)

    def forward(self, x):
        """Map ``x`` through the stack and squash the result with ``tanh``."""
        hidden = self.conv_pre(x)
        for block in self.resblocks:
            hidden = block(hidden)
        return torch.tanh(self.conv_post(hidden))
22
+
23
class ResBlock(nn.Module):
    """Residual block: ``x + Conv1d(ReLU(Conv1d(x)))`` with kernel size 3 and padding 1."""

    def __init__(self, channels):
        super().__init__()
        first = nn.Conv1d(channels, channels, 3, 1, 1)
        second = nn.Conv1d(channels, channels, 3, 1, 1)
        self.convs = nn.Sequential(first, nn.ReLU(), second)

    def forward(self, x):
        """Return the input plus the output of the conv branch."""
        residual = self.convs(x)
        return x + residual
src/sbv2/modules.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+
6
class LayerNorm(nn.Module):
    """``nn.LayerNorm`` applied over dim 1 by transposing dims 1 and 2 around the call."""

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.ln = nn.LayerNorm(channels, eps=eps)

    def forward(self, x):
        """Normalise ``x`` over dim 1 and return it in the original layout."""
        channels_last = x.transpose(1, 2)
        return self.ln(channels_last).transpose(1, 2)
16
+
17
+
18
class ConvReluNorm(nn.Module):
    """Sequential stack of Conv1d -> LayerNorm -> ReLU stages ending in a plain Conv1d.

    The stack has one input stage, ``n_layers - 2`` hidden stages and a final
    convolution to ``out_channels``; every convolution pads by
    ``kernel_size // 2``.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, bias):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.bias = bias

        pad = kernel_size // 2
        # Input width of each normalised stage: the first one reads
        # in_channels, every later one reads hidden_channels.
        stage_inputs = [in_channels] + [hidden_channels] * max(n_layers - 2, 0)

        layers = []
        for width_in in stage_inputs:
            layers.append(nn.Conv1d(width_in, hidden_channels, kernel_size, padding=pad, bias=bias))
            layers.append(LayerNorm(hidden_channels))
            layers.append(nn.ReLU())
        layers.append(nn.Conv1d(hidden_channels, out_channels, kernel_size, padding=pad, bias=bias))

        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack."""
        return self.main(x)
src/sbv2/monotonic_align.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+
4
+
5
@torch.jit.script
def maximum_path(soft_attention, mask):
    """
    Build a 0/1 path through a score grid by dynamic programming and backtracking.

    :param soft_attention: [b, t_x, t_y] per-cell scores
    :param mask: [b, t_x, t_y] multiplied into the result
    :return: attn: [b, t_x, t_y] path indicator multiplied by ``mask``
    """
    b, t_x, t_y = soft_attention.size()
    device = soft_attention.device

    # Accumulated score table: the first row and first column are plain
    # cumulative sums of the scores along that edge.
    log_p = torch.zeros(b, t_x, t_y).to(device)
    log_p[:, 0, :] = torch.cumsum(soft_attention[:, 0, :], dim=1)
    log_p[:, :, 0] = torch.cumsum(soft_attention[:, :, 0], dim=1)

    # Interior cells: best of the cell above (i - 1) and the cell to the
    # left (j - 1), plus the cell's own score.
    for i in range(1, t_x):
        for j in range(1, t_y):
            max_prev = torch.max(log_p[:, i - 1, j], log_p[:, i, j - 1])
            log_p[:, i, j] = max_prev + soft_attention[:, i, j]

    # Backtrack per batch item from the bottom-right corner, stepping towards
    # whichever predecessor has the larger accumulated score.
    path = torch.zeros_like(soft_attention)
    for b_idx in range(b):
        i = t_x - 1
        j = t_y - 1
        while i > 0 and j > 0:
            path[b_idx, i, j] = 1
            if log_p[b_idx, i - 1, j] > log_p[b_idx, i, j - 1]:
                i -= 1
            else:
                j -= 1
        # Mark the cell where the walk stopped (first row or first column).
        # NOTE(review): the loop ends as soon as either index reaches 0, so
        # the remaining cells along that edge back to (0, 0) are not marked —
        # confirm this is intended.
        path[b_idx, i, j] = 1

    return path * mask
src/sbv2/posterior_encoder.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer over the channel axis.

    The input is split into two halves along dim 1. The first half passes
    through unchanged and drives a conv stack that predicts a shift ``m`` and
    log-scale ``logs`` for the second half.

    The constructor previously declared ``spec_channels``, ``inter_channels``,
    ``enc_dilation_rate`` and ``p_dropout`` while its body used the undefined
    names ``channels`` and ``dilation_rate``, so every instantiation raised.
    The signature now matches what the body uses and what
    ``ResidualCouplingBlock`` in this module passes (five positional values).

    Args:
        channels: number of input channels (split in half along dim 1).
        hidden_channels: width of the internal conv stack.
        kernel_size: kernel size of the dilated convolutions.
        dilation_rate: base of the per-layer dilation ``dilation_rate ** i``.
        n_layers: number of dilated convolutions.
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers

        self.pre = nn.Conv1d(channels // 2, hidden_channels, 1)
        self.convs = nn.ModuleList()
        for i in range(n_layers):
            dilation = dilation_rate ** i
            self.convs.append(
                nn.Conv1d(
                    hidden_channels,
                    hidden_channels,
                    kernel_size,
                    padding=(kernel_size - 1) // 2 * dilation,
                    dilation=dilation,
                )
            )
        self.post = nn.Conv1d(hidden_channels, channels, 1)

    def forward(self, x, reverse=False):
        """Apply the coupling, or its exact inverse when ``reverse`` is true."""
        x0, x1 = torch.chunk(x, 2, dim=1)
        h = self.pre(x0)
        for conv in self.convs:
            h = F.relu(conv(h))
        m, logs = torch.chunk(self.post(h), 2, dim=1)

        if not reverse:
            x1 = m + x1 * torch.exp(logs)
        else:
            x1 = (x1 - m) * torch.exp(-logs)

        return torch.cat([x0, x1], dim=1)
37
+
38
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` coupling layers, each followed by a ``Flip``."""

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows):
        super().__init__()
        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers)
            self.flows.extend([coupling, Flip()])

    def forward(self, x, reverse=False):
        """Run the flows in order, or in reverse order with ``reverse=True`` passed to each."""
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, reverse=True)
            return x

        for flow in self.flows:
            x = flow(x)
        return x
54
+
55
class Flip(nn.Module):
    """Parameter-free module that reverses the order of dim 1."""

    def __init__(self):
        super().__init__()

    def forward(self, x, reverse=False):
        """Return ``x`` flipped along dim 1; ``reverse`` does not change the result."""
        return torch.flip(x, dims=[1])
61
+
62
class PosteriorEncoder(nn.Module):
    """Conv stack that predicts ``m`` and ``logs`` and draws ``z = m + eps * exp(logs)``."""

    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers):
        super().__init__()
        half_kernel = (kernel_size - 1) // 2

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.convs = nn.ModuleList([
            nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
                      padding=half_kernel * (dilation_rate ** depth),
                      dilation=dilation_rate ** depth)
            for depth in range(n_layers)
        ])
        self.proj_mean = nn.Conv1d(hidden_channels, out_channels, 1)
        self.proj_logvar = nn.Conv1d(hidden_channels, out_channels, 1)

    def forward(self, x, x_lengths):
        """Return ``(z, m, logs)``; ``x_lengths`` is accepted but not used."""
        hidden = self.pre(x)
        for conv in self.convs:
            hidden = F.relu(conv(hidden))

        m = self.proj_mean(hidden)
        logs = self.proj_logvar(hidden)
        return m + torch.randn_like(m) * torch.exp(logs), m, logs

    def infer(self, z, z_lengths):
        """Return ``z`` unchanged."""
        return z
src/sbv2/stochastic_duration_predictor.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer over the channel axis.

    The input is split into two halves along dim 1. The first half passes
    through unchanged and drives a small conv stack that predicts a shift
    ``m`` and log-scale ``logs`` for the second half.
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers

        half_kernel = (kernel_size - 1) // 2
        dilations = [dilation_rate ** depth for depth in range(n_layers)]

        self.pre = nn.Conv1d(channels // 2, hidden_channels, 1)
        self.convs = nn.ModuleList([
            nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
                      padding=half_kernel * d, dilation=d)
            for d in dilations
        ])
        self.post = nn.Conv1d(hidden_channels, channels, 1)

    def forward(self, x, reverse=False):
        """Apply the coupling (or its inverse when ``reverse`` is true)."""
        x0, x1 = torch.chunk(x, 2, dim=1)

        hidden = self.pre(x0)
        for conv in self.convs:
            hidden = F.relu(conv(hidden))
        m, logs = torch.chunk(self.post(hidden), 2, dim=1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs)
        else:
            x1 = m + x1 * torch.exp(logs)
        return torch.cat([x0, x1], dim=1)
37
+
38
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` coupling layers, each followed by a ``Flip``."""

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows):
        super().__init__()
        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers)
            self.flows.extend([coupling, Flip()])

    def forward(self, x, reverse=False):
        """Run the flows in order, or in reverse order with ``reverse=True`` passed to each."""
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, reverse=True)
            return x

        for flow in self.flows:
            x = flow(x)
        return x
54
+
55
class Flip(nn.Module):
    """Parameter-free module that reverses the order of dim 1."""

    def __init__(self):
        super().__init__()

    def forward(self, x, reverse=False):
        """Return ``x`` flipped along dim 1; ``reverse`` does not change the result."""
        return torch.flip(x, dims=[1])
61
+
62
class PosteriorEncoder(nn.Module):
    """Conv stack that predicts ``m`` and ``logs`` and draws ``z = m + eps * exp(logs)``."""

    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers):
        super().__init__()
        half_kernel = (kernel_size - 1) // 2

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.convs = nn.ModuleList([
            nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
                      padding=half_kernel * (dilation_rate ** depth),
                      dilation=dilation_rate ** depth)
            for depth in range(n_layers)
        ])
        self.proj_mean = nn.Conv1d(hidden_channels, out_channels, 1)
        self.proj_logvar = nn.Conv1d(hidden_channels, out_channels, 1)

    def forward(self, x, x_lengths):
        """Return ``(z, m, logs)``; ``x_lengths`` is accepted but not used."""
        hidden = self.pre(x)
        for conv in self.convs:
            hidden = F.relu(conv(hidden))

        m = self.proj_mean(hidden)
        logs = self.proj_logvar(hidden)
        return m + torch.randn_like(m) * torch.exp(logs), m, logs

    def infer(self, z, z_lengths):
        """Return ``z`` unchanged."""
        return z
86
+
87
class DurationPredictor(nn.Module):
    """Two conv + ReLU + dropout stages followed by a 1x1 projection to one channel."""

    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout):
        super().__init__()
        same_pad = (kernel_size - 1) // 2
        self.conv1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=same_pad)
        self.conv2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=same_pad)
        self.proj = nn.Conv1d(filter_channels, 1, 1)
        self.dropout = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Return the masked projection with its channel axis (dim 1) squeezed out."""
        hidden = x
        for conv in (self.conv1, self.conv2):
            hidden = self.dropout(torch.relu(conv(hidden)))
        masked = self.proj(hidden) * x_mask
        return masked.squeeze(1)
105
+
106
class StochasticDurationPredictor(nn.Module):
    """Duration head built from a 1x1 conv, a coupling-flow block and a 1x1 projection.

    ``forward`` returns a scalar MSE loss against ``durations``; ``infer``
    returns the masked prediction.
    """

    def __init__(self, channels, filter_channels, kernel_size, n_flows, p_dropout):
        super().__init__()
        self.pre = nn.Conv1d(channels, filter_channels, 1)
        self.flows = ResidualCouplingBlock(filter_channels, filter_channels, kernel_size, 1, 4, n_flows)
        self.post = nn.Conv1d(filter_channels, 1, 1)
        self.dropout = nn.Dropout(p_dropout)

    def _encode(self, x):
        # Shared front end: 1x1 conv -> ReLU -> dropout.
        return self.dropout(F.relu(self.pre(x)))

    def _project(self, hidden, x_mask):
        # Shared back end: project to one channel, squeeze it out and mask.
        return self.post(hidden).squeeze(1) * x_mask.squeeze(1)

    def forward(self, x, x_mask, durations):
        """Return the mean squared error between the prediction and ``durations``.

        NOTE(review): the encoded input is used only as the shape/dtype
        template for the noise; the flows run on that noise, so the loss does
        not depend on the values of ``x`` — confirm this is intended.
        """
        encoded = self._encode(x)
        noise = torch.randn_like(encoded)
        latent = self.flows(noise, reverse=False)
        prediction = self._project(latent, x_mask)
        return F.mse_loss(prediction, durations, reduction="none").mean()

    def infer(self, x, x_mask):
        """Return the masked prediction obtained by running the flows in reverse on the encoded input."""
        latent = self.flows(self._encode(x), reverse=True)
        return self._project(latent, x_mask)
src/sbv2/synthesizer_trn.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 完全新規構成 synthesize_trn.py
2
+ # (config.jsonにstrict対応)
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from src.sbv2.generator import Generator
8
+ from src.sbv2.posterior_encoder import PosteriorEncoder
9
+ from src.sbv2.flow import ResidualCouplingBlock
10
+ from src.sbv2.flow import Flip
11
+ from src.sbv2.duration_predictor import DurationPredictor
12
+ from src.sbv2.stochastic_duration_predictor import StochasticDurationPredictor
13
+
14
class SynthesizerTrn(nn.Module):
    """Container that stores the hyper-parameters and builds the sub-networks.

    Every constructor argument is kept as an attribute of the same name.
    ``forward`` and ``infer`` are placeholders that raise
    ``NotImplementedError``.

    Three sub-module constructions did not match the constructors visible in
    this package and raised ``TypeError`` on instantiation:

    * ``Generator`` takes one argument (input channels) but received two;
    * ``flow.ResidualCouplingBlock`` takes six arguments but received four
      (``dilation_rate`` and ``n_flows`` were missing);
    * ``StochasticDurationPredictor`` takes five arguments but received four
      (``n_flows`` was missing, so ``p_dropout`` landed in its slot).

    The calls below supply the missing arguments by keyword.
    """

    def __init__(self,
                 n_vocab,
                 spec_channels,
                 inter_channels,
                 hidden_channels,
                 filter_channels,
                 n_heads,
                 n_layers,
                 kernel_size,
                 p_dropout,
                 resblock,
                 resblock_kernel_sizes,
                 resblock_dilation_sizes,
                 upsample_rates,
                 upsample_initial_channel,
                 upsample_kernel_sizes,
                 segment_size,
                 gin_channels,
                 out_channels,
                 dec_kernel_size,
                 enc_channels,
                 enc_out_channels,
                 enc_kernel_size,
                 enc_dilation_rate,
                 enc_n_layers,
                 flow_hidden_channels,
                 flow_kernel_size,
                 flow_n_layers,
                 flow_n_flows,
                 sdp_hidden_channels,
                 sdp_kernel_size,
                 sdp_n_layers,
                 sdp_dropout,
                 sampling_rate,
                 filter_length,
                 hop_length,
                 win_length):
        super().__init__()

        self.n_vocab = n_vocab
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.out_channels = out_channels
        self.dec_kernel_size = dec_kernel_size
        self.enc_channels = enc_channels
        self.enc_out_channels = enc_out_channels
        self.enc_kernel_size = enc_kernel_size
        self.enc_dilation_rate = enc_dilation_rate
        self.enc_n_layers = enc_n_layers
        self.flow_hidden_channels = flow_hidden_channels
        self.flow_kernel_size = flow_kernel_size
        self.flow_n_layers = flow_n_layers
        self.flow_n_flows = flow_n_flows
        self.sdp_hidden_channels = sdp_hidden_channels
        self.sdp_kernel_size = sdp_kernel_size
        self.sdp_n_layers = sdp_n_layers
        self.sdp_dropout = sdp_dropout
        self.sampling_rate = sampling_rate
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length

        # Network modules.
        self.enc_p = PosteriorEncoder(
            spec_channels, inter_channels, hidden_channels,
            kernel_size, enc_dilation_rate, int(enc_n_layers))

        # Generator only accepts its input channel count.
        # NOTE(review): assumes the decoder consumes inter_channels-wide
        # latents; the upsample_* settings are not supported by this
        # Generator — confirm against the checkpoint layout.
        self.decoder = Generator(inter_channels)

        # NOTE(review): dilation_rate=1 is an assumption; the other values are
        # the flow_* hyper-parameters that were already passed in.
        self.flow = ResidualCouplingBlock(
            channels=inter_channels,
            hidden_channels=flow_hidden_channels,
            kernel_size=flow_kernel_size,
            dilation_rate=1,
            n_layers=int(flow_n_layers),
            n_flows=int(flow_n_flows))
        self.flow_post = Flip()

        self.dp = DurationPredictor(
            inter_channels, filter_channels, kernel_size, p_dropout)

        # NOTE(review): n_flows reuses flow_n_flows because no dedicated
        # hyper-parameter for it exists in the signature — confirm the
        # intended value.
        self.sdp = StochasticDurationPredictor(
            channels=inter_channels,
            filter_channels=filter_channels,
            kernel_size=kernel_size,
            n_flows=int(flow_n_flows),
            p_dropout=p_dropout)

    def forward(self, *args, **kwargs):
        """Training forward pass (not implemented)."""
        raise NotImplementedError("Training用 forwardは未実装です")

    def infer(self, *args, **kwargs):
        """Inference pass (not implemented)."""
        raise NotImplementedError("推論用 inferは未実装です")
src/sbv2/text/cleaners.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import unicodedata
3
+
4
+ _japanese_replacements = [
5
+ ("‐", "-"),
6
+ ("―", "-"),
7
+ ("−", "-"),
8
+ ("ー", "ー"),
9
+ ("・", "、"),
10
+ (",", "、"),
11
+ (",", "、"),
12
+ (".", "。"),
13
+ (".", "。"),
14
+ ]
15
+
16
+ def japanese_cleaners(text):
17
+ text = unicodedata.normalize("NFKC", text)
18
+ for pattern, replacement in _japanese_replacements:
19
+ text = text.replace(pattern, replacement)
20
+ text = re.sub(r"[()!?!?\[\]{}]", "", text) # 特殊記号除去
21
+ return text
22
+
23
+ # ⭐ ここを追加する!
24
+ basic_cleaners = japanese_cleaners
src/sbv2/text/symbols.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# Padding symbol; it is first in the table, so it receives index 0.
_pad = "_"
# Punctuation characters included in the vocabulary.
_punctuation = ",.!?-…~"
# ASCII upper- and lower-case letters.
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
# Basic hiragana and katakana plus the long-vowel mark; voiced and small kana
# and kanji are not part of this string.
_japanese = "あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよらりるれろわをんアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヲンー"
# ASCII digits.
_extra = "0123456789"

# Full symbol table: one entry per character, in the concatenation order above.
symbols = list(_pad + _punctuation + _letters + _japanese + _extra)
src/sbv2/text/text_to_sequence.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # text_to_sequence.py
2
+
3
+ from src.sbv2.text_encoder import text_to_sequence as encoder_text_to_sequence
4
+ from src.sbv2.text.cleaners import basic_cleaners
5
+
6
def text_to_sequence(text):
    """Clean ``text`` with ``basic_cleaners`` and encode the result with the text encoder."""
    return encoder_text_to_sequence(basic_cleaners(text))
src/sbv2/text_encoder.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # text_encoder.py
2
+
3
+ from src.sbv2.text.symbols import symbols
4
+
5
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
6
+ _id_to_symbol = {i: s for i, s in enumerate(symbols)}
7
+
8
def text_to_sequence(text):
    """Map each character of ``text`` to its symbol id, skipping characters not in the table."""
    lookup = _symbol_to_id.get
    candidate_ids = (lookup(char) for char in text)
    return [symbol_id for symbol_id in candidate_ids if symbol_id is not None]
11
+
12
def sequence_to_text(sequence):
    """Join the symbols for the ids in ``sequence``, skipping ids not in the table."""
    pieces = []
    for symbol_id in sequence:
        if symbol_id in _id_to_symbol:
            pieces.append(_id_to_symbol[symbol_id])
    return ''.join(pieces)
14
+