Spaces:
Runtime error
Runtime error
Upload 18 files
Browse files- Dockerfile +11 -0
- README.md +68 -7
- app.py +16 -0
- inference.py +81 -0
- requirements.txt +9 -0
- src/sbv2/commons.py +58 -0
- src/sbv2/duration_predictor.py +104 -0
- src/sbv2/flow.py +77 -0
- src/sbv2/generator.py +33 -0
- src/sbv2/modules.py +42 -0
- src/sbv2/monotonic_align.py +36 -0
- src/sbv2/posterior_encoder.py +85 -0
- src/sbv2/stochastic_duration_predictor.py +132 -0
- src/sbv2/synthesizer_trn.py +109 -0
- src/sbv2/text/cleaners.py +24 -0
- src/sbv2/text/symbols.py +7 -0
- src/sbv2/text/text_to_sequence.py +8 -0
- src/sbv2/text_encoder.py +14 -0
Dockerfile
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.10-slim

WORKDIR /app

ENV PYTHONPATH=/app

# Install dependencies before copying the source tree so the pip layer is
# reused from the build cache whenever only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
@@ -1,11 +1,72 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: docker
|
7 |
-
|
8 |
-
|
|
|
9 |
---
|
10 |
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: sbv2-verified-fixed6-real
|
3 |
+
emoji: 🗣️
|
4 |
+
colorFrom: indigo
|
5 |
+
colorTo: pink
|
6 |
sdk: docker
|
7 |
+
sdk_version: "1.0"
|
8 |
+
app_file: app.py
|
9 |
+
pinned: true
|
10 |
---
|
11 |
|
12 |
+
# Style-Bert-VITS2 (koharune-ami) - FastAPI構成(Strict=True対応)
|
13 |
+
|
14 |
+
本構成は、以下のモデルとstrict=Trueで完全一致する構造により、Hugging Face Spacesまたはローカル環境で音声生成APIを動作させるためのテンプレートです。
|
15 |
+
|
16 |
+
## 🔗 使用モデル
|
17 |
+
- モデル名:`buchi-stdesign/sbv2-koharune-secret`
|
18 |
+
- ファイル:
|
19 |
+
- `koharune-ami.safetensors`
|
20 |
+
- `config.json`
|
21 |
+
- `style_vectors.npy`
|
22 |
+
|
23 |
+
## ✅ 必要環境
|
24 |
+
```bash
|
25 |
+
pip install -r requirements.txt
```
|
26 |
+
|
27 |
+
🚀 起動方法(ローカル / Hugging Face共通)
|
28 |
+
|
29 |
+
uvicorn app:app --host 0.0.0.0 --port 7860
|
30 |
+
|
31 |
+
🎧 使用方法
|
32 |
+
以下のエンドポイントにGETリクエストを送るとWAV音声が返却されます。
|
33 |
+
|
34 |
+
GET /voice?text=こんにちは
|
35 |
+
|
36 |
+
環境変数(Spacesなどで使用する場合)
|
37 |
+
MODEL_REPO:Hugging Faceのモデルリポジトリ名(例:buchi-stdesign/sbv2-koharune-secret)
|
38 |
+
|
39 |
+
HF_TOKEN:アクセストークン(非公開モデル使用時)
|
40 |
+
|
41 |
+
🛡 ライセンス・著作権
|
42 |
+
本テンプレートは商用・非商用問わず自由に利用可能ですが、使用モデルの著作権は各モデル提供者に帰属します。
|
43 |
+
|
44 |
+
koharune-amiの音声モデルは、Style-Bert-VITS2公式ページから取得された学習モデルを基にしています。
|
45 |
+
|
46 |
+
ライセンス等の明示がある場合は、そちらに従ってください。
|
47 |
+
|
48 |
+
📦 フォルダ構成
|
49 |
+
|
50 |
+
.
|
51 |
+
├── app.py
|
52 |
+
├── inference.py
|
53 |
+
├── requirements.txt
|
54 |
+
├── README.md
|
55 |
+
├── text/
|
56 |
+
│ ├── cleaners.py
|
57 |
+
│ ├── symbols.py
|
58 |
+
│ └── text_to_sequence.py
|
59 |
+
├── src/sbv2/
|
60 |
+
│ ├── commons.py
|
61 |
+
│ ├── generator.py
|
62 |
+
│ ├── monotonic_align.py
|
63 |
+
│ ├── synthesizer_trn.py
|
64 |
+
│ └── text_encoder.py
|
65 |
+
|
66 |
+
📢 注意
|
67 |
+
本構成は strict=True 完全一致 を前提としたテンプレートです。
|
68 |
+
|
69 |
+
モデルファイルが異なる構造を持つ場合、RuntimeError が発生します。
|
70 |
+
|
71 |
+
|
72 |
+
|
app.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI
|
2 |
+
from fastapi.responses import StreamingResponse
|
3 |
+
from inference import synthesize_voice, load_model
|
4 |
+
import io
|
5 |
+
|
6 |
+
app = FastAPI()
|
7 |
+
|
8 |
+
# 🛠 サーバ起動時にモデルをロードする
|
9 |
+
@app.on_event("startup")
async def startup_event():
    """Load the model once, when the server starts up."""
    load_model()
|
12 |
+
|
13 |
+
@app.get("/voice")
async def voice_endpoint(text: str):
    """Synthesize ``text`` and stream the result back as ``audio/wav``.

    NOTE(review): the payload is whatever ``synthesize_voice`` returns, wrapped
    in ``BytesIO`` unchanged -- no WAV encoding happens here. Confirm that the
    returned object really is an encoded WAV container.
    """
    payload = io.BytesIO(synthesize_voice(text))
    return StreamingResponse(payload, media_type="audio/wav")
|
inference.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import os
|
3 |
+
from huggingface_hub import hf_hub_download
|
4 |
+
from src.sbv2.synthesizer_trn import SynthesizerTrn
|
5 |
+
from src.sbv2.text import text_to_sequence
|
6 |
+
from src.sbv2.commons import get_hparams_from_file
|
7 |
+
|
8 |
+
# 環境変数から取得
|
9 |
+
MODEL_REPO = os.getenv("MODEL_REPO")
|
10 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
11 |
+
CACHE_DIR = "/tmp/hf_cache"
|
12 |
+
|
13 |
+
# モデルとデバイスをグローバル変数として用意
|
14 |
+
model = None
|
15 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
16 |
+
|
17 |
+
def load_model():
    """Download the checkpoint files and build the module-level ``model``.

    Fetches ``config.json``, ``model.safetensors`` and ``style_vectors.npy``
    from the Hugging Face repository named by ``MODEL_REPO`` (authenticating
    with ``HF_TOKEN``), constructs a ``SynthesizerTrn`` from the config, loads
    the weights with ``strict=True`` and switches the model to eval mode.

    Raises:
        RuntimeError: if ``MODEL_REPO`` is not set.
    """
    global model

    # Fail early with a clear message instead of passing ``None`` as repo_id.
    if not MODEL_REPO:
        raise RuntimeError(
            "MODEL_REPO environment variable is not set; cannot download the model."
        )

    # Download the model files from Hugging Face.
    config_path = hf_hub_download(repo_id=MODEL_REPO, filename="config.json", token=HF_TOKEN, cache_dir=CACHE_DIR)
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename="model.safetensors", token=HF_TOKEN, cache_dir=CACHE_DIR)
    # Fetched into the cache only; nothing in this function reads the file.
    hf_hub_download(repo_id=MODEL_REPO, filename="style_vectors.npy", token=HF_TOKEN, cache_dir=CACHE_DIR)

    # Load the config.
    hps = get_hparams_from_file(config_path)
    cfg = hps["model"]

    # Build the model.
    model = SynthesizerTrn(
        n_vocab=70,  # provisional value: no symbol table is consulted here
        spec_channels=cfg.get("spec_channels", 80),
        segment_size=None,
        inter_channels=cfg["hidden_channels"],
        hidden_channels=cfg["hidden_channels"],
        filter_channels=cfg["filter_channels"],
        n_heads=cfg["n_heads"],
        n_layers=int(cfg["encoder_n_layers"]),
        kernel_size=cfg["encoder_kernel_size"],
        p_dropout=cfg["dropout"],
        resblock=str(cfg.get("resblock", 2)),
        resblock_kernel_sizes=cfg["resblock_kernel_sizes"],
        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5]],
        upsample_rates=cfg["upsample_rates"],
        upsample_initial_channel=512,  # hard-coded rather than read from the config
        upsample_kernel_sizes=cfg["upsample_kernel_sizes"],
        gin_channels=cfg["gin_channels"],
        out_channels=cfg.get("spec_channels", 80),
        dec_kernel_size=cfg["encoder_kernel_size"],
        enc_channels=cfg["encoder_hidden"],
        enc_out_channels=cfg["encoder_hidden"] * 2,
        enc_kernel_size=cfg["encoder_kernel_size"],
        enc_dilation_rate=cfg.get("enc_dilation_rate", 1),
        enc_n_layers=int(cfg["encoder_n_layers"]),
        flow_hidden_channels=cfg["hidden_channels"],
        flow_kernel_size=cfg["flow_kernel_size"],
        flow_n_layers=int(cfg["flow_n_layers"]),
        flow_n_flows=int(cfg["flow_n_flows"]),
        sdp_hidden_channels=cfg["sdp_filter_channels"],
        sdp_kernel_size=cfg["sdp_kernel_size"],
        sdp_n_layers=int(cfg["sdp_n_layers"]),
        sdp_dropout=cfg["sdp_dropout"],
        sampling_rate=hps["data"]["sampling_rate"],
        filter_length=1024,
        hop_length=256,
        win_length=1024,
    ).to(device)

    # Load the weights from the safetensors checkpoint.
    from safetensors.torch import load_file
    model_sd = load_file(model_path)
    model.load_state_dict(model_sd, strict=True)
    model.eval()
|
72 |
+
|
73 |
+
def synthesize_voice(text):
    """Run inference for ``text`` with the globally loaded model.

    The text is converted with ``text_to_sequence(text, ['basic_cleaners'])``,
    batched to size one, and passed to ``model.infer`` with speaker id 0.

    Returns:
        The ``[0][0, 0]`` element of the ``infer`` result, moved to CPU and
        converted to a NumPy array.
        NOTE(review): this is an array, not an encoded WAV byte string --
        confirm what the caller expects.

    Raises:
        RuntimeError: if ``load_model()`` has not populated ``model`` yet.
    """
    if model is None:
        raise RuntimeError("Model is not loaded; call load_model() before synthesize_voice().")

    token_ids = text_to_sequence(text, ['basic_cleaners'])
    x = torch.LongTensor(token_ids).unsqueeze(0).to(device)
    x_lengths = torch.LongTensor([x.size(1)]).to(device)
    sid = torch.LongTensor([0]).to(device)

    with torch.no_grad():
        result = model.infer(x, x_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1.0)
        audio = result[0][0, 0].cpu().numpy()
    return audio
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi==0.110.0
|
2 |
+
uvicorn==0.29.0
|
3 |
+
numpy==1.23.5
|
4 |
+
torch==2.0.1
|
5 |
+
librosa==0.10.1
|
6 |
+
scipy==1.10.1
|
7 |
+
soundfile==0.12.1
|
8 |
+
huggingface_hub==0.23.1
|
9 |
+
safetensors==0.4.2
|
src/sbv2/commons.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import torch
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import json
|
5 |
+
|
6 |
+
def init_weights(m):
    """Re-initialise a Conv1d / ConvTranspose1d / Linear module in place.

    Weights get Xavier-uniform initialisation with the ReLU gain; a bias, when
    present, is zeroed. Any other module type is left untouched.
    """
    supported = (torch.nn.Conv1d, torch.nn.ConvTranspose1d, torch.nn.Linear)
    if not isinstance(m, supported):
        return
    relu_gain = torch.nn.init.calculate_gain('relu')
    torch.nn.init.xavier_uniform_(m.weight, gain=relu_gain)
    if m.bias is not None:
        torch.nn.init.zeros_(m.bias)
|
19 |
+
|
20 |
+
def kl_divergence(m_p, logs_p, m_q, logs_q):
    """Element-wise KL(P || Q) between two diagonal Gaussians.

    ``logs_*`` are treated as log standard deviations: ``exp(2 * logs)`` is
    used as the variance. The closed form is

        logs_q - logs_p - 0.5 + 0.5 * (exp(2*logs_p) + (m_p - m_q)^2) / exp(2*logs_q)

    The previous version multiplied the ``logs_q - logs_p`` term by 0.5 as
    well, which could produce negative values.
    """
    kl = (logs_q - logs_p) - 0.5
    kl = kl + 0.5 * (torch.exp(2.0 * logs_p) + (m_p - m_q) ** 2) * torch.exp(-2.0 * logs_q)
    return kl
|
23 |
+
|
24 |
+
def rand_gumbel(shape):
    """Draw Gumbel-distributed samples of the given shape on the CPU.

    Uniform samples are clamped to ``[1e-5, 1 - 1e-5]`` so both logarithms stay
    finite. ``torch.log`` is used because ``math.log`` cannot be applied to a
    tensor with more than one element.
    """
    uniform = torch.rand(shape, device="cpu").clamp(1e-5, 1 - 1e-5)
    return -torch.log(-torch.log(uniform))
|
27 |
+
|
28 |
+
def rand_uniform(shape):
    """Draw uniform samples of the given shape on the CPU."""
    sample = torch.rand(shape, device="cpu")
    return sample
|
31 |
+
|
32 |
+
def rand_logistic(shape):
    """Draw samples from the standard logistic distribution.

    Uses the inverse CDF ``log(u) - log(1 - u)`` on clamped uniform samples.
    The previous implementation sampled a relaxed one-hot categorical, whose
    values lie on a simplex rather than following a logistic distribution.
    """
    u = torch.rand(shape).clamp(1e-5, 1 - 1e-5)
    return torch.log(u) - torch.log1p(-u)
|
35 |
+
|
36 |
+
def slice_segments(x, ids_str, segment_size=4):
    """Cut one window of ``segment_size`` frames per batch item.

    ``x`` is indexed as ``x[i, :, start:start + segment_size]`` and
    ``ids_str[i]`` is the start offset along the last axis for item ``i``.
    The offset is used directly, matching what ``rand_slice_segments`` passes;
    the previous version multiplied it by ``segment_size``, which produced
    ragged or empty slices.
    """
    ret = []
    for i, start in enumerate(ids_str):
        start = int(start)
        ret.append(x[i, :, start:start + segment_size])
    return torch.stack(ret)


def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Cut a randomly positioned window of ``segment_size`` frames per item.

    Start offsets are drawn from ``[0, length - segment_size]`` inclusive,
    where ``length`` is ``x_lengths[i]`` when given and the full last-axis size
    otherwise. The ``+ 1`` makes the last valid offset reachable and lets an
    input that is exactly ``segment_size`` long be sliced instead of raising.
    """
    b, _, t = x.size()
    if x_lengths is None:
        ids_str = torch.randint(0, t - segment_size + 1, (b,), device=x.device)
    else:
        ids_str = (torch.rand(b, device=x.device) * (x_lengths - segment_size + 1)).long()
    return slice_segments(x, ids_str, segment_size)
|
52 |
+
|
53 |
+
def get_hparams_from_file(config_path):
    """Parse the UTF-8 JSON file at ``config_path`` and return the result."""
    with open(config_path, "r", encoding="utf-8") as handle:
        return json.load(handle)
|
src/sbv2/duration_predictor.py
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
|
5 |
+
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer over the channel axis.

    The input is split in two along dim 1. The first half passes through
    unchanged and drives a small conv stack that predicts a shift and a
    log-scale for the second half.
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers

        self.pre = nn.Conv1d(channels // 2, hidden_channels, 1)
        # Dilation grows geometrically with depth; padding scales with it.
        self.convs = nn.ModuleList([
            nn.Conv1d(
                hidden_channels,
                hidden_channels,
                kernel_size,
                padding=(kernel_size - 1) // 2 * (dilation_rate ** i),
                dilation=dilation_rate ** i,
            )
            for i in range(n_layers)
        ])
        self.post = nn.Conv1d(hidden_channels, channels, 1)

    def forward(self, x, reverse=False):
        """Apply the coupling transform, or its inverse when ``reverse`` is set."""
        x0, x1 = torch.chunk(x, 2, dim=1)
        hidden = self.pre(x0)
        for layer in self.convs:
            hidden = F.relu(layer(hidden))
        shift, log_scale = torch.chunk(self.post(hidden), 2, dim=1)

        if reverse:
            x1 = (x1 - shift) * torch.exp(-log_scale)
        else:
            x1 = shift + x1 * torch.exp(log_scale)

        return torch.cat([x0, x1], dim=1)
|
37 |
+
|
38 |
+
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` coupling layers, each followed by a ``Flip``."""

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows):
        super().__init__()
        stages = []
        for _ in range(n_flows):
            stages.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers))
            stages.append(Flip())
        self.flows = nn.ModuleList(stages)

    def forward(self, x, reverse=False):
        """Run the stages in order, or in reverse order with ``reverse=True``."""
        if reverse:
            for stage in reversed(self.flows):
                x = stage(x, reverse=True)
            return x
        for stage in self.flows:
            x = stage(x)
        return x
|
54 |
+
|
55 |
+
class Flip(nn.Module):
    """Reverse the order of dim 1; ``reverse`` is accepted but has no effect."""

    def __init__(self):
        super().__init__()

    def forward(self, x, reverse=False):
        return torch.flip(x, dims=[1])
|
61 |
+
|
62 |
+
class PosteriorEncoder(nn.Module):
    """Dilated conv stack that outputs a sample plus its mean and ``logs``."""

    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers):
        super().__init__()
        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.convs = nn.ModuleList([
            nn.Conv1d(
                hidden_channels,
                hidden_channels,
                kernel_size,
                padding=(kernel_size - 1) // 2 * (dilation_rate ** i),
                dilation=dilation_rate ** i,
            )
            for i in range(n_layers)
        ])
        self.proj_mean = nn.Conv1d(hidden_channels, out_channels, 1)
        self.proj_logvar = nn.Conv1d(hidden_channels, out_channels, 1)

    def forward(self, x, x_lengths):
        """Return ``(z, m, logs)``; ``x_lengths`` is accepted but not used."""
        hidden = self.pre(x)
        for layer in self.convs:
            hidden = F.relu(layer(hidden))
        m = self.proj_mean(hidden)
        logs = self.proj_logvar(hidden)
        # exp(logs) is applied directly as the noise scale.
        noise = torch.randn_like(m)
        z = m + noise * torch.exp(logs)
        return z, m, logs

    def infer(self, z, z_lengths):
        """Return ``z`` unchanged; ``z_lengths`` is accepted but not used."""
        return z
|
86 |
+
|
87 |
+
class DurationPredictor(nn.Module):
    """Two conv + ReLU + dropout stages followed by a 1x1 projection to one channel."""

    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout):
        super().__init__()
        same_pad = (kernel_size - 1) // 2
        self.conv1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=same_pad)
        self.conv2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=same_pad)
        self.proj = nn.Conv1d(filter_channels, 1, 1)
        self.dropout = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Return the masked projection with the channel axis squeezed out."""
        for conv in (self.conv1, self.conv2):
            x = self.dropout(torch.relu(conv(x)))
        masked = self.proj(x) * x_mask
        return masked.squeeze(1)
|
src/sbv2/flow.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
from torch.nn import Conv1d
|
5 |
+
|
6 |
+
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer over the channel axis.

    The first half of the channels is passed through untouched and conditions
    a shift and log-scale that are applied to the second half.
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers

        self.pre = nn.Conv1d(channels // 2, hidden_channels, 1)
        conv_stack = []
        for depth in range(n_layers):
            rate = dilation_rate ** depth
            conv_stack.append(
                nn.Conv1d(
                    hidden_channels,
                    hidden_channels,
                    kernel_size,
                    padding=(kernel_size - 1) * rate // 2,
                    dilation=rate,
                )
            )
        self.convs = nn.ModuleList(conv_stack)
        self.proj = nn.Conv1d(hidden_channels, channels, 1)

    def forward(self, x, reverse=False):
        """Apply the coupling transform, or its inverse when ``reverse`` is set."""
        x0, x1 = torch.chunk(x, 2, 1)
        hidden = self.pre(x0)
        for layer in self.convs:
            hidden = F.relu(layer(hidden))
        shift, log_scale = torch.chunk(self.proj(hidden), 2, 1)

        if reverse:
            x1 = (x1 - shift) * torch.exp(-log_scale)
        else:
            x1 = shift + x1 * torch.exp(log_scale)

        return torch.cat([x0, x1], 1)
|
44 |
+
|
45 |
+
class ResidualCouplingBlock(nn.Module):
    """Sequence of ``n_flows`` identically configured coupling layers."""

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows):
        super().__init__()
        layer_kwargs = dict(
            channels=channels,
            hidden_channels=hidden_channels,
            kernel_size=kernel_size,
            dilation_rate=dilation_rate,
            n_layers=n_layers,
        )
        self.flows = nn.ModuleList(
            [ResidualCouplingLayer(**layer_kwargs) for _ in range(n_flows)]
        )

    def forward(self, x, reverse=False):
        """Run the layers in order, or in reverse order with ``reverse=True``."""
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, reverse=True)
            return x
        for flow in self.flows:
            x = flow(x, reverse=False)
        return x
|
68 |
+
|
69 |
+
class Flip(nn.Module):
    """Reverse the order of dim 1.

    The operation is the same in both directions, so ``reverse`` does not
    change the result.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x, reverse=False):
        return x.flip(1)
|
src/sbv2/generator.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
from .modules import LayerNorm, ConvReluNorm
|
5 |
+
|
6 |
+
class Generator(nn.Module):
    """Conv front end, three residual blocks at 512 channels, tanh output."""

    def __init__(self, channels):
        super().__init__()
        self.conv_pre = nn.Conv1d(channels, 512, kernel_size=7, stride=1, padding=3)
        self.resblocks = nn.ModuleList([ResBlock(512) for _ in range(3)])
        self.conv_post = nn.Conv1d(512, 1, kernel_size=7, stride=1, padding=3)

    def forward(self, x):
        """Return the single-channel output squashed with ``tanh``."""
        hidden = self.conv_pre(x)
        for block in self.resblocks:
            hidden = block(hidden)
        return torch.tanh(self.conv_post(hidden))
|
22 |
+
|
23 |
+
class ResBlock(nn.Module):
    """Residual block: ``x + conv(relu(conv(x)))`` with kernel size 3."""

    def __init__(self, channels):
        super().__init__()
        self.convs = nn.Sequential(
            nn.Conv1d(channels, channels, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv1d(channels, channels, kernel_size=3, stride=1, padding=1),
        )

    def forward(self, x):
        residual = self.convs(x)
        return x + residual
|
src/sbv2/modules.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
|
5 |
+
|
6 |
+
class LayerNorm(nn.Module):
    """``nn.LayerNorm`` applied over dim 1 by transposing dims 1 and 2 around it."""

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.ln = nn.LayerNorm(channels, eps=eps)

    def forward(self, x):
        return self.ln(x.transpose(1, 2)).transpose(1, 2)
|
16 |
+
|
17 |
+
|
18 |
+
class ConvReluNorm(nn.Module):
    """Conv -> LayerNorm -> ReLU stages followed by a final plain convolution.

    The stack always has a first and a last convolution; ``n_layers - 2``
    hidden-to-hidden stages are inserted between them.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, bias):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.bias = bias

        def make_conv(c_in, c_out):
            return nn.Conv1d(c_in, c_out, kernel_size, padding=kernel_size // 2, bias=bias)

        layers = [make_conv(in_channels, hidden_channels), LayerNorm(hidden_channels), nn.ReLU()]
        for _ in range(n_layers - 2):
            layers += [make_conv(hidden_channels, hidden_channels), LayerNorm(hidden_channels), nn.ReLU()]
        layers.append(make_conv(hidden_channels, out_channels))
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        return self.main(x)
|
src/sbv2/monotonic_align.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn.functional as F
|
3 |
+
|
4 |
+
|
5 |
+
@torch.jit.script
def maximum_path(soft_attention, mask):
    """
    Trace the highest-scoring down/right path through a score grid.

    A cumulative score table is filled where each cell adds its own score to
    the better of the cell above and the cell to the left. The path is then
    traced back from the bottom-right corner to the top-left corner.

    The trace now continues along the top row / left column until it reaches
    (0, 0); the previous loop stopped at the first border cell it touched.

    :param soft_attention: [b, t_x, t_y]
    :param mask: [b, t_x, t_y]
    :return: attn: [b, t_x, t_y], 1 on the path and 0 elsewhere, times mask
    """
    b, t_x, t_y = soft_attention.size()
    device = soft_attention.device

    log_p = torch.zeros(b, t_x, t_y).to(device)
    # Border cells can only be reached along the border itself.
    log_p[:, 0, :] = torch.cumsum(soft_attention[:, 0, :], dim=1)
    log_p[:, :, 0] = torch.cumsum(soft_attention[:, :, 0], dim=1)

    for i in range(1, t_x):
        for j in range(1, t_y):
            max_prev = torch.max(log_p[:, i - 1, j], log_p[:, i, j - 1])
            log_p[:, i, j] = max_prev + soft_attention[:, i, j]

    path = torch.zeros_like(soft_attention)
    for b_idx in range(b):
        i = t_x - 1
        j = t_y - 1
        while i > 0 or j > 0:
            path[b_idx, i, j] = 1
            if i == 0:
                # On the top row the only predecessor is to the left.
                j -= 1
            elif j == 0:
                # On the left column the only predecessor is above.
                i -= 1
            elif bool(log_p[b_idx, i - 1, j] > log_p[b_idx, i, j - 1]):
                i -= 1
            else:
                j -= 1
        path[b_idx, i, j] = 1

    return path * mask
|
src/sbv2/posterior_encoder.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
|
5 |
+
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer over the channel axis.

    The first half of the channels is passed through untouched and conditions
    a shift and log-scale that are applied to the second half.

    The constructor takes ``(channels, hidden_channels, kernel_size,
    dilation_rate, n_layers)``, which is what the body reads and what
    ``ResidualCouplingBlock`` in this file passes. The previous parameter list
    named different variables, so construction always failed.
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers

        self.pre = nn.Conv1d(channels // 2, hidden_channels, 1)
        self.convs = nn.ModuleList()
        for i in range(n_layers):
            # Dilation grows geometrically with depth; padding scales with it.
            dilation = dilation_rate ** i
            self.convs.append(
                nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=(kernel_size - 1) // 2 * dilation, dilation=dilation)
            )
        self.post = nn.Conv1d(hidden_channels, channels, 1)

    def forward(self, x, reverse=False):
        """Apply the coupling transform, or its inverse when ``reverse`` is set."""
        x0, x1 = torch.chunk(x, 2, dim=1)
        h = self.pre(x0)
        for conv in self.convs:
            h = F.relu(conv(h))
        h = self.post(h)
        m, logs = torch.chunk(h, 2, dim=1)

        if not reverse:
            x1 = m + x1 * torch.exp(logs)
        else:
            x1 = (x1 - m) * torch.exp(-logs)

        return torch.cat([x0, x1], dim=1)
|
37 |
+
|
38 |
+
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` coupling layers, each followed by a ``Flip``."""

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows):
        super().__init__()
        stages = []
        for _ in range(n_flows):
            stages.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers))
            stages.append(Flip())
        self.flows = nn.ModuleList(stages)

    def forward(self, x, reverse=False):
        """Run the stages in order, or in reverse order with ``reverse=True``."""
        if reverse:
            for stage in reversed(self.flows):
                x = stage(x, reverse=True)
            return x
        for stage in self.flows:
            x = stage(x)
        return x
|
54 |
+
|
55 |
+
class Flip(nn.Module):
    """Reverse the order of dim 1; ``reverse`` is accepted but has no effect."""

    def __init__(self):
        super().__init__()

    def forward(self, x, reverse=False):
        return torch.flip(x, dims=[1])
|
61 |
+
|
62 |
+
class PosteriorEncoder(nn.Module):
    """Dilated conv stack that outputs a sample plus its mean and ``logs``."""

    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers):
        super().__init__()
        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.convs = nn.ModuleList([
            nn.Conv1d(
                hidden_channels,
                hidden_channels,
                kernel_size,
                padding=(kernel_size - 1) // 2 * (dilation_rate ** i),
                dilation=dilation_rate ** i,
            )
            for i in range(n_layers)
        ])
        self.proj_mean = nn.Conv1d(hidden_channels, out_channels, 1)
        self.proj_logvar = nn.Conv1d(hidden_channels, out_channels, 1)

    def forward(self, x, x_lengths):
        """Return ``(z, m, logs)``; ``x_lengths`` is accepted but not used."""
        hidden = self.pre(x)
        for layer in self.convs:
            hidden = F.relu(layer(hidden))
        m = self.proj_mean(hidden)
        logs = self.proj_logvar(hidden)
        # exp(logs) is applied directly as the noise scale.
        noise = torch.randn_like(m)
        z = m + noise * torch.exp(logs)
        return z, m, logs

    def infer(self, z, z_lengths):
        """Return ``z`` unchanged; ``z_lengths`` is accepted but not used."""
        return z
|
src/sbv2/stochastic_duration_predictor.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
|
5 |
+
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer over the channel axis.

    The input is split in two along dim 1. The first half passes through
    unchanged and drives a small conv stack that predicts a shift and a
    log-scale for the second half.
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers

        self.pre = nn.Conv1d(channels // 2, hidden_channels, 1)
        # Dilation grows geometrically with depth; padding scales with it.
        self.convs = nn.ModuleList([
            nn.Conv1d(
                hidden_channels,
                hidden_channels,
                kernel_size,
                padding=(kernel_size - 1) // 2 * (dilation_rate ** i),
                dilation=dilation_rate ** i,
            )
            for i in range(n_layers)
        ])
        self.post = nn.Conv1d(hidden_channels, channels, 1)

    def forward(self, x, reverse=False):
        """Apply the coupling transform, or its inverse when ``reverse`` is set."""
        x0, x1 = torch.chunk(x, 2, dim=1)
        hidden = self.pre(x0)
        for layer in self.convs:
            hidden = F.relu(layer(hidden))
        shift, log_scale = torch.chunk(self.post(hidden), 2, dim=1)

        if reverse:
            x1 = (x1 - shift) * torch.exp(-log_scale)
        else:
            x1 = shift + x1 * torch.exp(log_scale)

        return torch.cat([x0, x1], dim=1)
|
37 |
+
|
38 |
+
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` coupling layers, each followed by a ``Flip``."""

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows):
        super().__init__()
        stages = []
        for _ in range(n_flows):
            stages.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers))
            stages.append(Flip())
        self.flows = nn.ModuleList(stages)

    def forward(self, x, reverse=False):
        """Run the stages in order, or in reverse order with ``reverse=True``."""
        if reverse:
            for stage in reversed(self.flows):
                x = stage(x, reverse=True)
            return x
        for stage in self.flows:
            x = stage(x)
        return x
|
54 |
+
|
55 |
+
class Flip(nn.Module):
    """Reverse the order of dim 1; ``reverse`` is accepted but has no effect."""

    def __init__(self):
        super().__init__()

    def forward(self, x, reverse=False):
        return torch.flip(x, dims=[1])
|
61 |
+
|
62 |
+
class PosteriorEncoder(nn.Module):
    """Dilated conv stack that outputs a sample plus its mean and ``logs``."""

    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers):
        super().__init__()
        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.convs = nn.ModuleList([
            nn.Conv1d(
                hidden_channels,
                hidden_channels,
                kernel_size,
                padding=(kernel_size - 1) // 2 * (dilation_rate ** i),
                dilation=dilation_rate ** i,
            )
            for i in range(n_layers)
        ])
        self.proj_mean = nn.Conv1d(hidden_channels, out_channels, 1)
        self.proj_logvar = nn.Conv1d(hidden_channels, out_channels, 1)

    def forward(self, x, x_lengths):
        """Return ``(z, m, logs)``; ``x_lengths`` is accepted but not used."""
        hidden = self.pre(x)
        for layer in self.convs:
            hidden = F.relu(layer(hidden))
        m = self.proj_mean(hidden)
        logs = self.proj_logvar(hidden)
        # exp(logs) is applied directly as the noise scale.
        noise = torch.randn_like(m)
        z = m + noise * torch.exp(logs)
        return z, m, logs

    def infer(self, z, z_lengths):
        """Return ``z`` unchanged; ``z_lengths`` is accepted but not used."""
        return z
|
86 |
+
|
87 |
+
class DurationPredictor(nn.Module):
    """Two conv + ReLU + dropout stages followed by a 1x1 projection to one channel."""

    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout):
        super().__init__()
        same_pad = (kernel_size - 1) // 2
        self.conv1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=same_pad)
        self.conv2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=same_pad)
        self.proj = nn.Conv1d(filter_channels, 1, 1)
        self.dropout = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Return the masked projection with the channel axis squeezed out."""
        for conv in (self.conv1, self.conv2):
            x = self.dropout(torch.relu(conv(x)))
        masked = self.proj(x) * x_mask
        return masked.squeeze(1)
|
105 |
+
|
106 |
+
class StochasticDurationPredictor(nn.Module):
    """Duration model built from a 1x1 conv, a coupling-flow block and a 1x1 projection."""

    def __init__(self, channels, filter_channels, kernel_size, n_flows, p_dropout):
        super().__init__()
        self.pre = nn.Conv1d(channels, filter_channels, 1)
        # The flow block is built with dilation rate 1 and 4 conv layers per coupling layer.
        self.flows = ResidualCouplingBlock(filter_channels, filter_channels, kernel_size, 1, 4, n_flows)
        self.post = nn.Conv1d(filter_channels, 1, 1)
        self.dropout = nn.Dropout(p_dropout)

    def _embed(self, x):
        """Apply the input conv, ReLU and dropout."""
        return self.dropout(F.relu(self.pre(x)))

    def _project(self, hidden, x_mask):
        """Project to one channel, squeeze it out and apply the mask."""
        return self.post(hidden).squeeze(1) * x_mask.squeeze(1)

    def forward(self, x, x_mask, durations):
        """Return the mean squared error between the projected flow output and ``durations``.

        NOTE(review): the embedded input only determines the shape of the noise;
        the flows receive the noise alone. Confirm this is intended.
        """
        hidden = self._embed(x)
        noise = torch.randn_like(hidden)
        flowed = self.flows(noise, reverse=False)
        predicted = self._project(flowed, x_mask)
        return F.mse_loss(predicted, durations, reduction="none").mean()

    def infer(self, x, x_mask):
        """Run the embedded input through the flows in reverse and return the masked projection."""
        hidden = self._embed(x)
        flowed = self.flows(hidden, reverse=True)
        return self._project(flowed, x_mask)
|
src/sbv2/synthesizer_trn.py
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Fully rewritten synthesizer_trn.py
|
2 |
+
# (strict-compatible with config.json)
|
3 |
+
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
|
7 |
+
from src.sbv2.generator import Generator
|
8 |
+
from src.sbv2.posterior_encoder import PosteriorEncoder
|
9 |
+
from src.sbv2.flow import ResidualCouplingBlock
|
10 |
+
from src.sbv2.flow import Flip
|
11 |
+
from src.sbv2.duration_predictor import DurationPredictor
|
12 |
+
from src.sbv2.stochastic_duration_predictor import StochasticDurationPredictor
|
13 |
+
|
14 |
+
class SynthesizerTrn(nn.Module):
    """Top-level synthesizer that instantiates and holds the sub-networks.

    Every constructor argument is stored on the instance under the same name,
    and a subset of them is used to build the sub-modules ``enc_p``,
    ``decoder``, ``flow``, ``flow_post``, ``dp`` and ``sdp``.

    Neither ``forward`` nor ``infer`` is implemented yet; both raise
    ``NotImplementedError``.
    """

    def __init__(self,
                 n_vocab,
                 spec_channels,
                 inter_channels,
                 hidden_channels,
                 filter_channels,
                 n_heads,
                 n_layers,
                 kernel_size,
                 p_dropout,
                 resblock,
                 resblock_kernel_sizes,
                 resblock_dilation_sizes,
                 upsample_rates,
                 upsample_initial_channel,
                 upsample_kernel_sizes,
                 segment_size,
                 gin_channels,
                 out_channels,
                 dec_kernel_size,
                 enc_channels,
                 enc_out_channels,
                 enc_kernel_size,
                 enc_dilation_rate,
                 enc_n_layers,
                 flow_hidden_channels,
                 flow_kernel_size,
                 flow_n_layers,
                 flow_n_flows,
                 sdp_hidden_channels,
                 sdp_kernel_size,
                 sdp_n_layers,
                 sdp_dropout,
                 sampling_rate,
                 filter_length,
                 hop_length,
                 win_length):
        super().__init__()

        # Keep every hyper-parameter on the instance, including the ones that
        # are not consumed by any sub-module below.
        self.n_vocab = n_vocab
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.out_channels = out_channels
        self.dec_kernel_size = dec_kernel_size
        self.enc_channels = enc_channels
        self.enc_out_channels = enc_out_channels
        self.enc_kernel_size = enc_kernel_size
        self.enc_dilation_rate = enc_dilation_rate
        self.enc_n_layers = enc_n_layers
        self.flow_hidden_channels = flow_hidden_channels
        self.flow_kernel_size = flow_kernel_size
        self.flow_n_layers = flow_n_layers
        self.flow_n_flows = flow_n_flows
        self.sdp_hidden_channels = sdp_hidden_channels
        self.sdp_kernel_size = sdp_kernel_size
        self.sdp_n_layers = sdp_n_layers
        self.sdp_dropout = sdp_dropout
        self.sampling_rate = sampling_rate
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length

        # Network modules
        self.enc_p = PosteriorEncoder(
            spec_channels, inter_channels, hidden_channels,
            kernel_size, enc_dilation_rate, int(enc_n_layers))
        self.decoder = Generator(
            upsample_rates, upsample_initial_channel)
        self.flow = ResidualCouplingBlock(
            inter_channels, flow_hidden_channels, flow_kernel_size, flow_n_layers)
        self.flow_post = Flip()
        self.dp = DurationPredictor(
            inter_channels, filter_channels, kernel_size, p_dropout)
        # StochasticDurationPredictor.__init__ takes
        # (channels, filter_channels, kernel_size, n_flows, p_dropout). The
        # previous four-argument call put p_dropout in the n_flows slot and left
        # p_dropout unset, which raised TypeError on construction.
        # NOTE(review): flow_n_flows is assumed to be the intended flow count -
        # confirm against the checkpoint before loading with strict=True.
        self.sdp = StochasticDurationPredictor(
            inter_channels, filter_channels, kernel_size,
            n_flows=int(flow_n_flows), p_dropout=p_dropout)

    def forward(self, *args, **kwargs):
        """Training entry point; not implemented.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Training用 forwardは未実装です")

    def infer(self, *args, **kwargs):
        """Inference entry point; not implemented.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("推論用 inferは未実装です")
|
src/sbv2/text/cleaners.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import unicodedata
|
3 |
+
|
4 |
+
# (source, target) single-character punctuation rewrites applied after NFKC
# normalisation. Targets are the ASCII marks listed in the symbol table of
# src/sbv2/text/symbols.py (",", "." and "-"); the Japanese marks "、" and "。"
# are not in that table, so rewriting towards them made the encoder drop every
# comma and full stop.
_japanese_replacements = [
    ("‐", "-"),
    ("―", "-"),
    ("−", "-"),
    ("・", ","),
    ("、", ","),
    ("。", "."),
]

# Special symbols that are removed outright. "!" and "?" are in the symbol
# table as well, but their removal is kept as the original cleaner did it.
_removed_characters = "()!?[]{}"

# One translation table covering both the rewrites and the removals, so the
# text is processed in a single pass. No rewrite target is itself a rewrite
# source or a removed character, so the single pass matches sequential passes.
_cleaner_table = str.maketrans(
    {**dict(_japanese_replacements), **dict.fromkeys(_removed_characters)}
)


def japanese_cleaners(text):
    """Normalise ``text`` and map its punctuation onto the model vocabulary.

    The text is NFKC-normalised first (which folds full-width and half-width
    variants), then punctuation is rewritten according to
    ``_japanese_replacements`` and the characters in ``_removed_characters``
    are deleted.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    return unicodedata.normalize("NFKC", text).translate(_cleaner_table)


# Alias imported by src/sbv2/text/text_to_sequence.py.
basic_cleaners = japanese_cleaners
|
src/sbv2/text/symbols.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Character groups that make up the symbol inventory, in vocabulary order.
_pad = "_"
_punctuation = ",.!?-…~"
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
_japanese = "あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよらりるれろわをんアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヲンー"
_extra = "0123456789"

# Flatten the groups into one list of single characters; a symbol's position
# in this list is the ID assigned to it by src/sbv2/text_encoder.py.
symbols = [
    char
    for group in (_pad, _punctuation, _letters, _japanese, _extra)
    for char in group
]
|
src/sbv2/text/text_to_sequence.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# text_to_sequence.py
|
2 |
+
|
3 |
+
from src.sbv2.text_encoder import text_to_sequence as encoder_text_to_sequence
|
4 |
+
from src.sbv2.text.cleaners import basic_cleaners
|
5 |
+
|
6 |
+
def text_to_sequence(text):
    """Clean ``text`` with ``basic_cleaners`` and encode it as symbol IDs.

    Args:
        text: Raw input string.

    Returns:
        Whatever the encoder's ``text_to_sequence`` returns for the cleaned
        text.
    """
    return encoder_text_to_sequence(basic_cleaners(text))
|
src/sbv2/text_encoder.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# text_encoder.py
|
2 |
+
|
3 |
+
from src.sbv2.text.symbols import symbols
|
4 |
+
|
5 |
+
# Lookup tables between a symbol and its position in ``symbols``.
_id_to_symbol = dict(enumerate(symbols))
_symbol_to_id = {symbol: index for index, symbol in _id_to_symbol.items()}
|
7 |
+
|
8 |
+
def text_to_sequence(text):
    """Convert ``text`` into a list of symbol IDs.

    Characters that are not present in ``_symbol_to_id`` are skipped silently.

    Args:
        text: Iterable of single-character symbols (typically a string).

    Returns:
        A list with the ID of every known character, in input order.
    """
    ids = []
    for char in text:
        index = _symbol_to_id.get(char)
        if index is not None:
            ids.append(index)
    return ids
|
11 |
+
|
12 |
+
def sequence_to_text(sequence):
    """Convert a sequence of symbol IDs back into a string.

    IDs that are not present in ``_id_to_symbol`` are skipped silently.

    Args:
        sequence: Iterable of symbol IDs.

    Returns:
        The concatenation of the symbols for every known ID, in input order.
    """
    pieces = []
    for index in sequence:
        symbol = _id_to_symbol.get(index)
        if symbol is not None:
            pieces.append(symbol)
    return ''.join(pieces)
|
14 |
+
|