# Seed-VC: configs/v2/vc_wrapper.yaml
# Source: Hugging Face upload by Plachta ("Upload 116 files", commit 56a1295, verified).
# Root object of this config: the class path named by `_target_`
# (Hydra-style instantiate convention — confirm the loader used by the project).
_target_: modules.v2.vc_wrapper.VoiceConversionWrapper
# Same value as mel_fn.sampling_rate in this file (22050).
sr: 22050
# Same value as mel_fn.hop_size in this file (256).
hop_size: 256
# Mel-spectrogram function settings (modules.audio.mel_spectrogram).
# `_partial_: true` presumably yields a partially-applied callable rather than
# calling the target immediately (Hydra `_partial_` convention — confirm).
# Child keys are nested under `mel_fn` so they no longer collide with the
# top-level `_target_` / `hop_size` keys.
mel_fn:
  _target_: modules.audio.mel_spectrogram
  _partial_: true
  n_fft: 1024
  win_size: 1024
  hop_size: 256  # same value as the top-level hop_size
  num_mels: 80
  sampling_rate: 22050  # same value as the top-level sr
  fmin: 0
  fmax: null
  center: false
# CFM model (modules.v2.cfm.CFM) with a DiT estimator.
# NOTE(review): nesting reconstructed from a flattened copy; every key after
# `estimator._target_` is assumed to be a DiT argument — confirm against upstream.
cfm:
  _target_: modules.v2.cfm.CFM
  estimator:
    _target_: modules.v2.dit_wrapper.DiT
    time_as_token: true
    style_as_token: true
    uvit_skip_connection: false
    block_size: 8192
    depth: 13
    num_heads: 8
    hidden_dim: 512
    in_channels: 80  # same value as mel_fn.num_mels
    content_dim: 512  # same value as cfm_length_regulator.channels
    style_encoder_dim: 192  # same value as style_encoder.embedding_size
    class_dropout_prob: 0.1
    dropout_rate: 0.0
    attn_dropout_rate: 0.0
# Length regulator paired with the CFM model (InterpolateRegulator).
# NOTE(review): assumed to be a top-level sibling of `cfm`, not a child of it —
# confirm against the VoiceConversionWrapper constructor.
cfm_length_regulator:
  _target_: modules.v2.length_regulator.InterpolateRegulator
  channels: 512
  is_discrete: true
  codebook_size: 2048  # same value as content_extractor_wide.quantizer.codebook_size
  sampling_ratios: [1, 1, 1, 1]
  f0_condition: false
# Autoregressive model: NaiveWrapper -> NaiveTransformer -> NaiveModelArgs.
# NOTE(review): nesting reconstructed from a flattened copy; every key after
# `config._target_` is assumed to be a NaiveModelArgs field — confirm upstream.
ar:
  _target_: modules.v2.ar.NaiveWrapper
  model:
    _target_: modules.v2.ar.NaiveTransformer
    config:
      _target_: modules.v2.ar.NaiveModelArgs
      dropout: 0.0
      rope_base: 10000.0
      dim: 768  # equals n_head * head_dim (12 * 64)
      head_dim: 64
      n_local_heads: 2
      intermediate_size: 2304  # equals 3 * dim
      n_head: 12
      n_layer: 12
      # 2049 = 2048 + 1 for eos; presumably 2048 is the wide codebook size — confirm.
      vocab_size: 2049
# Length regulator paired with the AR model (InterpolateRegulator).
# NOTE(review): assumed to be a top-level sibling of `ar`, not a child of it —
# confirm against the VoiceConversionWrapper constructor.
ar_length_regulator:
  _target_: modules.v2.length_regulator.InterpolateRegulator
  channels: 768  # same value as ar.model.config.dim
  is_discrete: true
  codebook_size: 32  # same value as content_extractor_narrow.quantizer.codebook_size
  sampling_ratios: []  # explicitly empty list
  f0_condition: false
# Style encoder (modules.campplus.DTDNN.CAMPPlus).
style_encoder:
  _target_: modules.campplus.DTDNN.CAMPPlus
  feat_dim: 80
  embedding_size: 192  # same value as cfm.estimator.style_encoder_dim
# Narrow content extractor: AstralQuantizer with a 32-entry codebook.
# The `encoder` mapping is anchored as `&bottleneck_encoder` and reused via alias
# by content_extractor_wide, so its keys must be nested under `encoder` for the
# anchor to capture them.
# NOTE(review): nesting reconstructed from a flattened copy — confirm upstream.
content_extractor_narrow:
  _target_: modules.astral_quantization.default_model.AstralQuantizer
  tokenizer_name: "openai/whisper-small"
  ssl_model_name: "facebook/hubert-large-ll60k"
  ssl_output_layer: 18
  skip_ssl: true  # set only on the narrow extractor; the wide one omits this key
  encoder: &bottleneck_encoder
    _target_: modules.astral_quantization.convnext.ConvNeXtV2Stage
    dim: 512
    num_blocks: 12
    intermediate_dim: 1536
    dilation: 1
    input_dim: 1024
  quantizer:
    _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
    codebook_size: 32  # codebook size, must be a power of 2
    dim: 512
    entropy_loss_weight: 0.1
    diversity_gamma: 1.0
    spherical: true
    enable_entropy_loss: true
    soft_entropy_loss: true
# Wide content extractor: AstralQuantizer with a 2048-entry codebook.
# Reuses the encoder mapping anchored as `&bottleneck_encoder` in
# content_extractor_narrow; the anchor must appear earlier in the same document.
# NOTE(review): nesting reconstructed from a flattened copy — confirm upstream.
content_extractor_wide:
  _target_: modules.astral_quantization.default_model.AstralQuantizer
  tokenizer_name: "openai/whisper-small"
  ssl_model_name: "facebook/hubert-large-ll60k"
  ssl_output_layer: 18
  encoder: *bottleneck_encoder
  quantizer:
    _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
    codebook_size: 2048  # codebook size, must be a power of 2
    dim: 512
    entropy_loss_weight: 0.1
    diversity_gamma: 1.0
    spherical: true
    enable_entropy_loss: true
    soft_entropy_loss: true
# Vocoder built via BigVGAN.from_pretrained.
# The checkpoint name (22khz / 80band / 256x) lines up with sr=22050,
# num_mels=80 and hop_size=256 used elsewhere in this file.
vocoder:
  _target_: modules.bigvgan.bigvgan.BigVGAN.from_pretrained
  pretrained_model_name_or_path: "nvidia/bigvgan_v2_22khz_80band_256x"
  use_cuda_kernel: false