File size: 4,708 Bytes
13f5b7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.18215
    disable_first_stage_autocast: True
    ckpt_path: 

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DenoiserDub
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

    network_wrapper: 
      target: sgm.modules.diffusionmodules.wrappers.InterpolationWrapper
      params:
        im_size: [512, 512] # USER: adapt this to your dataset
        n_channels: 4
        starting_mask_method: zeros
        add_mask: True

    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 0
        num_classes: sequential
        use_checkpoint: True
        in_channels: 9
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: True
        use_spatial_context: True
        merge_strategy: learned_with_images
        video_kernel_size: [3, 1, 1]
        fine_tuning_method: null
        audio_cond_method: both_keyframes
        additional_audio_frames: 0
        audio_dim: 1024
        unfreeze_blocks: ["input"] 

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
   
        - input_key: cond_frames
          is_trainable: False
          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
          params:
            disable_encoder_autocast: True
            n_cond_frames: 2
            n_copies: 1
            is_ae: True
            encoder_config:
              target: sgm.models.autoencoder.AutoencoderKLModeOnly
              params:
                embed_dim: 4
                monitor: val/rec_loss
                ddconfig:
                  attn_type: vanilla-xformers
                  double_z: True
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult: [1, 2, 4, 4]
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
                lossconfig:
                  target: torch.nn.Identity

        - input_key: gt # allows to use the ground truth as a condition
          is_trainable: False
          target: sgm.modules.encoders.modules.IdentityEncoder
          params:
            cond_type: gt

        - input_key: audio_emb
          is_trainable: True
          target: sgm.modules.encoders.modules.WhisperAudioEmbedder
          params:
            merge_method: mean 
            linear_dim: 1024
            cond_type: crossattn
            audio_dim: 768

        - input_key: masks
          is_trainable: False
          target: sgm.modules.encoders.modules.IdentityEncoder
          params:
            cond_type: masks

    first_stage_config:
      target: sgm.models.autoencoder.AutoencodingEngine
      params:
        loss_config:
          target: torch.nn.Identity
        regularizer_config:
          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
        encoder_config: 
          target: sgm.modules.diffusionmodules.model.Encoder
          params:
            attn_type: vanilla
            double_z: True
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult: [1, 2, 4, 4]
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
        decoder_config:
          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
          params:
            attn_type: vanilla
            double_z: True
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult: [1, 2, 4, 4]
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
            video_kernel_size: [3, 1, 1]

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 10
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.AYSDiscretization

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.AudioRefMultiCondGuider
          params:
            audio_ratio: 5.0
            ref_ratio: 2.0