---
# This is in YAML format. You can use JSON if you prefer.
# I like both, but YAML is easier to write,
# plus it has comments, which is nice for documentation.
# This is the config I use for my sliders. It is solid and tested.
job: train
config:
  # the name will be used to create a folder in the output folder
  # it will also replace any [name] token in the rest of this config
  name: detail_slider_v1
  # folder will be created with name above in folder below
  # it can be relative to the project root or absolute
  training_folder: "output/LoRA"
  device: cuda:0 # cpu, cuda:0, etc
  # for tensorboard logging, we will make a subfolder for this job
  log_dir: "output/.tensorboard"
  # you can stack processes for other jobs, but it has not been tested with sliders
  # just use one for now
  process:
    - type: slider # tells runner to run the slider process
      # network is the LoRA network for a slider. I recommend leaving this as is
      network:
        # network type lierla is traditional LoRA that works everywhere; linear layers only
        type: "lierla"
        # rank / dim of the network. Bigger is not always better, especially for sliders. 8 is good
        linear: 8
        linear_alpha: 4 # use about half of the rank
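        # a hypothetical larger network, if rank 8 is not expressive enough (rarely needed for sliders):
#        linear: 16
#        linear_alpha: 8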
      # training config
      train:
        # this is also used in sampling. Stick with ddpm unless you know what you are doing
        noise_scheduler: "ddpm" # options: "ddpm", "lms", "euler_a"
        # how many steps to train. More is not always better. I rarely go over 1000
        steps: 500
        # I have had good results with 4e-4 to 1e-4 at 500 steps
        lr: 2e-4
        # enables gradient checkpoint, saves vram, leave it on
        gradient_checkpointing: true
        # train the unet. I recommend leaving this true
        train_unet: true
        # train the text encoder. I don't recommend this unless you have a special use case
        # for sliders we are adjusting representation of the concept (unet),
        # not the description of it (text encoder)
        train_text_encoder: false
        # same as from sd-scripts, not fully tested but should speed up training
        min_snr_gamma: 5.0
        # just leave this unless you know what you are doing.
        # "dadaptation" is also supported, but set lr to 1 if you use it;
        # it learns too fast and I don't recommend it
        optimizer: "adamw"
        # only constant for now
        lr_scheduler: "constant"
        # we denoise a random number of steps, from 1 to this number,
        # while training. Just leave it
        max_denoising_steps: 40
        # works great at 1. I do 1 even with my 4090.
        # higher values may not work right with the newer single-batch stacking code anyway
        batch_size: 1
        # bf16 works best if your GPU supports it (most modern cards do)
        dtype: bf16  # fp32, bf16, fp16
        # if you have it, use it. It is faster and better
        # torch 2.0 doesn't need xformers anymore; only use it if you are on a lower version
#        xformers: true
        # I don't recommend using this unless you are trying to make a darker LoRA; then use 0.1 MAX
        # although, the way we train sliders is comparative, so it probably won't work anyway
        noise_offset: 0.0
#        noise_offset: 0.0357  # SDXL was trained with offset of 0.0357. So use that when training on SDXL

      # the model to train the LoRA network on
      model:
        # huggingface name, path relative to the project root, or absolute path to a .safetensors or .ckpt file
        name_or_path: "runwayml/stable-diffusion-v1-5"
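        # for illustration, a hypothetical local checkpoint path would look like:
#        name_or_path: "/path/to/model.safetensors"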
        is_v2: false  # for v2 models
        is_v_pred: false # for v-prediction models (most v2 models)
        # has some issues with the dual text encoder and the way we train sliders
        # it works, but weights probably need to be higher to see the effect
        is_xl: false  # for SDXL models

      # saving config
      save:
        dtype: float16 # precision to save. I recommend float16
        save_every: 50 # save every this many steps
        # only this many step saves are kept; older ones are removed
        # allows you to save more often in case of a crash without filling up your drive
        max_step_saves_to_keep: 2

      # sampling config
      sample:
        # must match train.noise_scheduler; this is not used here,
        # but may be in the future and in other processes
        sampler: "ddpm"
        # sample every this many steps
        sample_every: 20
        # image size
        width: 512
        height: 512
        # prompts to use for sampling. Do as many as you want, but it slows down training
        # pick ones that will best represent the concept you are trying to adjust
        # some flags are allowed after the prompt
        #  --m [number]  # network multiplier (LoRA weight). -3 for the negative side and 3 for the
        #      positive side are good tests. Inherits sample.network_multiplier if not set
        #  --n [string]  # negative prompt. Inherits sample.neg if not set
        #      (see the commented example after the prompt list below)
        # Only 75 tokens allowed currently
        # I like to do a wide positive and negative spread so I can see a good range and stop
        # early if the network is breaking down
        prompts:
          - "a woman in a coffee shop, black hat, blonde hair, blue jacket --m -5"
          - "a woman in a coffee shop, black hat, blonde hair, blue jacket --m -3"
          - "a woman in a coffee shop, black hat, blonde hair, blue jacket --m 3"
          - "a woman in a coffee shop, black hat, blonde hair, blue jacket --m 5"
          - "a golden retriever sitting on a leather couch, --m -5"
          - "a golden retriever sitting on a leather couch --m -3"
          - "a golden retriever sitting on a leather couch --m 3"
          - "a golden retriever sitting on a leather couch --m 5"
          - "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m -5"
          - "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m -3"
          - "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m 3"
          - "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m 5"
        # negative prompt used on all prompts above as default if they don't have one
        neg: "cartoon, fake, drawing, illustration, cgi, animated, anime, monochrome"
        # seed for sampling. 42 is the answer for everything
        seed: 42
        # walks the seed, so s1 is 42, s2 is 43, s3 is 44, etc
        # will start over on the next sample_every, so s1 is always the seed
        # works well if you use the same prompt but want different results
        walk_seed: false
        # cfg scale (4 to 10 is good)
        guidance_scale: 7
        # sampler steps (20 to 30 is good)
        sample_steps: 20
        # default network multiplier for all prompts
        # since we are training a slider, I recommend overriding this with --m [number]
        # in the prompts above to get both sides of the slider
        network_multiplier: 1.0

      # logging information
      logging:
        log_every: 10 # log every this many steps
        use_wandb: false # not supported yet
        verbose: false # you probably don't need this unless you are debugging

      # slider training config, best for last
      slider:
        # resolutions to train on. [ width, height ]. This is less important for sliders,
        # as we are not teaching the model anything it doesn't already know,
        # but it must be a size the model understands: [ 512, 512 ] for sd_v1.5, [ 768, 768 ] for sd_v2.1,
        # and [ 1024, 1024 ] for sd_xl
        # you can do as many as you want here
        resolutions:
          - [ 512, 512 ]
#          - [ 512, 768 ]
#          - [ 768, 768 ]
        # slider training uses 4 combined steps for a single round. This will do it in one gradient
        # step. It is highly optimized and shouldn't take any more VRAM than doing without it,
        # since we break down batches for gradient accumulation now, so just leave it on
        batch_full_slide: true
        # These are the concepts to train on. You can do as many as you want here,
        # but they can conflict with and outweigh each other. Other than for experimenting, I recommend
        # just doing one for good results
        targets:
            # target_class is the base concept we are adjusting the representation of
            # for example, if we are adjusting the representation of a person, we would use "person"
            # if we are adjusting the representation of a cat, we would use "cat". It is not
            # a keyword necessarily but what the model understands the concept to represent.
            # "person" will affect men, women, children, etc but will not affect cats, dogs, etc
            # it is the models base general understanding of the concept and everything it represents
            # you can leave it blank to affect everything. In this example, we are adjusting
            # detail, so we will leave it blank to affect everything
          - target_class: ""
            # positive is the prompt for the positive side of the slider.
            # It is the concept that will be excited and amplified in the model when we slide the slider
            # to the positive side and forgotten / inverted when we slide
            # the slider to the negative side. It is generally best to include the target_class in
            # the prompt. You want it to be the extreme of what you want to train on. For example,
            # if you want to train on fat people, you would use "an extremely fat, morbidly obese person"
            # as the prompt. Not just "fat person"
            # max 75 tokens for now
            positive: "high detail, 8k, intricate, detailed, high resolution, high res, high quality"
            # negative is the prompt for the negative side of the slider and works the same as positive.
            # It does not necessarily work the same as a negative prompt when generating images;
            # these need to be polar opposites of each other
            # max 75 tokens for now
            negative: "blurry, boring, fuzzy, low detail, low resolution, low res, low quality"
            # the loss for this target is multiplied by this number.
            # if you are doing more than one target, it may be good to set less important ones
            # to a lower number like 0.1 so they don't outweigh the primary target
            # (see the commented sketch after this target block)
            weight: 1.0
            # shuffle the prompt parts, split on commas. We will run every combination randomly.
            # this will make the LoRA more robust. You probably want this on unless prompt order
            # is important for some reason
            shuffle: true
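            # a sketch of a second, lower-weight target with hypothetical prompts, per the weight
            # note above. Stacking targets is experimental, so it is left commented out
#          - target_class: ""
#            positive: "vibrant, saturated, colorful"
#            negative: "dull, washed out, desaturated"
#            weight: 0.1
#            shuffle: true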


        # anchors are prompts that we will try to hold on to while training the slider
        # these are NOT necessary and can prevent the slider from converging if not done right
        # leave them off if you are having issues, but they can help lock the network
        # on certain concepts to help prevent catastrophic forgetting
        # you want these to generate an image that is not your target_class; close to it
        # is fine as long as it does not directly overlap it.
        # For example, if you are training on a person smiling,
        # you could use "a person with a face mask" as an anchor. It is a person, and the image is the same
        # regardless of whether they are smiling or not. However, the closer the concept is to the target_class,
        # the lower the multiplier needs to be. Keep multipliers less than 1.0 for anchors usually;
        # for close concepts, you want to be closer to 0.1 or 0.2
        # these will slow down training. I am leaving them off for the demo

#        anchors:
#          - prompt: "a woman"
#            neg_prompt: "animal"
#            # the multiplier applied to the LoRA when this anchor is run.
#            # higher will give it more weight but also helps keep the LoRA from collapsing
#            multiplier: 1.0
#          - prompt: "a man"
#            neg_prompt: "animal"
#            multiplier: 1.0
#          - prompt: "a person"
#            neg_prompt: "animal"
#            multiplier: 1.0

# You can put any information you want here, and it will be saved in the model.
# The below is an example, but you can put your grocery list in it if you want.
# It is saved in the model so be aware of that. The software will include this
# plus some other information for you automatically
meta:
  # [name] gets replaced with the name above
  name: "[name]"
#  version: '1.0'
#  creator:
#    name: Your Name
#    email: [email protected]
#    website: https://your.website