mskrt committed
Commit 7698974 (verified)
1 parent: f828a58

Upload 18 files

model_index.json ADDED
@@ -0,0 +1,28 @@
+{
+  "_class_name": "SuperDiffSDXLPipeline",
+  "_diffusers_version": "0.31.0",
+  "text_encoder": [
+    "transformers",
+    "CLIPTextModel"
+  ],
+  "text_encoder_2": [
+    "transformers",
+    "CLIPTextModelWithProjection"
+  ],
+  "tokenizer": [
+    "transformers",
+    "CLIPTokenizer"
+  ],
+  "tokenizer_2": [
+    "transformers",
+    "CLIPTokenizer"
+  ],
+  "unet": [
+    "diffusers",
+    "UNet2DConditionModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKL"
+  ]
+}
pipeline.py ADDED
@@ -0,0 +1,417 @@
+import random
+from typing import Callable, Dict, List, Optional
+
+import torch
+from diffusers import DiffusionPipeline
+from diffusers.configuration_utils import ConfigMixin
+from tqdm import tqdm
+
+# from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+# from diffusers import AutoencoderKL, UNet2DConditionModel
+
+
+def get_scaled_coeffs():
+    """get_scaled_coeffs."""
+    beta_min = 0.85
+    beta_max = 12.0
+    return beta_min**0.5, beta_max**0.5 - beta_min**0.5
+
+
+def beta(t):
+    """beta.
+
+    Parameters
+    ----------
+    t :
+        t
+    """
+    a, b = get_scaled_coeffs()
+    return (a + t * b) ** 2
+
+
+def int_beta(t):
+    """int_beta.
+
+    Parameters
+    ----------
+    t :
+        t
+    """
+    a, b = get_scaled_coeffs()
+    return ((a + b * t) ** 3 - a**3) / (3 * b)
+
+
+def sigma(t):
+    """sigma.
+
+    Parameters
+    ----------
+    t :
+        t
+    """
+    return torch.expm1(int_beta(t)) ** 0.5
+
+
+def sigma_orig(t):
+    """sigma_orig.
+
+    Parameters
+    ----------
+    t :
+        t
+    """
+    return (-torch.expm1(-int_beta(t))) ** 0.5
+
+
+class SuperDiffSDXLPipeline(DiffusionPipeline, ConfigMixin):
+    """SuperDiffSDXLPipeline."""
+
+    def __init__(self, unet: Callable, vae: Callable, text_encoder: Callable, text_encoder_2: Callable, tokenizer: Callable, tokenizer_2: Callable) -> None:
+        """__init__.
+
+        Parameters
+        ----------
+        unet : Callable
+            unet
+        vae : Callable
+            vae
+        text_encoder : Callable
+            text_encoder
+        text_encoder_2 : Callable
+            text_encoder_2
+        tokenizer : Callable
+            tokenizer
+        tokenizer_2 : Callable
+            tokenizer_2
+
+        Returns
+        -------
+        None
+
+        """
+        super().__init__()
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        vae.to(device)
+        unet.to(device)
+        text_encoder.to(device)
+        text_encoder_2.to(device)
+
+        self.register_modules(
+            unet=unet,
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+        )
+
+    def prepare_prompt_input(self, prompt_o, prompt_b, batch_size, height, width):
+        """prepare_prompt_input.
+
+        Parameters
+        ----------
+        prompt_o :
+            prompt_o
+        prompt_b :
+            prompt_b
+        batch_size :
+            batch_size
+        height :
+            height
+        width :
+            width
+        """
+        # Wrap each prompt in a list so the tokenizers produce a batch of size `batch_size`.
+        text_input = self.tokenizer([prompt_o] * batch_size, padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, return_tensors="pt")
+        text_input_2 = self.tokenizer_2([prompt_o] * batch_size, padding="max_length", max_length=self.tokenizer_2.model_max_length, truncation=True, return_tensors="pt")
+        with torch.no_grad():
+            text_embeddings = self.text_encoder(text_input.input_ids.to(self.device), output_hidden_states=True)
+            text_embeddings_2 = self.text_encoder_2(text_input_2.input_ids.to(self.device), output_hidden_states=True)
+            prompt_embeds_o = torch.concat((text_embeddings.hidden_states[-2], text_embeddings_2.hidden_states[-2]), dim=-1)
+            pooled_prompt_embeds_o = text_embeddings_2[0]
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds_o)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds_o)
+
+        text_input = self.tokenizer([prompt_b] * batch_size, padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, return_tensors="pt")
+        text_input_2 = self.tokenizer_2([prompt_b] * batch_size, padding="max_length", max_length=self.tokenizer_2.model_max_length, truncation=True, return_tensors="pt")
+        with torch.no_grad():
+            text_embeddings = self.text_encoder(text_input.input_ids.to(self.device), output_hidden_states=True)
+            text_embeddings_2 = self.text_encoder_2(text_input_2.input_ids.to(self.device), output_hidden_states=True)
+            prompt_embeds_b = torch.concat((text_embeddings.hidden_states[-2], text_embeddings_2.hidden_states[-2]), dim=-1)
+            pooled_prompt_embeds_b = text_embeddings_2[0]
+        # SDXL micro-conditioning ids: (original height, original width, crop top, crop left, target height, target width).
+        add_time_ids_o = torch.tensor([(height, width, 0, 0, height, width)])
+        add_time_ids_b = torch.tensor([(height, width, 0, 0, height, width)])
+        negative_add_time_ids = torch.tensor([(height, width, 0, 0, height, width)])
+        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds_o, prompt_embeds_b], dim=0)
+        add_text_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds_o, pooled_prompt_embeds_b], dim=0)
+        add_time_ids = torch.cat([negative_add_time_ids, add_time_ids_o, add_time_ids_b], dim=0)
+
+        prompt_embeds = prompt_embeds.to(self.device)
+        add_text_embeds = add_text_embeds.to(self.device)
+        add_time_ids = add_time_ids.to(self.device).repeat(batch_size, 1)
+        added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+        return prompt_embeds, added_cond_kwargs
+
+    @torch.no_grad()
+    def get_batch(self, latents: Callable, nrow: int, ncol: int) -> Callable:
+        """get_batch.
+
+        Parameters
+        ----------
+        latents : Callable
+            latents
+        nrow : int
+            nrow
+        ncol : int
+            ncol
+
+        Returns
+        -------
+        Callable
+
+        """
+        image = self.vae.decode(
+            latents / self.vae.config.scaling_factor, return_dict=False
+        )[0]
+        image = (image / 2 + 0.5).clamp(0, 1).squeeze()
+        if len(image.shape) < 4:
+            image = image.unsqueeze(0)
+        image = (image.permute(0, 2, 3, 1) * 255).to(torch.uint8)
+        return image
+
+    @torch.no_grad()
+    def get_text_embedding(self, prompt: str) -> Callable:
+        """get_text_embedding.
+
+        Parameters
+        ----------
+        prompt : str
+            prompt
+
+        Returns
+        -------
+        Callable
+
+        """
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        return self.text_encoder(text_input.input_ids.to(self.device))[0]
+
+    @torch.no_grad()
+    def get_vel(self, t: float, sigma: float, latents: Callable, embeddings: Callable):
+        """get_vel.
+
+        Parameters
+        ----------
+        t : float
+            t
+        sigma : float
+            sigma
+        latents : Callable
+            latents
+        embeddings : Callable
+            embeddings
+        """
+
+        def v(_x, _e):
+            # UNet prediction for the rescaled latents `_x` conditioned on text embeddings `_e`.
+            return self.unet(
+                _x / ((sigma**2 + 1) ** 0.5), t, encoder_hidden_states=_e
+            ).sample
+
+        embeds = torch.cat(embeddings)
+        latent_input = latents
+        vel = v(latent_input, embeds)
+        return vel
+
+    def preprocess(
+        self,
+        prompt_1: str,
+        prompt_2: str,
+        seed: Optional[int] = None,
+        num_inference_steps: int = 200,
+        batch_size: int = 1,
+        height: int = 1024,
+        width: int = 1024,
+        guidance_scale: float = 7.5,
+    ) -> Callable:
+        """preprocess.
+
+        Parameters
+        ----------
+        prompt_1 : str
+            prompt_1
+        prompt_2 : str
+            prompt_2
+        seed : int
+            seed
+        num_inference_steps : int
+            num_inference_steps
+        batch_size : int
+            batch_size
+        height : int
+            height
+        width : int
+            width
+        guidance_scale : float
+            guidance_scale
+
+        Returns
+        -------
+        Callable
+
+        """
+        # Store the sampling settings on the pipeline.
+        self.batch_size = batch_size
+        self.num_inference_steps = num_inference_steps
+        self.guidance_scale = guidance_scale
+        self.seed = seed
+        if self.seed is None:
+            self.seed = random.randint(0, 2**32 - 1)
+
+        # Seeded generator used for the initial latent noise and the per-step noise.
+        self.generator = torch.Generator(device=self.device).manual_seed(self.seed)
+
+        latents = torch.randn((batch_size, self.unet.config.in_channels, height // 8, width // 8), generator=self.generator, dtype=self.dtype, device=self.device)
+        prompt_embeds, added_cond_kwargs = self.prepare_prompt_input(prompt_1, prompt_2, batch_size, height, width)
+
+        return {
+            "latents": latents,
+            "prompt_embeds": prompt_embeds,
+            "added_cond_kwargs": added_cond_kwargs,
+        }
+
+    def _forward(self, model_inputs: Dict) -> Callable:
+        """_forward.
+
+        Parameters
+        ----------
+        model_inputs : Dict
+            model_inputs
+
+        Returns
+        -------
+        Callable
+
+        """
+        latents = model_inputs["latents"]
+        prompt_embeds = model_inputs["prompt_embeds"]
+        added_cond_kwargs = model_inputs["added_cond_kwargs"]
+
+        t = torch.tensor(1.0)
+        dt = 1.0 / self.num_inference_steps
+        train_number_steps = 1000
+        latents = latents * (sigma(t)**2 + 1)**0.5
+        with torch.no_grad():
+            for i in tqdm(range(self.num_inference_steps)):
+                # Three-way batch: unconditional, prompt_1 ("o"), and prompt_2 ("b") predictions.
+                latent_model_input = torch.cat([latents] * 3)
+                sigma_t = sigma(t)
+                dsigma = sigma(t - dt) - sigma_t
+                latent_model_input /= (sigma_t**2 + 1)**0.5
+                noise_pred = self.unet(latent_model_input, t * train_number_steps, encoder_hidden_states=prompt_embeds, added_cond_kwargs=added_cond_kwargs, return_dict=False)[0]
+
+                noise_pred_uncond, noise_pred_text_o, noise_pred_text_b = noise_pred.chunk(3)
+
+                # noise = torch.sqrt(2*torch.abs(dsigma)*sigma_t)*torch.randn_like(latents)
+                noise = torch.sqrt(2 * torch.abs(dsigma) * sigma_t) * torch.empty_like(latents, device=self.device).normal_(generator=self.generator)
+
+                # Superposition: solve for the per-sample weight kappa that balances the two prompts' updates.
+                dx_ind = 2 * dsigma * (noise_pred_uncond + self.guidance_scale * (noise_pred_text_b - noise_pred_uncond)) + noise
+                kappa = (torch.abs(dsigma) * (noise_pred_text_b - noise_pred_text_o) * (noise_pred_text_b + noise_pred_text_o)).sum((1, 2, 3)) - (dx_ind * (noise_pred_text_o - noise_pred_text_b)).sum((1, 2, 3))
+                kappa /= 2 * dsigma * self.guidance_scale * ((noise_pred_text_o - noise_pred_text_b) ** 2).sum((1, 2, 3))
+                noise_pred = noise_pred_uncond + self.guidance_scale * ((noise_pred_text_b - noise_pred_uncond) + kappa[:, None, None, None] * (noise_pred_text_o - noise_pred_text_b))
+
+                if i < self.num_inference_steps - 1:
+                    latents += 2 * dsigma * noise_pred + noise
+                else:
+                    latents += dsigma * noise_pred
+
+                t -= dt
+        return latents
+
+    def postprocess(self, latents: Callable) -> Callable:
+        """postprocess.
+
+        Parameters
+        ----------
+        latents : Callable
+            latents
+
+        Returns
+        -------
+        Callable
+
+        """
+        latents = latents / self.vae.config.scaling_factor
+        latents = latents.to(torch.float32)
+        with torch.no_grad():
+            image = self.vae.decode(latents, return_dict=False)[0]
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
+        images = (image * 255).round().astype("uint8")
+        return images
+
+    def __call__(
+        self,
+        prompt_1: str,
+        prompt_2: str,
+        seed: Optional[int] = None,
+        num_inference_steps: int = 200,
+        batch_size: int = 1,
+        height: int = 1024,
+        width: int = 1024,
+        guidance_scale: float = 7.5,
+    ) -> Callable:
+        """__call__.
+
+        Parameters
+        ----------
+        prompt_1 : str
+            prompt_1
+        prompt_2 : str
+            prompt_2
+        seed : int
+            seed
+        num_inference_steps : int
+            num_inference_steps
+        batch_size : int
+            batch_size
+        height : int
+            height
+        width : int
+            width
+        guidance_scale : float
+            guidance_scale
+
+        Returns
+        -------
+        Callable
+
+        """
+        # Preprocess inputs
+        model_inputs = self.preprocess(
+            prompt_1,
+            prompt_2,
+            seed,
+            num_inference_steps,
+            batch_size,
+            height,
+            width,
+            guidance_scale,
+        )
+
+        # Forward pass through the pipeline
+        latents = self._forward(model_inputs)
+
+        # Postprocess to generate the final output
+        images = self.postprocess(latents)
+        return images
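
Usage note: model_index.json above declares SuperDiffSDXLPipeline as a custom diffusers pipeline, so the repository can be loaded with DiffusionPipeline.from_pretrained by pointing custom_pipeline at the same repo (which contains pipeline.py). A minimal sketch; the repo id below is a placeholder, the prompts are illustrative only, and a CUDA GPU with enough memory for SDXL is assumed:

    import torch
    from diffusers import DiffusionPipeline

    repo_id = "mskrt/SuperDiffSDXL"  # placeholder repo id, not confirmed by this commit

    # pipeline.py provides SuperDiffSDXLPipeline; its __init__ moves the modules to GPU when available,
    # and the components keep the dtypes they were saved in (fp16 UNet/text encoders, fp32 VAE).
    pipe = DiffusionPipeline.from_pretrained(
        repo_id,
        custom_pipeline=repo_id,
    )

    # __call__ superposes the two prompts during sampling and returns uint8 images of shape (N, H, W, C).
    images = pipe(
        prompt_1="a photo of a lion",
        prompt_2="a photo of a tiger",
        seed=0,
        num_inference_steps=200,
        guidance_scale=7.5,
    )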
text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
+{
+  "_name_or_path": "stabilityai/stable-diffusion-xl-base-1.0",
+  "architectures": [
+    "CLIPTextModel"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "dropout": 0.0,
+  "eos_token_id": 2,
+  "hidden_act": "quick_gelu",
+  "hidden_size": 768,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 77,
+  "model_type": "clip_text_model",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "projection_dim": 768,
+  "torch_dtype": "float16",
+  "transformers_version": "4.46.2",
+  "vocab_size": 49408
+}
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:660c6f5b1abae9dc498ac2d21e1347d2abdb0cf6c0c0c8576cd796491d9a6cdd
+size 246144152
text_encoder_2/config.json ADDED
@@ -0,0 +1,25 @@
+{
+  "_name_or_path": "stabilityai/stable-diffusion-xl-base-1.0",
+  "architectures": [
+    "CLIPTextModelWithProjection"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "dropout": 0.0,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_size": 1280,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 5120,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 77,
+  "model_type": "clip_text_model",
+  "num_attention_heads": 20,
+  "num_hidden_layers": 32,
+  "pad_token_id": 1,
+  "projection_dim": 1280,
+  "torch_dtype": "float16",
+  "transformers_version": "4.46.2",
+  "vocab_size": 49408
+}
text_encoder_2/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec310df2af79c318e24d20511b601a591ca8cd4f1fce1d8dff822a356bcdb1f4
+size 1389382176
tokenizer/merges.txt ADDED
The diff for this file is too large to render.
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|endoftext|>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "49406": {
+      "content": "<|startoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49407": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|startoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "do_lower_case": true,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "model_max_length": 77,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "CLIPTokenizer",
+  "unk_token": "<|endoftext|>"
+}
tokenizer/vocab.json ADDED
The diff for this file is too large to render.
 
tokenizer_2/merges.txt ADDED
The diff for this file is too large to render.
 
tokenizer_2/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "!",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer_2/tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "!",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49406": {
+      "content": "<|startoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49407": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|startoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "do_lower_case": true,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "model_max_length": 77,
+  "pad_token": "!",
+  "tokenizer_class": "CLIPTokenizer",
+  "unk_token": "<|endoftext|>"
+}
tokenizer_2/vocab.json ADDED
The diff for this file is too large to render.
 
unet/config.json ADDED
@@ -0,0 +1,73 @@
+{
+  "_class_name": "UNet2DConditionModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "stabilityai/stable-diffusion-xl-base-1.0",
+  "act_fn": "silu",
+  "addition_embed_type": "text_time",
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": 256,
+  "attention_head_dim": [
+    5,
+    10,
+    20
+  ],
+  "attention_type": "default",
+  "block_out_channels": [
+    320,
+    640,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "conv_out_kernel": 3,
+  "cross_attention_dim": 2048,
+  "cross_attention_norm": null,
+  "down_block_types": [
+    "DownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossAttn",
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": 2816,
+  "resnet_out_scale_factor": 1.0,
+  "resnet_skip_time_act": false,
+  "resnet_time_scale_shift": "default",
+  "reverse_transformer_layers_per_block": null,
+  "sample_size": 128,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": [
+    1,
+    2,
+    10
+  ],
+  "up_block_types": [
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "UpBlock2D"
+  ],
+  "upcast_attention": null,
+  "use_linear_projection": true
+}
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83e012a805b84c7ca28e5646747c90a243c65c8ba4f070e2d7ddc9d74661e139
+size 5135149760
vae/config.json ADDED
@@ -0,0 +1,38 @@
+{
+  "_class_name": "AutoencoderKL",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "stabilityai/stable-diffusion-xl-base-1.0",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    512,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "force_upcast": true,
+  "in_channels": 3,
+  "latent_channels": 4,
+  "latents_mean": null,
+  "latents_std": null,
+  "layers_per_block": 2,
+  "mid_block_add_attention": true,
+  "norm_num_groups": 32,
+  "out_channels": 3,
+  "sample_size": 1024,
+  "scaling_factor": 0.13025,
+  "shift_factor": null,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ],
+  "use_post_quant_conv": true,
+  "use_quant_conv": true
+}
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1598f3d24932bcfe6634e8b618ea1e30ab1d57f5aad13a6d2de446d2199f2341
+size 334643268