prithivMLmods committed
Commit 897c523 (verified) · 1 parent: a16f6e8

Create transformer_flux.py

Files changed (1)
  1. transformer_flux.py +525 -0
transformer_flux.py ADDED
@@ -0,0 +1,525 @@
from typing import Any, Dict, List, Optional, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
from diffusers.models.attention import FeedForward
from diffusers.models.attention_processor import (
    Attention,
    FluxAttnProcessor2_0,
    FluxSingleAttnProcessor2_0,
)
from diffusers.models.modeling_utils import ModelMixin
from diffusers.models.normalization import (
    AdaLayerNormContinuous,
    AdaLayerNormZero,
    AdaLayerNormZeroSingle,
)
from diffusers.utils import (
    USE_PEFT_BACKEND,
    is_torch_version,
    logging,
    scale_lora_layers,
    unscale_lora_layers,
)
from diffusers.utils.torch_utils import maybe_allow_in_graph
from diffusers.models.embeddings import (
    CombinedTimestepGuidanceTextProjEmbeddings,
    CombinedTimestepTextProjEmbeddings,
)
from diffusers.models.modeling_outputs import Transformer2DModelOutput


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


# YiYi to-do: refactor rope related functions/classes
def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
    assert dim % 2 == 0, "The dimension must be even."

    scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
    omega = 1.0 / (theta**scale)

    batch_size, seq_length = pos.shape
    out = torch.einsum("...n,d->...nd", pos, omega)
    cos_out = torch.cos(out)
    sin_out = torch.sin(out)

    stacked_out = torch.stack([cos_out, -sin_out, sin_out, cos_out], dim=-1)
    out = stacked_out.view(batch_size, -1, dim // 2, 2, 2)
    return out.float()

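
# Illustrative shape check, not part of the original commit: for a position tensor of
# shape (batch, seq_len), `rope` returns per-position 2x2 rotation matrices with shape
# (batch, seq_len, dim // 2, 2, 2). The helper below is a hypothetical demo and is safe
# to remove.
def _demo_rope_shape():
    pos = torch.arange(8, dtype=torch.float64).unsqueeze(0)  # (1, 8) token positions
    freqs = rope(pos, dim=16, theta=10_000)
    assert freqs.shape == (1, 8, 8, 2, 2)
    return freqs
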


# YiYi to-do: refactor rope related functions/classes
class EmbedND(nn.Module):
    def __init__(self, dim: int, theta: int, axes_dim: List[int]):
        super().__init__()
        self.dim = dim
        self.theta = theta
        self.axes_dim = axes_dim

    def forward(self, ids: torch.Tensor) -> torch.Tensor:
        n_axes = ids.shape[-1]
        emb = torch.cat(
            [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
            dim=-3,
        )
        return emb.unsqueeze(1)

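
# Hypothetical usage sketch (not from the commit): with Flux's default configuration,
# axes_dim=[16, 56, 56] splits each 128-dim attention head across the three components
# of the position ids, and the per-axis frequency tables are concatenated along dim=-3.
def _demo_embed_nd():
    embedder = EmbedND(dim=128, theta=10_000, axes_dim=[16, 56, 56])
    ids = torch.zeros(1, 10, 3, dtype=torch.float64)  # (batch, tokens, n_axes) position ids
    freqs = embedder(ids)
    assert freqs.shape == (1, 1, 10, 64, 2, 2)  # 64 == sum(axes_dim) // 2
    return freqs
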


@maybe_allow_in_graph
class FluxSingleTransformerBlock(nn.Module):
    r"""
    A single-stream Transformer block used in Flux, running attention and the MLP in
    parallel with MMDiT-style modulation as introduced in Stable Diffusion 3.

    Reference: https://arxiv.org/abs/2403.03206

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        mlp_ratio (`float`, defaults to 4.0): The ratio of the MLP hidden dimension to `dim`.
    """

    def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
        super().__init__()
        self.mlp_hidden_dim = int(dim * mlp_ratio)

        self.norm = AdaLayerNormZeroSingle(dim)
        self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
        self.act_mlp = nn.GELU(approximate="tanh")
        self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)

        processor = FluxSingleAttnProcessor2_0()
        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            bias=True,
            processor=processor,
            qk_norm="rms_norm",
            eps=1e-6,
            pre_only=True,
        )

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        temb: torch.FloatTensor,
        image_rotary_emb=None,
    ):
        residual = hidden_states
        norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
        mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))

        attn_output = self.attn(
            hidden_states=norm_hidden_states,
            image_rotary_emb=image_rotary_emb,
        )

        # Parallel attention + MLP: concatenate both branches and project back to `dim`.
        hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
        gate = gate.unsqueeze(1)
        hidden_states = gate * self.proj_out(hidden_states)
        hidden_states = residual + hidden_states
        if hidden_states.dtype == torch.float16:
            # guard against fp16 overflow
            hidden_states = hidden_states.clip(-65504, 65504)

        return hidden_states

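
# Minimal smoke test for the single-stream block (illustrative, and assumes a diffusers
# release, circa 0.30, that still ships FluxSingleAttnProcessor2_0). Passing
# image_rotary_emb=None simply skips rotary embedding inside the attention processor.
def _demo_single_block():
    block = FluxSingleTransformerBlock(dim=64, num_attention_heads=4, attention_head_dim=16)
    hidden_states = torch.randn(2, 32, 64)  # (batch, tokens, dim)
    temb = torch.randn(2, 64)               # pooled time/text conditioning vector
    out = block(hidden_states, temb)
    assert out.shape == (2, 32, 64)         # the block is shape-preserving
    return out
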


@maybe_allow_in_graph
class FluxTransformerBlock(nn.Module):
    r"""
    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.

    Reference: https://arxiv.org/abs/2403.03206

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        qk_norm (`str`, defaults to `"rms_norm"`): The normalization applied to the query and key projections.
        eps (`float`, defaults to 1e-6): The epsilon used by the normalization layers.
    """

    def __init__(
        self, dim, num_attention_heads, attention_head_dim, qk_norm="rms_norm", eps=1e-6
    ):
        super().__init__()

        self.norm1 = AdaLayerNormZero(dim)

        self.norm1_context = AdaLayerNormZero(dim)

        if hasattr(F, "scaled_dot_product_attention"):
            processor = FluxAttnProcessor2_0()
        else:
            raise ValueError(
                "The current PyTorch version does not support the `scaled_dot_product_attention` function."
            )
        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            added_kv_proj_dim=dim,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            context_pre_only=False,
            bias=True,
            processor=processor,
            qk_norm=qk_norm,
            eps=eps,
        )

        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

        self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff_context = FeedForward(
            dim=dim, dim_out=dim, activation_fn="gelu-approximate"
        )

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor,
        temb: torch.FloatTensor,
        image_rotary_emb=None,
    ):
        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
            hidden_states, emb=temb
        )

        (
            norm_encoder_hidden_states,
            c_gate_msa,
            c_shift_mlp,
            c_scale_mlp,
            c_gate_mlp,
        ) = self.norm1_context(encoder_hidden_states, emb=temb)

        # Attention.
        attn_output, context_attn_output = self.attn(
            hidden_states=norm_hidden_states,
            encoder_hidden_states=norm_encoder_hidden_states,
            image_rotary_emb=image_rotary_emb,
        )

        # Process attention outputs for the `hidden_states`.
        attn_output = gate_msa.unsqueeze(1) * attn_output
        hidden_states = hidden_states + attn_output

        norm_hidden_states = self.norm2(hidden_states)
        norm_hidden_states = (
            norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
        )

        ff_output = self.ff(norm_hidden_states)
        ff_output = gate_mlp.unsqueeze(1) * ff_output

        hidden_states = hidden_states + ff_output

        # Process attention outputs for the `encoder_hidden_states`.
        context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
        encoder_hidden_states = encoder_hidden_states + context_attn_output

        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
        norm_encoder_hidden_states = (
            norm_encoder_hidden_states * (1 + c_scale_mlp[:, None])
            + c_shift_mlp[:, None]
        )

        context_ff_output = self.ff_context(norm_encoder_hidden_states)
        encoder_hidden_states = (
            encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
        )
        if encoder_hidden_states.dtype == torch.float16:
            # guard against fp16 overflow
            encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)

        return encoder_hidden_states, hidden_states

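
# Minimal smoke test for the dual-stream (MMDiT) block: image and text tokens are
# modulated separately, attended jointly, and returned as two separate streams.
# Illustrative only; RoPE is skipped by leaving image_rotary_emb at its default None.
def _demo_dual_block():
    block = FluxTransformerBlock(dim=64, num_attention_heads=4, attention_head_dim=16)
    image_tokens = torch.randn(2, 32, 64)  # (batch, image tokens, dim)
    text_tokens = torch.randn(2, 8, 64)    # (batch, text tokens, dim)
    temb = torch.randn(2, 64)
    encoder_out, hidden_out = block(image_tokens, text_tokens, temb)
    assert encoder_out.shape == (2, 8, 64) and hidden_out.shape == (2, 32, 64)
    return encoder_out, hidden_out
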


class FluxTransformer2DModel(
    ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin
):
    """
    The Transformer model introduced in Flux.

    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/

    Parameters:
        patch_size (`int`): Patch size to turn the input data into small patches.
        in_channels (`int`, *optional*, defaults to 64): The number of channels in the input.
        num_layers (`int`, *optional*, defaults to 19): The number of layers of MMDiT blocks to use.
        num_single_layers (`int`, *optional*, defaults to 38): The number of layers of single DiT blocks to use.
        attention_head_dim (`int`, *optional*, defaults to 128): The number of channels in each head.
        num_attention_heads (`int`, *optional*, defaults to 24): The number of heads to use for multi-head attention.
        joint_attention_dim (`int`, *optional*, defaults to 4096): The number of `encoder_hidden_states` dimensions to use.
        pooled_projection_dim (`int`, *optional*, defaults to 768): Number of dimensions to use when projecting the `pooled_projections`.
        guidance_embeds (`bool`, defaults to False): Whether to use guidance embeddings.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        patch_size: int = 1,
        in_channels: int = 64,
        num_layers: int = 19,
        num_single_layers: int = 38,
        attention_head_dim: int = 128,
        num_attention_heads: int = 24,
        joint_attention_dim: int = 4096,
        pooled_projection_dim: int = 768,
        guidance_embeds: bool = False,
        axes_dims_rope: List[int] = [16, 56, 56],
    ):
        super().__init__()
        self.out_channels = in_channels
        self.inner_dim = (
            self.config.num_attention_heads * self.config.attention_head_dim
        )

        self.pos_embed = EmbedND(
            dim=self.inner_dim, theta=10000, axes_dim=axes_dims_rope
        )
        text_time_guidance_cls = (
            CombinedTimestepGuidanceTextProjEmbeddings
            if guidance_embeds
            else CombinedTimestepTextProjEmbeddings
        )
        self.time_text_embed = text_time_guidance_cls(
            embedding_dim=self.inner_dim,
            pooled_projection_dim=self.config.pooled_projection_dim,
        )

        self.context_embedder = nn.Linear(
            self.config.joint_attention_dim, self.inner_dim
        )
        self.x_embedder = torch.nn.Linear(self.config.in_channels, self.inner_dim)

        self.transformer_blocks = nn.ModuleList(
            [
                FluxTransformerBlock(
                    dim=self.inner_dim,
                    num_attention_heads=self.config.num_attention_heads,
                    attention_head_dim=self.config.attention_head_dim,
                )
                for i in range(self.config.num_layers)
            ]
        )

        self.single_transformer_blocks = nn.ModuleList(
            [
                FluxSingleTransformerBlock(
                    dim=self.inner_dim,
                    num_attention_heads=self.config.num_attention_heads,
                    attention_head_dim=self.config.attention_head_dim,
                )
                for i in range(self.config.num_single_layers)
            ]
        )

        self.norm_out = AdaLayerNormContinuous(
            self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6
        )
        self.proj_out = nn.Linear(
            self.inner_dim, patch_size * patch_size * self.out_channels, bias=True
        )

        self.gradient_checkpointing = False

    def _set_gradient_checkpointing(self, module, value=False):
        if hasattr(module, "gradient_checkpointing"):
            module.gradient_checkpointing = value

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor = None,
        pooled_projections: torch.Tensor = None,
        timestep: torch.LongTensor = None,
        img_ids: torch.Tensor = None,
        txt_ids: torch.Tensor = None,
        guidance: torch.Tensor = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        controlnet_block_samples=None,
        controlnet_single_block_samples=None,
        return_dict: bool = True,
    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
        """
        The [`FluxTransformer2DModel`] forward method.

        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, image_sequence_length, in_channels)`):
                Input `hidden_states` (packed image latents).
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
                from the embeddings of input conditions.
            timestep (`torch.LongTensor`):
                Used to indicate denoising step.
            controlnet_block_samples (`list` of `torch.Tensor`, *optional*):
                A list of tensors that if specified are added to the residuals of the MMDiT transformer blocks.
            controlnet_single_block_samples (`list` of `torch.Tensor`, *optional*):
                A list of tensors that if specified are added to the residuals of the single transformer blocks.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.

        Returns:
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        """
        if joint_attention_kwargs is not None:
            joint_attention_kwargs = joint_attention_kwargs.copy()
            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if (
                joint_attention_kwargs is not None
                and joint_attention_kwargs.get("scale", None) is not None
            ):
                logger.warning(
                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
                )
        hidden_states = self.x_embedder(hidden_states)

        timestep = timestep.to(hidden_states.dtype) * 1000
        if guidance is not None:
            guidance = guidance.to(hidden_states.dtype) * 1000
        else:
            guidance = None
        temb = (
            self.time_text_embed(timestep, pooled_projections)
            if guidance is None
            else self.time_text_embed(timestep, guidance, pooled_projections)
        )
        encoder_hidden_states = self.context_embedder(encoder_hidden_states)

        # concatenate text and image position ids and compute the joint rotary embeddings
        txt_ids = txt_ids.expand(img_ids.size(0), -1, -1)
        ids = torch.cat((txt_ids, img_ids), dim=1)
        image_rotary_emb = self.pos_embed(ids)

        for index_block, block in enumerate(self.transformer_blocks):
            if self.training and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = (
                    {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                )
                (
                    encoder_hidden_states,
                    hidden_states,
                ) = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    encoder_hidden_states,
                    temb,
                    image_rotary_emb,
                    **ckpt_kwargs,
                )

            else:
                encoder_hidden_states, hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
                )

            # controlnet residual
            if controlnet_block_samples is not None:
                interval_control = len(self.transformer_blocks) / len(
                    controlnet_block_samples
                )
                interval_control = int(np.ceil(interval_control))
                hidden_states = (
                    hidden_states
                    + controlnet_block_samples[index_block // interval_control]
                )

        # merge the text and image streams for the single-transformer blocks
        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

        for index_block, block in enumerate(self.single_transformer_blocks):
            if self.training and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = (
                    {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                )
                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    temb,
                    image_rotary_emb,
                    **ckpt_kwargs,
                )

            else:
                hidden_states = block(
                    hidden_states=hidden_states,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
                )

            # controlnet residual
            if controlnet_single_block_samples is not None:
                interval_control = len(self.single_transformer_blocks) / len(
                    controlnet_single_block_samples
                )
                interval_control = int(np.ceil(interval_control))
                hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
                    hidden_states[:, encoder_hidden_states.shape[1] :, ...]
                    + controlnet_single_block_samples[index_block // interval_control]
                )

        # drop the text tokens, keeping only the image stream
        hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]

        hidden_states = self.norm_out(hidden_states, temb)
        output = self.proj_out(hidden_states)

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,)

        return Transformer2DModelOutput(sample=output)
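

# Hypothetical end-to-end smoke test, not part of the original commit. It builds a tiny,
# randomly initialized FluxTransformer2DModel (the config values below are illustrative
# and chosen only so that sum(axes_dims_rope) == attention_head_dim, as the rotary
# embedding requires) and runs one forward pass with dummy inputs. Assumes a diffusers
# release, circa 0.30, whose Attention/processor classes match the imports above.
if __name__ == "__main__":
    model = FluxTransformer2DModel(
        patch_size=1,
        in_channels=16,
        num_layers=1,
        num_single_layers=1,
        attention_head_dim=8,
        num_attention_heads=2,
        joint_attention_dim=32,
        pooled_projection_dim=32,
        guidance_embeds=False,
        axes_dims_rope=[2, 2, 4],
    )
    batch, img_seq, txt_seq = 1, 64, 8
    out = model(
        hidden_states=torch.randn(batch, img_seq, 16),          # packed image latents
        encoder_hidden_states=torch.randn(batch, txt_seq, 32),  # sequence text embeddings
        pooled_projections=torch.randn(batch, 32),              # pooled text embedding
        timestep=torch.tensor([1.0]),                           # denoising timestep
        img_ids=torch.zeros(batch, img_seq, 3, dtype=torch.float64),  # image position ids
        txt_ids=torch.zeros(batch, txt_seq, 3, dtype=torch.float64),  # text position ids
    )
    # Expected: (batch, img_seq, patch_size**2 * in_channels) == (1, 64, 16)
    print(out.sample.shape)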