# Code from https://github.com/ali-vilab/TeaCache/blob/main/TeaCache4TangoFlux/teacache_tango_flux.py
from typing import Any, Dict, Optional, Union

import numpy as np
import torch
from diffusers.models.modeling_outputs import Transformer2DModelOutput
from diffusers.utils import (
    USE_PEFT_BACKEND,
    is_torch_version,
    logging,
    scale_lora_layers,
    unscale_lora_layers,
)

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
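

# TeaCache-enabled replacement for `FluxTransformer2DModel.forward`. It is meant
# to be monkey-patched over the original method and expects the caller to set
# these attributes on the transformer beforehand: `enable_teacache`, `cnt`,
# `num_steps`, `rel_l1_thresh`, `accumulated_rel_l1_distance`,
# `previous_modulated_input`, and `previous_residual` (see the setup sketch at
# the bottom of this file).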
def teacache_forward(
    self,
    hidden_states: torch.Tensor,
    encoder_hidden_states: torch.Tensor = None,
    pooled_projections: torch.Tensor = None,
    timestep: torch.LongTensor = None,
    img_ids: torch.Tensor = None,
    txt_ids: torch.Tensor = None,
    guidance: torch.Tensor = None,
    joint_attention_kwargs: Optional[Dict[str, Any]] = None,
    return_dict: bool = True,
) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
""" | |
The [`FluxTransformer2DModel`] forward method. | |
Args: | |
hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`): | |
Input `hidden_states`. | |
encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`): | |
Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. | |
pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected | |
from the embeddings of input conditions. | |
timestep ( `torch.LongTensor`): | |
Used to indicate denoising step. | |
block_controlnet_hidden_states: (`list` of `torch.Tensor`): | |
A list of tensors that if specified are added to the residuals of transformer blocks. | |
joint_attention_kwargs (`dict`, *optional*): | |
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under | |
`self.processor` in | |
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). | |
return_dict (`bool`, *optional*, defaults to `True`): | |
Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain | |
tuple. | |
Returns: | |
If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a | |
`tuple` where the first element is the sample tensor. | |
""" | |
    if joint_attention_kwargs is not None:
        joint_attention_kwargs = joint_attention_kwargs.copy()
        lora_scale = joint_attention_kwargs.pop("scale", 1.0)
    else:
        lora_scale = 1.0
    if USE_PEFT_BACKEND:
        # weight the lora layers by setting `lora_scale` for each PEFT layer
        scale_lora_layers(self, lora_scale)
    else:
        if (
            joint_attention_kwargs is not None
            and joint_attention_kwargs.get("scale", None) is not None
        ):
            logger.warning(
                "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
            )
    hidden_states = self.x_embedder(hidden_states)
    # timesteps arrive in [0, 1]; the time embedding expects the [0, 1000] range
    timestep = timestep.to(hidden_states.dtype) * 1000
    if guidance is not None:
        guidance = guidance.to(hidden_states.dtype) * 1000
    else:
        guidance = None
    temb = (
        self.time_text_embed(timestep, pooled_projections)
        if guidance is None
        else self.time_text_embed(timestep, guidance, pooled_projections)
    )
    encoder_hidden_states = self.context_embedder(encoder_hidden_states)
    # rotary position embeddings are computed over the joint text+latent ids
    ids = torch.cat((txt_ids, img_ids), dim=1)
    image_rotary_emb = self.pos_embed(ids)
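
    # TeaCache: cheaply predict how much this step's output will differ from the
    # cached one using only the first block's modulated input. The relative L1
    # change is rescaled by a fitted polynomial and accumulated across steps; the
    # full stack runs only when the accumulated value reaches `rel_l1_thresh`,
    # otherwise the cached residual is reused.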
    if self.enable_teacache:
        inp = hidden_states.clone()
        temb_ = temb.clone()
        modulated_inp, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.transformer_blocks[0].norm1(inp, emb=temb_)
        )
        if self.cnt == 0 or self.cnt == self.num_steps - 1:
            # always run the full model on the first and last denoising steps
            should_calc = True
            self.accumulated_rel_l1_distance = 0
        else:
            coefficients = [
                4.98651651e02,
                -2.83781631e02,
                5.58554382e01,
                -3.82021401e00,
                2.64230861e-01,
            ]
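            # polynomial in the raw relative L1 change, highest power first as
            # `np.poly1d` expects; fitted by the TeaCache authors to predict the
            # corresponding change in the model output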
            rescale_func = np.poly1d(coefficients)
            self.accumulated_rel_l1_distance += rescale_func(
                (
                    (modulated_inp - self.previous_modulated_input).abs().mean()
                    / self.previous_modulated_input.abs().mean()
                )
                .cpu()
                .item()
            )
            if self.accumulated_rel_l1_distance < self.rel_l1_thresh:
                should_calc = False
            else:
                should_calc = True
                self.accumulated_rel_l1_distance = 0
        self.previous_modulated_input = modulated_inp
        self.cnt += 1
        # reset the step counter once a full generation has finished
        if self.cnt == self.num_steps:
            self.cnt = 0
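
    # either reuse the cached residual (skip this step's blocks entirely) or run
    # the full transformer and refresh the cache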
    if self.enable_teacache:
        if not should_calc:
            # cheap path: reuse the residual cached from the last full forward
            hidden_states += self.previous_residual
        else:
            # keep the block input so the new residual can be cached afterwards
            ori_hidden_states = hidden_states.clone()
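            # full forward: dual-stream blocks update the text and latent token
            # streams jointly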
            for index_block, block in enumerate(self.transformer_blocks):
                if self.training and self.gradient_checkpointing:

                    def create_custom_forward(module, return_dict=None):
                        def custom_forward(*inputs):
                            if return_dict is not None:
                                return module(*inputs, return_dict=return_dict)
                            else:
                                return module(*inputs)

                        return custom_forward

                    ckpt_kwargs: Dict[str, Any] = (
                        {"use_reentrant": False}
                        if is_torch_version(">=", "1.11.0")
                        else {}
                    )
                    encoder_hidden_states, hidden_states = (
                        torch.utils.checkpoint.checkpoint(
                            create_custom_forward(block),
                            hidden_states,
                            encoder_hidden_states,
                            temb,
                            image_rotary_emb,
                            **ckpt_kwargs,
                        )
                    )
                else:
                    encoder_hidden_states, hidden_states = block(
                        hidden_states=hidden_states,
                        encoder_hidden_states=encoder_hidden_states,
                        temb=temb,
                        image_rotary_emb=image_rotary_emb,
                    )
            hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
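            # text and latent streams are now merged; single-stream blocks
            # process the joint sequence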
            for index_block, block in enumerate(self.single_transformer_blocks):
                if self.training and self.gradient_checkpointing:

                    def create_custom_forward(module, return_dict=None):
                        def custom_forward(*inputs):
                            if return_dict is not None:
                                return module(*inputs, return_dict=return_dict)
                            else:
                                return module(*inputs)

                        return custom_forward

                    ckpt_kwargs: Dict[str, Any] = (
                        {"use_reentrant": False}
                        if is_torch_version(">=", "1.11.0")
                        else {}
                    )
                    hidden_states = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(block),
                        hidden_states,
                        temb,
                        image_rotary_emb,
                        **ckpt_kwargs,
                    )
                else:
                    hidden_states = block(
                        hidden_states=hidden_states,
                        temb=temb,
                        image_rotary_emb=image_rotary_emb,
                    )
            # drop the text tokens, keeping only the latent stream
            hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
            # cache the residual so upcoming skipped steps can reuse it
            self.previous_residual = hidden_states - ori_hidden_states
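    # reference path: the same block stack without any caching, used when
    # TeaCache is disabled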
    else:
        for index_block, block in enumerate(self.transformer_blocks):
            if self.training and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = (
                    {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                )
                encoder_hidden_states, hidden_states = (
                    torch.utils.checkpoint.checkpoint(
                        create_custom_forward(block),
                        hidden_states,
                        encoder_hidden_states,
                        temb,
                        image_rotary_emb,
                        **ckpt_kwargs,
                    )
                )
            else:
                encoder_hidden_states, hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
                )
        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
        for index_block, block in enumerate(self.single_transformer_blocks):
            if self.training and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = (
                    {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                )
                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    temb,
                    image_rotary_emb,
                    **ckpt_kwargs,
                )
            else:
                hidden_states = block(
                    hidden_states=hidden_states,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
                )
        hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
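    # final adaptive layer norm and linear projection to the output channels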
    hidden_states = self.norm_out(hidden_states, temb)
    output = self.proj_out(hidden_states)
    if USE_PEFT_BACKEND:
        # remove `lora_scale` from each PEFT layer
        unscale_lora_layers(self, lora_scale)
    if not return_dict:
        return (output,)
    return Transformer2DModelOutput(sample=output)
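

# Minimal setup sketch (illustrative; not part of the upstream TeaCache file).
# It assumes `transformer` is a diffusers `FluxTransformer2DModel` instance and
# that the surrounding pipeline calls it once per denoising step; the helper
# name and the default threshold below are our own choices, not upstream API.
def apply_teacache(transformer, num_steps: int, rel_l1_thresh: float = 0.25) -> None:
    """Bind `teacache_forward` to `transformer` and initialize TeaCache state.

    `rel_l1_thresh` trades speed for fidelity: larger values let more steps be
    skipped at some cost in output quality.
    """
    transformer.forward = teacache_forward.__get__(transformer)  # bind as a method
    transformer.enable_teacache = True
    transformer.cnt = 0  # index of the current denoising step
    transformer.num_steps = num_steps  # total denoising steps per generation
    transformer.rel_l1_thresh = rel_l1_thresh
    transformer.accumulated_rel_l1_distance = 0
    transformer.previous_modulated_input = None
    transformer.previous_residual = None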