Spaces:

ramimu
/

LoRa_Streamlit

Paused

File size: 67,419 Bytes

1c72248

import math
import torch
import sys

from PIL import Image
from torch.nn import Parameter
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection, T5EncoderModel, CLIPTextModel, \
    CLIPTokenizer, T5Tokenizer

from toolkit.data_transfer_object.data_loader import DataLoaderBatchDTO
from toolkit.models.clip_fusion import CLIPFusionModule
from toolkit.models.clip_pre_processor import CLIPImagePreProcessor
from toolkit.models.control_lora_adapter import ControlLoraAdapter
from toolkit.models.i2v_adapter import I2VAdapter
from toolkit.models.subpixel_adapter import SubpixelAdapter
from toolkit.models.ilora import InstantLoRAModule
from toolkit.models.single_value_adapter import SingleValueAdapter
from toolkit.models.te_adapter import TEAdapter
from toolkit.models.te_aug_adapter import TEAugAdapter
from toolkit.models.vd_adapter import VisionDirectAdapter
from toolkit.models.redux import ReduxImageEncoder
from toolkit.photomaker import PhotoMakerIDEncoder, FuseModule, PhotoMakerCLIPEncoder
from toolkit.saving import load_ip_adapter_model, load_custom_adapter_model
from toolkit.train_tools import get_torch_dtype
from toolkit.models.pixtral_vision import PixtralVisionEncoderCompatible, PixtralVisionImagePreprocessorCompatible
import random
from toolkit.util.mask import generate_random_mask
from typing import TYPE_CHECKING, Union, Iterator, Mapping, Any, Tuple, List, Optional, Dict
from collections import OrderedDict
from toolkit.config_modules import AdapterConfig, AdapterTypes, TrainConfig
from toolkit.prompt_utils import PromptEmbeds
import weakref

if TYPE_CHECKING:
    from toolkit.stable_diffusion_model import StableDiffusion

from transformers import (
    CLIPImageProcessor,
    CLIPVisionModelWithProjection,
    CLIPVisionModel,
    AutoImageProcessor,
    ConvNextModel,
    ConvNextForImageClassification,
    ConvNextImageProcessor,
    UMT5EncoderModel, LlamaTokenizerFast, AutoModel, AutoTokenizer, BitsAndBytesConfig
)
from toolkit.models.size_agnostic_feature_encoder import SAFEImageProcessor, SAFEVisionModel

from transformers import ViTHybridImageProcessor, ViTHybridForImageClassification

from transformers import ViTFeatureExtractor, ViTForImageClassification

from toolkit.models.llm_adapter import LLMAdapter

import torch.nn.functional as F


class CustomAdapter(torch.nn.Module):
    def __init__(self, sd: 'StableDiffusion', adapter_config: 'AdapterConfig', train_config: 'TrainConfig'):
        super().__init__()
        self.config = adapter_config
        self.sd_ref: weakref.ref = weakref.ref(sd)
        self.train_config = train_config
        self.device = self.sd_ref().unet.device
        self.image_processor: CLIPImageProcessor = None
        self.input_size = 224
        self.adapter_type: AdapterTypes = self.config.type
        self.current_scale = 1.0
        self.is_active = True
        self.flag_word = "fla9wor0"
        self.is_unconditional_run = False
        self.is_sampling = False

        self.vision_encoder: Union[PhotoMakerCLIPEncoder, CLIPVisionModelWithProjection] = None

        self.fuse_module: FuseModule = None

        self.lora: None = None

        self.position_ids: Optional[List[int]] = None

        self.num_control_images = self.config.num_control_images
        self.token_mask: Optional[torch.Tensor] = None

        # setup clip
        self.setup_clip()
        # add for dataloader
        self.clip_image_processor = self.image_processor

        self.clip_fusion_module: CLIPFusionModule = None
        self.ilora_module: InstantLoRAModule = None

        self.te: Union[T5EncoderModel, CLIPTextModel] = None
        self.tokenizer: CLIPTokenizer = None
        self.te_adapter: TEAdapter = None
        self.te_augmenter: TEAugAdapter = None
        self.vd_adapter: VisionDirectAdapter = None
        self.single_value_adapter: SingleValueAdapter = None
        self.redux_adapter: ReduxImageEncoder = None
        self.control_lora: ControlLoraAdapter = None
        self.subpixel_adapter: SubpixelAdapter = None
        self.i2v_adapter: I2VAdapter = None
        
        self.conditional_embeds: Optional[torch.Tensor] = None
        self.unconditional_embeds: Optional[torch.Tensor] = None
        
        self.cached_control_image_0_1: Optional[torch.Tensor] = None

        self.setup_adapter()

        if self.adapter_type == 'photo_maker':
            # try to load from our name_or_path
            if self.config.name_or_path is not None and self.config.name_or_path.endswith('.bin'):
                self.load_state_dict(torch.load(self.config.name_or_path, map_location=self.device), strict=False)
            # add the trigger word to the tokenizer
            if isinstance(self.sd_ref().tokenizer, list):
                for tokenizer in self.sd_ref().tokenizer:
                    tokenizer.add_tokens([self.flag_word], special_tokens=True)
            else:
                self.sd_ref().tokenizer.add_tokens([self.flag_word], special_tokens=True)
        elif self.config.name_or_path is not None:
            loaded_state_dict = load_custom_adapter_model(
                self.config.name_or_path,
                self.sd_ref().device,
                dtype=self.sd_ref().dtype,
            )
            self.load_state_dict(loaded_state_dict, strict=False)

    def setup_adapter(self):
        torch_dtype = get_torch_dtype(self.sd_ref().dtype)
        if self.adapter_type == 'photo_maker':
            sd = self.sd_ref()
            embed_dim = sd.unet_unwrapped.config['cross_attention_dim']
            self.fuse_module = FuseModule(embed_dim)
        elif self.adapter_type == 'clip_fusion':
            sd = self.sd_ref()
            embed_dim = sd.unet_unwrapped.config['cross_attention_dim']

            vision_tokens = ((self.vision_encoder.config.image_size // self.vision_encoder.config.patch_size) ** 2)
            if self.config.image_encoder_arch == 'clip':
                vision_tokens = vision_tokens + 1
            self.clip_fusion_module = CLIPFusionModule(
                text_hidden_size=embed_dim,
                text_tokens=77,
                vision_hidden_size=self.vision_encoder.config.hidden_size,
                vision_tokens=vision_tokens
            )
        elif self.adapter_type == 'ilora':
            vision_tokens = ((self.vision_encoder.config.image_size // self.vision_encoder.config.patch_size) ** 2)
            if self.config.image_encoder_arch == 'clip':
                vision_tokens = vision_tokens + 1

            vision_hidden_size = self.vision_encoder.config.hidden_size

            if self.config.clip_layer == 'image_embeds':
                vision_tokens = 1
                vision_hidden_size = self.vision_encoder.config.projection_dim

            self.ilora_module = InstantLoRAModule(
                vision_tokens=vision_tokens,
                vision_hidden_size=vision_hidden_size,
                head_dim=self.config.head_dim,
                num_heads=self.config.num_heads,
                sd=self.sd_ref(),
                config=self.config
            )
        elif self.adapter_type == 'text_encoder':
            if self.config.text_encoder_arch == 't5':
                te_kwargs = {}
                # te_kwargs['load_in_4bit'] = True
                # te_kwargs['load_in_8bit'] = True
                te_kwargs['device_map'] = "auto"
                te_is_quantized = True

                self.te = T5EncoderModel.from_pretrained(
                    self.config.text_encoder_path,
                    torch_dtype=torch_dtype,
                    **te_kwargs
                )

                # self.te.to = lambda *args, **kwargs: None
                self.tokenizer = T5Tokenizer.from_pretrained(self.config.text_encoder_path)
            elif self.config.text_encoder_arch == 'pile-t5':
                te_kwargs = {}
                # te_kwargs['load_in_4bit'] = True
                # te_kwargs['load_in_8bit'] = True
                te_kwargs['device_map'] = "auto"
                te_is_quantized = True

                self.te = UMT5EncoderModel.from_pretrained(
                    self.config.text_encoder_path,
                    torch_dtype=torch_dtype,
                    **te_kwargs
                )

                # self.te.to = lambda *args, **kwargs: None
                self.tokenizer = LlamaTokenizerFast.from_pretrained(self.config.text_encoder_path)
                if self.tokenizer.pad_token is None:
                    self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            elif self.config.text_encoder_arch == 'clip':
                self.te = CLIPTextModel.from_pretrained(self.config.text_encoder_path).to(self.sd_ref().unet.device,
                                                                                          dtype=torch_dtype)
                self.tokenizer = CLIPTokenizer.from_pretrained(self.config.text_encoder_path)
            else:
                raise ValueError(f"unknown text encoder arch: {self.config.text_encoder_arch}")

            self.te_adapter = TEAdapter(self, self.sd_ref(), self.te, self.tokenizer)
        elif self.adapter_type == 'llm_adapter':
            kwargs = {}
            if self.config.quantize_llm:
                bnb_kwargs = {
                    'load_in_4bit': True,
                    'bnb_4bit_quant_type': "nf4",
                    'bnb_4bit_compute_dtype': torch.bfloat16
                }
                quantization_config = BitsAndBytesConfig(**bnb_kwargs)
                kwargs['quantization_config'] = quantization_config
                kwargs['torch_dtype'] = torch_dtype
                self.te = AutoModel.from_pretrained(
                    self.config.text_encoder_path,
                    **kwargs
                )
            else:
                self.te = AutoModel.from_pretrained(self.config.text_encoder_path).to(
                    self.sd_ref().unet.device, 
                    dtype=torch_dtype,
                )
            self.te.to = lambda *args, **kwargs: None
            self.te.eval()
            self.tokenizer = AutoTokenizer.from_pretrained(self.config.text_encoder_path)
            self.llm_adapter = LLMAdapter(
                adapter=self, 
                sd=self.sd_ref(),
                llm=self.te,
                tokenizer=self.tokenizer,
                num_cloned_blocks=self.config.num_cloned_blocks,
            )
            self.llm_adapter.to(self.device, torch_dtype)
        elif self.adapter_type == 'te_augmenter':
            self.te_augmenter = TEAugAdapter(self, self.sd_ref())
        elif self.adapter_type == 'vision_direct':
            self.vd_adapter = VisionDirectAdapter(self, self.sd_ref(), self.vision_encoder)
        elif self.adapter_type == 'single_value':
            self.single_value_adapter = SingleValueAdapter(self, self.sd_ref(), num_values=self.config.num_tokens)
        elif self.adapter_type == 'redux':
            vision_hidden_size = self.vision_encoder.config.hidden_size
            self.redux_adapter = ReduxImageEncoder(vision_hidden_size, 4096, self.device, torch_dtype)
        elif self.adapter_type == 'control_lora':
            self.control_lora = ControlLoraAdapter(
                self,
                sd=self.sd_ref(),
                config=self.config,
                train_config=self.train_config
            )
        elif self.adapter_type == 'i2v':
            self.i2v_adapter = I2VAdapter(
                self,
                sd=self.sd_ref(),
                config=self.config,
                train_config=self.train_config,
                image_processor=self.image_processor,
                vision_encoder=self.vision_encoder,
            )
        elif self.adapter_type == 'subpixel':
            self.subpixel_adapter = SubpixelAdapter(
                self,
                sd=self.sd_ref(),
                config=self.config,
                train_config=self.train_config
            )
        else:
            raise ValueError(f"unknown adapter type: {self.adapter_type}")

    def forward(self, *args, **kwargs):
        # dont think this is used
        # if self.adapter_type == 'photo_maker':
        #     id_pixel_values = args[0]
        #     prompt_embeds: PromptEmbeds = args[1]
        #     class_tokens_mask = args[2]
        #
        #     grads_on_image_encoder = self.config.train_image_encoder and torch.is_grad_enabled()
        #
        #     with torch.set_grad_enabled(grads_on_image_encoder):
        #         id_embeds = self.vision_encoder(self, id_pixel_values, do_projection2=False)
        #
        #     if not grads_on_image_encoder:
        #         id_embeds = id_embeds.detach()
        #
        #     prompt_embeds = prompt_embeds.detach()
        #
        #     updated_prompt_embeds = self.fuse_module(
        #         prompt_embeds, id_embeds, class_tokens_mask
        #     )
        #
        #     return updated_prompt_embeds
        # else:
        raise NotImplementedError

    def edit_batch_raw(self, batch: DataLoaderBatchDTO):
        # happens on a raw batch before latents are created
        return batch
    
    def edit_batch_processed(self, batch: DataLoaderBatchDTO):
        # happens after the latents are processed
        if self.adapter_type == "i2v":
            return self.i2v_adapter.edit_batch_processed(batch)
        return batch

    def setup_clip(self):
        adapter_config = self.config
        sd = self.sd_ref()
        if self.config.type in ["text_encoder", "llm_adapter", "single_value", "control_lora", "subpixel"]:
            return
        if self.config.type == 'photo_maker':
            try:
                self.image_processor = CLIPImageProcessor.from_pretrained(self.config.image_encoder_path)
            except EnvironmentError:
                self.image_processor = CLIPImageProcessor()
            if self.config.image_encoder_path is None:
                self.vision_encoder = PhotoMakerCLIPEncoder()
            else:
                self.vision_encoder = PhotoMakerCLIPEncoder.from_pretrained(self.config.image_encoder_path)
        elif self.config.image_encoder_arch == 'clip' or self.config.image_encoder_arch == 'clip+':
            try:
                self.image_processor = CLIPImageProcessor.from_pretrained(adapter_config.image_encoder_path)
            except EnvironmentError:
                self.image_processor = CLIPImageProcessor()
            self.vision_encoder = CLIPVisionModelWithProjection.from_pretrained(
                adapter_config.image_encoder_path,
                ignore_mismatched_sizes=True).to(self.device, dtype=get_torch_dtype(self.sd_ref().dtype))
        elif self.config.image_encoder_arch == 'siglip':
            from transformers import SiglipImageProcessor, SiglipVisionModel
            try:
                self.image_processor = SiglipImageProcessor.from_pretrained(adapter_config.image_encoder_path)
            except EnvironmentError:
                self.image_processor = SiglipImageProcessor()
            self.vision_encoder = SiglipVisionModel.from_pretrained(
                adapter_config.image_encoder_path,
                ignore_mismatched_sizes=True).to(self.device, dtype=get_torch_dtype(self.sd_ref().dtype))
        elif self.config.image_encoder_arch == 'siglip2':
            from transformers import SiglipImageProcessor, SiglipVisionModel
            try:
                self.image_processor = SiglipImageProcessor.from_pretrained(adapter_config.image_encoder_path)
            except EnvironmentError:
                self.image_processor = SiglipImageProcessor()
            self.vision_encoder = SiglipVisionModel.from_pretrained(
                adapter_config.image_encoder_path,
                ignore_mismatched_sizes=True).to(self.device, dtype=get_torch_dtype(self.sd_ref().dtype))
        elif self.config.image_encoder_arch == 'pixtral':
            self.image_processor = PixtralVisionImagePreprocessorCompatible(
                max_image_size=self.config.pixtral_max_image_size,
            )
            self.vision_encoder = PixtralVisionEncoderCompatible.from_pretrained(
                adapter_config.image_encoder_path,
            ).to(self.device, dtype=get_torch_dtype(self.sd_ref().dtype))
        elif self.config.image_encoder_arch == 'vit':
            try:
                self.image_processor = ViTFeatureExtractor.from_pretrained(adapter_config.image_encoder_path)
            except EnvironmentError:
                self.image_processor = ViTFeatureExtractor()
            self.vision_encoder = ViTForImageClassification.from_pretrained(adapter_config.image_encoder_path).to(
                self.device, dtype=get_torch_dtype(self.sd_ref().dtype))
        elif self.config.image_encoder_arch == 'safe':
            try:
                self.image_processor = SAFEImageProcessor.from_pretrained(adapter_config.image_encoder_path)
            except EnvironmentError:
                self.image_processor = SAFEImageProcessor()
            self.vision_encoder = SAFEVisionModel(
                in_channels=3,
                num_tokens=self.config.safe_tokens,
                num_vectors=sd.unet_unwrapped.config['cross_attention_dim'],
                reducer_channels=self.config.safe_reducer_channels,
                channels=self.config.safe_channels,
                downscale_factor=8
            ).to(self.device, dtype=get_torch_dtype(self.sd_ref().dtype))
        elif self.config.image_encoder_arch == 'convnext':
            try:
                self.image_processor = ConvNextImageProcessor.from_pretrained(adapter_config.image_encoder_path)
            except EnvironmentError:
                print(f"could not load image processor from {adapter_config.image_encoder_path}")
                self.image_processor = ConvNextImageProcessor(
                    size=320,
                    image_mean=[0.48145466, 0.4578275, 0.40821073],
                    image_std=[0.26862954, 0.26130258, 0.27577711],
                )
            self.vision_encoder = ConvNextForImageClassification.from_pretrained(
                adapter_config.image_encoder_path,
                use_safetensors=True,
            ).to(self.device, dtype=get_torch_dtype(self.sd_ref().dtype))
        elif self.config.image_encoder_arch == 'vit-hybrid':
            try:
                self.image_processor = ViTHybridImageProcessor.from_pretrained(adapter_config.image_encoder_path)
            except EnvironmentError:
                print(f"could not load image processor from {adapter_config.image_encoder_path}")
                self.image_processor = ViTHybridImageProcessor(
                    size=320,
                    image_mean=[0.48145466, 0.4578275, 0.40821073],
                    image_std=[0.26862954, 0.26130258, 0.27577711],
                )
            self.vision_encoder = ViTHybridForImageClassification.from_pretrained(
                adapter_config.image_encoder_path,
                use_safetensors=True,
            ).to(self.device, dtype=get_torch_dtype(self.sd_ref().dtype))
        else:
            raise ValueError(f"unknown image encoder arch: {adapter_config.image_encoder_arch}")

        self.input_size = self.vision_encoder.config.image_size

        if self.config.quad_image:  # 4x4 image
            # self.clip_image_processor.config
            # We do a 3x downscale of the image, so we need to adjust the input size
            preprocessor_input_size = self.vision_encoder.config.image_size * 2

            # update the preprocessor so images come in at the right size
            if 'height' in self.image_processor.size:
                self.image_processor.size['height'] = preprocessor_input_size
                self.image_processor.size['width'] = preprocessor_input_size
            elif hasattr(self.image_processor, 'crop_size'):
                self.image_processor.size['shortest_edge'] = preprocessor_input_size
                self.image_processor.crop_size['height'] = preprocessor_input_size
                self.image_processor.crop_size['width'] = preprocessor_input_size

        if self.config.image_encoder_arch == 'clip+':
            # self.image_processor.config
            # We do a 3x downscale of the image, so we need to adjust the input size
            preprocessor_input_size = self.vision_encoder.config.image_size * 4

            # update the preprocessor so images come in at the right size
            self.image_processor.size['shortest_edge'] = preprocessor_input_size
            self.image_processor.crop_size['height'] = preprocessor_input_size
            self.image_processor.crop_size['width'] = preprocessor_input_size

            self.preprocessor = CLIPImagePreProcessor(
                input_size=preprocessor_input_size,
                clip_input_size=self.vision_encoder.config.image_size,
            )
        if 'height' in self.image_processor.size:
            self.input_size = self.image_processor.size['height']
        else:
            self.input_size = self.image_processor.crop_size['height']

    def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
        strict = False
        if self.config.train_only_image_encoder and 'vd_adapter' not in state_dict and 'dvadapter' not in state_dict:
            # we are loading pure clip weights.
            self.vision_encoder.load_state_dict(state_dict, strict=strict)

        if 'lora_weights' in state_dict:
            # todo add LoRA
            # self.sd_ref().pipeline.load_lora_weights(state_dict["lora_weights"], adapter_name="photomaker")
            # self.sd_ref().pipeline.fuse_lora()
            pass
        if 'clip_fusion' in state_dict:
            self.clip_fusion_module.load_state_dict(state_dict['clip_fusion'], strict=strict)
        if 'id_encoder' in state_dict and (self.adapter_type == 'photo_maker' or self.adapter_type == 'clip_fusion'):
            self.vision_encoder.load_state_dict(state_dict['id_encoder'], strict=strict)
            # check to see if the fuse weights are there
            fuse_weights = {}
            for k, v in state_dict['id_encoder'].items():
                if k.startswith('fuse_module'):
                    k = k.replace('fuse_module.', '')
                    fuse_weights[k] = v
            if len(fuse_weights) > 0:
                try:
                    self.fuse_module.load_state_dict(fuse_weights, strict=strict)
                except Exception as e:

                    print(e)
                    # force load it
                    print(f"force loading fuse module as it did not match")
                    current_state_dict = self.fuse_module.state_dict()
                    for k, v in fuse_weights.items():
                        if len(v.shape) == 1:
                            current_state_dict[k] = v[:current_state_dict[k].shape[0]]
                        elif len(v.shape) == 2:
                            current_state_dict[k] = v[:current_state_dict[k].shape[0], :current_state_dict[k].shape[1]]
                        elif len(v.shape) == 3:
                            current_state_dict[k] = v[:current_state_dict[k].shape[0], :current_state_dict[k].shape[1],
                                                    :current_state_dict[k].shape[2]]
                        elif len(v.shape) == 4:
                            current_state_dict[k] = v[:current_state_dict[k].shape[0], :current_state_dict[k].shape[1],
                                                    :current_state_dict[k].shape[2], :current_state_dict[k].shape[3]]
                        else:
                            raise ValueError(f"unknown shape: {v.shape}")
                    self.fuse_module.load_state_dict(current_state_dict, strict=strict)

        if 'te_adapter' in state_dict:
            self.te_adapter.load_state_dict(state_dict['te_adapter'], strict=strict)
            
        if 'llm_adapter' in state_dict:
            self.llm_adapter.load_state_dict(state_dict['llm_adapter'], strict=strict)

        if 'te_augmenter' in state_dict:
            self.te_augmenter.load_state_dict(state_dict['te_augmenter'], strict=strict)

        if 'vd_adapter' in state_dict:
            self.vd_adapter.load_state_dict(state_dict['vd_adapter'], strict=strict)
        if 'dvadapter' in state_dict:
            self.vd_adapter.load_state_dict(state_dict['dvadapter'], strict=False)

        if 'sv_adapter' in state_dict:
            self.single_value_adapter.load_state_dict(state_dict['sv_adapter'], strict=strict)

        if 'vision_encoder' in state_dict:
            self.vision_encoder.load_state_dict(state_dict['vision_encoder'], strict=strict)

        if 'fuse_module' in state_dict:
            self.fuse_module.load_state_dict(state_dict['fuse_module'], strict=strict)

        if 'ilora' in state_dict:
            try:
                self.ilora_module.load_state_dict(state_dict['ilora'], strict=strict)
            except Exception as e:
                print(e)
        if 'redux_up' in state_dict:
            # state dict is seperated. so recombine it
            new_dict = {}
            for k, v in state_dict.items():
                for k2, v2 in v.items():
                    new_dict[k + '.' + k2] = v2
            self.redux_adapter.load_state_dict(new_dict, strict=True)
        
        if self.adapter_type == 'control_lora':
            # state dict is seperated. so recombine it
            new_dict = {}
            for k, v in state_dict.items():
                for k2, v2 in v.items():
                    new_dict[k + '.' + k2] = v2
            self.control_lora.load_weights(new_dict, strict=strict)
        
        if self.adapter_type == 'i2v':
            # state dict is seperated. so recombine it
            new_dict = {}
            for k, v in state_dict.items():
                for k2, v2 in v.items():
                    new_dict[k + '.' + k2] = v2
            self.i2v_adapter.load_weights(new_dict, strict=strict)
        
        if self.adapter_type == 'subpixel':
            # state dict is seperated. so recombine it
            new_dict = {}
            for k, v in state_dict.items():
                for k2, v2 in v.items():
                    new_dict[k + '.' + k2] = v2
            self.subpixel_adapter.load_weights(new_dict, strict=strict)

        pass

    def state_dict(self) -> OrderedDict:
        state_dict = OrderedDict()
        if self.config.train_only_image_encoder:
            return self.vision_encoder.state_dict()

        if self.adapter_type == 'photo_maker':
            if self.config.train_image_encoder:
                state_dict["id_encoder"] = self.vision_encoder.state_dict()

            state_dict["fuse_module"] = self.fuse_module.state_dict()

            # todo save LoRA
            return state_dict

        elif self.adapter_type == 'clip_fusion':
            if self.config.train_image_encoder:
                state_dict["vision_encoder"] = self.vision_encoder.state_dict()
            state_dict["clip_fusion"] = self.clip_fusion_module.state_dict()
            return state_dict
        elif self.adapter_type == 'text_encoder':
            state_dict["te_adapter"] = self.te_adapter.state_dict()
            return state_dict
        elif self.adapter_type == 'llm_adapter':
            state_dict["llm_adapter"] = self.llm_adapter.state_dict()
            return state_dict
        elif self.adapter_type == 'te_augmenter':
            if self.config.train_image_encoder:
                state_dict["vision_encoder"] = self.vision_encoder.state_dict()
            state_dict["te_augmenter"] = self.te_augmenter.state_dict()
            return state_dict
        elif self.adapter_type == 'vision_direct':
            state_dict["dvadapter"] = self.vd_adapter.state_dict()
            # if self.config.train_image_encoder: # always return vision encoder
            state_dict["vision_encoder"] = self.vision_encoder.state_dict()
            return state_dict
        elif self.adapter_type == 'single_value':
            state_dict["sv_adapter"] = self.single_value_adapter.state_dict()
            return state_dict
        elif self.adapter_type == 'ilora':
            if self.config.train_image_encoder:
                state_dict["vision_encoder"] = self.vision_encoder.state_dict()
            state_dict["ilora"] = self.ilora_module.state_dict()
            return state_dict
        elif self.adapter_type == 'redux':
            d = self.redux_adapter.state_dict()
            for k, v in d.items():
                state_dict[k] = v
            return state_dict
        elif self.adapter_type == 'control_lora':
            d = self.control_lora.get_state_dict()
            for k, v in d.items():
                state_dict[k] = v
            return state_dict
        elif self.adapter_type == 'i2v':
            d = self.i2v_adapter.get_state_dict()
            for k, v in d.items():
                state_dict[k] = v
            return state_dict
        elif self.adapter_type == 'subpixel':
            d = self.subpixel_adapter.get_state_dict()
            for k, v in d.items():
                state_dict[k] = v
            return state_dict
        else:
            raise NotImplementedError

    def add_extra_values(self, extra_values: torch.Tensor, is_unconditional=False):
        if self.adapter_type == 'single_value':
            if is_unconditional:
                self.unconditional_embeds = extra_values.to(self.device, get_torch_dtype(self.sd_ref().dtype))
            else:
                self.conditional_embeds = extra_values.to(self.device, get_torch_dtype(self.sd_ref().dtype))
    
    def condition_noisy_latents(self, latents: torch.Tensor, batch:DataLoaderBatchDTO):
        with torch.no_grad():
            # todo add i2v start frame conditioning here
            
            if self.adapter_type in ['i2v']:
                return self.i2v_adapter.condition_noisy_latents(latents, batch)
            elif self.adapter_type in ['control_lora']:
                # inpainting input is 0-1 (bs, 4, h, w) on batch.inpaint_tensor
                # 4th channel is the mask with 1 being keep area and 0 being area to inpaint.
                sd: StableDiffusion = self.sd_ref()
                inpainting_latent = None
                if self.config.has_inpainting_input:
                    do_dropout = random.random() < self.config.control_image_dropout
                    # do random mask if we dont have one
                    inpaint_tensor = batch.inpaint_tensor
                    if inpaint_tensor is None and not do_dropout:
                        # generate a random one since we dont have one
                        # this will make random blobs, invert the blobs for now as we normanlly inpaint the alpha
                        inpaint_tensor = 1 - generate_random_mask(
                            batch_size=latents.shape[0],
                            height=latents.shape[2],
                            width=latents.shape[3],
                            device=latents.device,
                        ).to(latents.device, latents.dtype)
                    if inpaint_tensor is not None and not do_dropout:
                        
                        if inpaint_tensor.shape[1] == 4:
                            # get just the mask
                            inpainting_tensor_mask = inpaint_tensor[:, 3:4, :, :].to(latents.device, dtype=latents.dtype)
                        elif inpaint_tensor.shape[1] == 3:
                            # rgb mask. Just get one channel
                            inpainting_tensor_mask = inpaint_tensor[:, 0:1, :, :].to(latents.device, dtype=latents.dtype)
                        else:
                            inpainting_tensor_mask = inpaint_tensor
                        
                        # # use our batch latents so we cna avoid ancoding again
                        inpainting_latent = batch.latents
                        
                        # resize the mask to match the new encoded size
                        inpainting_tensor_mask = F.interpolate(inpainting_tensor_mask, size=(inpainting_latent.shape[2], inpainting_latent.shape[3]), mode='bilinear')
                        inpainting_tensor_mask = inpainting_tensor_mask.to(latents.device, latents.dtype)
                        
                        do_mask_invert = False
                        if self.config.invert_inpaint_mask_chance > 0.0:
                            do_mask_invert = random.random() < self.config.invert_inpaint_mask_chance
                        if do_mask_invert:
                            # invert the mask
                            inpainting_tensor_mask = 1 - inpainting_tensor_mask
                        
                        # mask out the inpainting area, it is currently 0 for inpaint area, and 1 for keep area
                        # we are zeroing our the latents in the inpaint area not on the pixel space.
                        inpainting_latent = inpainting_latent * inpainting_tensor_mask
                        
                        # mask needs to be 1 for inpaint area and 0 for area to leave alone. So flip it.
                        inpainting_tensor_mask = 1 - inpainting_tensor_mask
                        # leave the mask as 0-1 and concat on channel of latents
                        inpainting_latent = torch.cat((inpainting_latent, inpainting_tensor_mask), dim=1)
                    else:
                        # we have iinpainting but didnt get a control. or we are doing a dropout
                        # the input needs to be all zeros for the latents and all 1s for the mask
                        inpainting_latent = torch.zeros_like(latents)
                        # add ones for the mask since we are technically inpainting everything
                        inpainting_latent = torch.cat((inpainting_latent, torch.ones_like(inpainting_latent[:, :1, :, :])), dim=1)
                    
                    if self.config.num_control_images == 1:
                        # this is our only control
                        control_latent = inpainting_latent.to(latents.device, latents.dtype)
                        latents = torch.cat((latents, control_latent), dim=1)
                        return latents.detach()
                    
                if control_tensor is None:
                    # concat zeros onto the latents
                    ctrl = torch.zeros(
                        latents.shape[0], # bs
                        latents.shape[1] * self.num_control_images, # ch
                        latents.shape[2], 
                        latents.shape[3], 
                        device=latents.device, 
                        dtype=latents.dtype
                    )
                    if inpainting_latent is not None:
                        # inpainting always comes first
                        ctrl = torch.cat((inpainting_latent, ctrl), dim=1)
                    latents = torch.cat((latents, ctrl), dim=1)
                    return latents.detach()
                # if we have multiple control tensors, they come in like [bs, num_control_images, ch, h, w]
                # if we have 1, it comes in like [bs, ch, h, w]
                # stack out control tensors to be [bs, ch * num_control_images, h, w]
                
                control_tensor = batch.control_tensor.to(latents.device, dtype=latents.dtype)
                
                control_tensor_list = []
                if len(control_tensor.shape) == 4:
                    control_tensor_list.append(control_tensor)
                else:
                    # reshape
                    control_tensor = control_tensor.view(
                        control_tensor.shape[0], 
                        control_tensor.shape[1] * control_tensor.shape[2], 
                        control_tensor.shape[3], 
                        control_tensor.shape[4]
                    )
                    control_tensor_list = control_tensor.chunk(self.num_control_images, dim=1)
                control_latent_list = []
                for control_tensor in control_tensor_list:
                    do_dropout = random.random() < self.config.control_image_dropout
                    if do_dropout:
                        # dropout with noise
                        control_latent_list.append(torch.zeros_like(batch.latents))
                    else:
                        # it is 0-1 need to convert to -1 to 1
                        control_tensor = control_tensor * 2 - 1

                        control_tensor = control_tensor.to(sd.vae_device_torch, dtype=sd.torch_dtype)
                        
                        # if it is not the size of batch.tensor, (bs,ch,h,w) then we need to resize it
                        if control_tensor.shape[2] != batch.tensor.shape[2] or control_tensor.shape[3] != batch.tensor.shape[3]:
                            control_tensor = F.interpolate(control_tensor, size=(batch.tensor.shape[2], batch.tensor.shape[3]), mode='bicubic')
                        
                        # encode it
                        control_latent = sd.encode_images(control_tensor).to(latents.device, latents.dtype)
                        control_latent_list.append(control_latent)
                # stack them on the channel dimension
                control_latent = torch.cat(control_latent_list, dim=1)
                if inpainting_latent is not None:
                    # inpainting always comes first
                    control_latent = torch.cat((inpainting_latent, control_latent), dim=1)
                # concat it onto the latents
                latents = torch.cat((latents, control_latent), dim=1)
                return latents.detach()
            return latents


    def condition_prompt(
            self,
            prompt: Union[List[str], str],
            is_unconditional: bool = False,
    ):
        if self.adapter_type in ['clip_fusion', 'ilora', 'vision_direct', 'redux', 'control_lora', 'subpixel', 'i2v']:
            return prompt
        elif self.adapter_type == 'text_encoder':
            # todo allow for training
            with torch.no_grad():
                # encode and save the embeds
                if is_unconditional:
                    self.unconditional_embeds = self.te_adapter.encode_text(prompt).detach()
                else:
                    self.conditional_embeds = self.te_adapter.encode_text(prompt).detach()
        elif self.adapter_type == 'llm_adapter':
            # todo allow for training
            with torch.no_grad():
                # encode and save the embeds
                if is_unconditional:
                    self.unconditional_embeds = self.llm_adapter.encode_text(prompt).detach()
                else:
                    self.conditional_embeds = self.llm_adapter.encode_text(prompt).detach()
            return prompt
        elif self.adapter_type == 'photo_maker':
            if is_unconditional:
                return prompt
            else:

                with torch.no_grad():
                    was_list = isinstance(prompt, list)
                    if not was_list:
                        prompt_list = [prompt]
                    else:
                        prompt_list = prompt

                    new_prompt_list = []
                    token_mask_list = []

                    for prompt in prompt_list:

                        our_class = None
                        # find a class in the prompt
                        prompt_parts = prompt.split(' ')
                        prompt_parts = [p.strip().lower() for p in prompt_parts if len(p) > 0]

                        new_prompt_parts = []
                        tokened_prompt_parts = []
                        for idx, prompt_part in enumerate(prompt_parts):
                            new_prompt_parts.append(prompt_part)
                            tokened_prompt_parts.append(prompt_part)
                            if prompt_part in self.config.class_names:
                                our_class = prompt_part
                                # add the flag word
                                tokened_prompt_parts.append(self.flag_word)

                                if self.num_control_images > 1:
                                    # add the rest
                                    for _ in range(self.num_control_images - 1):
                                        new_prompt_parts.extend(prompt_parts[idx + 1:])

                                # add the rest
                                tokened_prompt_parts.extend(prompt_parts[idx + 1:])
                                new_prompt_parts.extend(prompt_parts[idx + 1:])

                                break

                        prompt = " ".join(new_prompt_parts)
                        tokened_prompt = " ".join(tokened_prompt_parts)

                        if our_class is None:
                            # add the first one to the front of the prompt
                            tokened_prompt = self.config.class_names[0] + ' ' + self.flag_word + ' ' + prompt
                            our_class = self.config.class_names[0]
                            prompt = " ".join(
                                [self.config.class_names[0] for _ in range(self.num_control_images)]) + ' ' + prompt

                        # add the prompt to the list
                        new_prompt_list.append(prompt)

                        # tokenize them with just the first tokenizer
                        tokenizer = self.sd_ref().tokenizer
                        if isinstance(tokenizer, list):
                            tokenizer = tokenizer[0]

                        flag_token = tokenizer.convert_tokens_to_ids(self.flag_word)

                        tokenized_prompt = tokenizer.encode(prompt)
                        tokenized_tokened_prompt = tokenizer.encode(tokened_prompt)

                        flag_idx = tokenized_tokened_prompt.index(flag_token)

                        class_token = tokenized_prompt[flag_idx - 1]

                        boolean_mask = torch.zeros(flag_idx - 1, dtype=torch.bool)
                        boolean_mask = torch.cat((boolean_mask, torch.ones(self.num_control_images, dtype=torch.bool)))
                        boolean_mask = boolean_mask.to(self.device)
                        # zero pad it to 77
                        boolean_mask = F.pad(boolean_mask, (0, 77 - boolean_mask.shape[0]), value=False)

                        token_mask_list.append(boolean_mask)

                    self.token_mask = torch.cat(token_mask_list, dim=0).to(self.device)

                    prompt_list = new_prompt_list

                    if not was_list:
                        prompt = prompt_list[0]
                    else:
                        prompt = prompt_list

                    return prompt

        else:
            return prompt

    def condition_encoded_embeds(
            self,
            tensors_0_1: torch.Tensor,
            prompt_embeds: PromptEmbeds,
            is_training=False,
            has_been_preprocessed=False,
            is_unconditional=False,
            quad_count=4,
            is_generating_samples=False,
    ) -> PromptEmbeds:
        if self.adapter_type == 'text_encoder':
            # replace the prompt embed with ours
            if is_unconditional:
                return self.unconditional_embeds.clone()
            return self.conditional_embeds.clone()
        if self.adapter_type == 'llm_adapter':
            # replace the prompt embed with ours
            if is_unconditional:
                prompt_embeds.text_embeds = self.unconditional_embeds.text_embeds.clone()
                prompt_embeds.attention_mask = self.unconditional_embeds.attention_mask.clone()
                return prompt_embeds
            prompt_embeds.text_embeds = self.conditional_embeds.text_embeds.clone()
            prompt_embeds.attention_mask = self.conditional_embeds.attention_mask.clone()
            return prompt_embeds

        if self.adapter_type == 'ilora':
            return prompt_embeds

        if self.adapter_type == 'photo_maker' or self.adapter_type == 'clip_fusion' or self.adapter_type == 'redux':
            if is_unconditional:
                # we dont condition the negative embeds for photo maker
                return prompt_embeds.clone()
            with torch.no_grad():
                # on training the clip image is created in the dataloader
                if not has_been_preprocessed:
                    # tensors should be 0-1
                    if tensors_0_1.ndim == 3:
                        tensors_0_1 = tensors_0_1.unsqueeze(0)
                    # training tensors are 0 - 1
                    tensors_0_1 = tensors_0_1.to(self.device, dtype=torch.float16)
                    # if images are out of this range throw error
                    if tensors_0_1.min() < -0.3 or tensors_0_1.max() > 1.3:
                        raise ValueError("image tensor values must be between 0 and 1. Got min: {}, max: {}".format(
                            tensors_0_1.min(), tensors_0_1.max()
                        ))
                    clip_image = self.image_processor(
                        images=tensors_0_1,
                        return_tensors="pt",
                        do_resize=True,
                        do_rescale=False,
                        do_convert_rgb=True
                    ).pixel_values
                else:
                    clip_image = tensors_0_1
                clip_image = clip_image.to(self.device, dtype=get_torch_dtype(self.sd_ref().dtype)).detach()

            if self.config.quad_image:
                # split the 4x4 grid and stack on batch
                ci1, ci2 = clip_image.chunk(2, dim=2)
                ci1, ci3 = ci1.chunk(2, dim=3)
                ci2, ci4 = ci2.chunk(2, dim=3)
                to_cat = []
                for i, ci in enumerate([ci1, ci2, ci3, ci4]):
                    if i < quad_count:
                        to_cat.append(ci)
                    else:
                        break

                clip_image = torch.cat(to_cat, dim=0).detach()

            if self.adapter_type == 'photo_maker':
                # Embeddings need to be  (b, num_inputs, c, h, w) for now, just put 1 input image
                clip_image = clip_image.unsqueeze(1)
                with torch.set_grad_enabled(is_training):
                    if is_training and self.config.train_image_encoder:
                        self.vision_encoder.train()
                        clip_image = clip_image.requires_grad_(True)
                        id_embeds = self.vision_encoder(
                            clip_image,
                            do_projection2=isinstance(self.sd_ref().text_encoder, list),
                        )
                    else:
                        with torch.no_grad():
                            self.vision_encoder.eval()
                            id_embeds = self.vision_encoder(
                                clip_image, do_projection2=isinstance(self.sd_ref().text_encoder, list)
                            ).detach()

                    prompt_embeds.text_embeds = self.fuse_module(
                        prompt_embeds.text_embeds,
                        id_embeds,
                        self.token_mask
                    )
                    return prompt_embeds
            elif self.adapter_type == 'clip_fusion':
                with torch.set_grad_enabled(is_training):
                    if is_training and self.config.train_image_encoder:
                        self.vision_encoder.train()
                        clip_image = clip_image.requires_grad_(True)
                        id_embeds = self.vision_encoder(
                            clip_image,
                            output_hidden_states=True,
                        )
                    else:
                        with torch.no_grad():
                            self.vision_encoder.eval()
                            id_embeds = self.vision_encoder(
                                clip_image, output_hidden_states=True
                            )

                    img_embeds = id_embeds['last_hidden_state']

                    if self.config.quad_image:
                        # get the outputs of the quat
                        chunks = img_embeds.chunk(quad_count, dim=0)
                        chunk_sum = torch.zeros_like(chunks[0])
                        for chunk in chunks:
                            chunk_sum = chunk_sum + chunk
                        # get the mean of them

                        img_embeds = chunk_sum / quad_count

                    if not is_training or not self.config.train_image_encoder:
                        img_embeds = img_embeds.detach()

                    prompt_embeds.text_embeds = self.clip_fusion_module(
                        prompt_embeds.text_embeds,
                        img_embeds
                    )
                    return prompt_embeds

            elif self.adapter_type == 'redux':
                with torch.set_grad_enabled(is_training):
                    if is_training and self.config.train_image_encoder:
                        self.vision_encoder.train()
                        clip_image = clip_image.requires_grad_(True)
                        id_embeds = self.vision_encoder(
                            clip_image,
                            output_hidden_states=True,
                        )
                    else:
                        with torch.no_grad():
                            self.vision_encoder.eval()
                            id_embeds = self.vision_encoder(
                                clip_image, output_hidden_states=True
                            )

                    img_embeds = id_embeds['last_hidden_state']

                    if self.config.quad_image:
                        # get the outputs of the quat
                        chunks = img_embeds.chunk(quad_count, dim=0)
                        chunk_sum = torch.zeros_like(chunks[0])
                        for chunk in chunks:
                            chunk_sum = chunk_sum + chunk
                        # get the mean of them

                        img_embeds = chunk_sum / quad_count

                    if not is_training or not self.config.train_image_encoder:
                        img_embeds = img_embeds.detach()
                    
                    img_embeds = self.redux_adapter(img_embeds.to(self.device, get_torch_dtype(self.sd_ref().dtype)))

                    prompt_embeds.text_embeds = torch.cat((prompt_embeds.text_embeds, img_embeds), dim=-2)
                    return prompt_embeds
        else:
            return prompt_embeds

    def get_empty_clip_image(self, batch_size: int, shape=None) -> torch.Tensor:
        with torch.no_grad():
            if shape is None:
                shape = [batch_size, 3, self.input_size, self.input_size]
            tensors_0_1 = torch.rand(shape, device=self.device)
            noise_scale = torch.rand([tensors_0_1.shape[0], 1, 1, 1], device=self.device,
                                     dtype=get_torch_dtype(self.sd_ref().dtype))
            tensors_0_1 = tensors_0_1 * noise_scale
            # tensors_0_1 = tensors_0_1 * 0
            mean = torch.tensor(self.clip_image_processor.image_mean).to(
                self.device, dtype=get_torch_dtype(self.sd_ref().dtype)
            ).detach()
            std = torch.tensor(self.clip_image_processor.image_std).to(
                self.device, dtype=get_torch_dtype(self.sd_ref().dtype)
            ).detach()
            tensors_0_1 = torch.clip((255. * tensors_0_1), 0, 255).round() / 255.0
            clip_image = (tensors_0_1 - mean.view([1, 3, 1, 1])) / std.view([1, 3, 1, 1])
        return clip_image.detach()

    def train(self, mode: bool = True):
        if self.config.train_image_encoder:
            self.vision_encoder.train(mode)
        super().train(mode)

    def trigger_pre_te(
            self,
            tensors_0_1: Optional[torch.Tensor]=None,
            tensors_preprocessed: Optional[torch.Tensor]=None, # preprocessed by the dataloader
            is_training=False,
            has_been_preprocessed=False,
            batch_tensor: Optional[torch.Tensor]=None,
            quad_count=4,
            batch_size=1,
    ) -> PromptEmbeds:
        if tensors_0_1 is not None:
            # actual 0 - 1 image
            self.cached_control_image_0_1 = tensors_0_1
        else:
            # image has been processed through the dataloader and is prepped for vision encoder
            self.cached_control_image_0_1 = None
        if batch_tensor is not None and self.cached_control_image_0_1 is None:
            # convert it to 0 - 1
            to_cache = batch_tensor / 2 + 0.5
            # videos come in (bs, num_frames, channels, height, width)
            # images come in (bs, channels, height, width)
            # if it is a video, just grad first frame
            if len(to_cache.shape) == 5:
                to_cache = to_cache[:, 0:1, :, :, :]
                to_cache = to_cache.squeeze(1)
            self.cached_control_image_0_1 = to_cache
        
        if tensors_preprocessed is not None and has_been_preprocessed:
            tensors_0_1 = tensors_preprocessed
        # if self.adapter_type == 'ilora' or self.adapter_type == 'vision_direct' or self.adapter_type == 'te_augmenter':
        if self.adapter_type in ['ilora', 'vision_direct', 'te_augmenter', 'i2v']:
            skip_unconditional = self.sd_ref().is_flux
            if tensors_0_1 is None:
                tensors_0_1 = self.get_empty_clip_image(batch_size)
                has_been_preprocessed = True

            with torch.no_grad():
                # on training the clip image is created in the dataloader
                if not has_been_preprocessed:
                    # tensors should be 0-1
                    if tensors_0_1.ndim == 3:
                        tensors_0_1 = tensors_0_1.unsqueeze(0)
                    # training tensors are 0 - 1
                    tensors_0_1 = tensors_0_1.to(self.device, dtype=torch.float16)
                    # if images are out of this range throw error
                    if tensors_0_1.min() < -0.3 or tensors_0_1.max() > 1.3:
                        raise ValueError("image tensor values must be between 0 and 1. Got min: {}, max: {}".format(
                            tensors_0_1.min(), tensors_0_1.max()
                        ))
                    clip_image = self.image_processor(
                        images=tensors_0_1,
                        return_tensors="pt",
                        do_resize=True,
                        do_rescale=False,
                    ).pixel_values
                else:
                    clip_image = tensors_0_1
                    
                # if is pixtral
                if self.config.image_encoder_arch == 'pixtral' and self.config.pixtral_random_image_size:
                    # get the random size
                    random_size = random.randint(256, self.config.pixtral_max_image_size)
                    # images are already sized for max size, we have to fit them to the pixtral patch size to reduce / enlarge it farther.
                    h, w = clip_image.shape[2], clip_image.shape[3]
                    current_base_size = int(math.sqrt(w * h))
                    ratio = current_base_size / random_size
                    if ratio > 1:
                        w = round(w / ratio)
                        h = round(h / ratio)

                    width_tokens = (w - 1) // self.image_processor.image_patch_size + 1
                    height_tokens = (h - 1) // self.image_processor.image_patch_size + 1
                    assert width_tokens > 0
                    assert height_tokens > 0
                    
                    new_image_size = (
                        width_tokens * self.image_processor.image_patch_size,
                        height_tokens * self.image_processor.image_patch_size,
                    )
                    
                    # resize the image
                    clip_image = F.interpolate(clip_image, size=new_image_size, mode='bicubic', align_corners=False)
                    

                batch_size = clip_image.shape[0]
                if self.config.control_image_dropout > 0 and is_training:
                    clip_batch = torch.chunk(clip_image, batch_size, dim=0)
                    unconditional_batch = torch.chunk(self.get_empty_clip_image(batch_size, shape=clip_image.shape).to(
                        clip_image.device, dtype=clip_image.dtype
                    ), batch_size, dim=0)
                    combine_list = []
                    for i in range(batch_size):
                        do_dropout = random.random() < self.config.control_image_dropout
                        if do_dropout:
                            # dropout with noise
                            combine_list.append(unconditional_batch[i])
                        else:
                            combine_list.append(clip_batch[i])
                    clip_image = torch.cat(combine_list, dim=0)
                
                if self.adapter_type in ['vision_direct', 'te_augmenter', 'i2v'] and not skip_unconditional:
                    # add an unconditional so we can save it
                    unconditional = self.get_empty_clip_image(batch_size, shape=clip_image.shape).to(
                        clip_image.device, dtype=clip_image.dtype
                    )
                    clip_image = torch.cat([unconditional, clip_image], dim=0)

                clip_image = clip_image.to(self.device, dtype=get_torch_dtype(self.sd_ref().dtype)).detach()

            if self.config.quad_image:
                # split the 4x4 grid and stack on batch
                ci1, ci2 = clip_image.chunk(2, dim=2)
                ci1, ci3 = ci1.chunk(2, dim=3)
                ci2, ci4 = ci2.chunk(2, dim=3)
                to_cat = []
                for i, ci in enumerate([ci1, ci2, ci3, ci4]):
                    if i < quad_count:
                        to_cat.append(ci)
                    else:
                        break

                clip_image = torch.cat(to_cat, dim=0).detach()

            if self.adapter_type == 'ilora':
                with torch.set_grad_enabled(is_training):
                    if is_training and self.config.train_image_encoder:
                        self.vision_encoder.train()
                        clip_image = clip_image.requires_grad_(True)
                        id_embeds = self.vision_encoder(
                            clip_image,
                            output_hidden_states=True,
                        )
                    else:
                        with torch.no_grad():
                            self.vision_encoder.eval()
                            id_embeds = self.vision_encoder(
                                clip_image, output_hidden_states=True
                            )

                    if self.config.clip_layer == 'penultimate_hidden_states':
                        img_embeds = id_embeds.hidden_states[-2]
                    elif self.config.clip_layer == 'last_hidden_state':
                        img_embeds = id_embeds.hidden_states[-1]
                    elif self.config.clip_layer == 'image_embeds':
                        img_embeds = id_embeds.image_embeds
                    else:
                        raise ValueError(f"unknown clip layer: {self.config.clip_layer}")

                    if self.config.quad_image:
                        # get the outputs of the quat
                        chunks = img_embeds.chunk(quad_count, dim=0)
                        chunk_sum = torch.zeros_like(chunks[0])
                        for chunk in chunks:
                            chunk_sum = chunk_sum + chunk
                        # get the mean of them

                        img_embeds = chunk_sum / quad_count

                    if not is_training or not self.config.train_image_encoder:
                        img_embeds = img_embeds.detach()

                    self.ilora_module(img_embeds)
            # if self.adapter_type == 'vision_direct' or self.adapter_type == 'te_augmenter':
            if self.adapter_type in ['vision_direct', 'te_augmenter', 'i2v']:
                with torch.set_grad_enabled(is_training):
                    if is_training and self.config.train_image_encoder:
                        self.vision_encoder.train()
                        clip_image = clip_image.requires_grad_(True)
                    else:
                        with torch.no_grad():
                            self.vision_encoder.eval()
                    self.vision_encoder.to(self.device)
                    clip_output = self.vision_encoder(
                        clip_image.to(self.device, dtype=get_torch_dtype(self.sd_ref().dtype)),
                        output_hidden_states=True,
                    )
                    if self.config.clip_layer == 'penultimate_hidden_states':
                        # they skip last layer for ip+
                        # https://github.com/tencent-ailab/IP-Adapter/blob/f4b6742db35ea6d81c7b829a55b0a312c7f5a677/tutorial_train_plus.py#L403C26-L403C26
                        clip_image_embeds = clip_output.hidden_states[-2]
                    elif self.config.clip_layer == 'last_hidden_state':
                        clip_image_embeds = clip_output.hidden_states[-1]
                    else:
                        if hasattr(clip_output, 'image_embeds'):
                            clip_image_embeds = clip_output.image_embeds
                        elif hasattr(clip_output, 'pooler_output'):
                            clip_image_embeds = clip_output.pooler_output
                        # TODO should we always norm image embeds?
                        # get norm embeddings
                        # l2_norm = torch.norm(clip_image_embeds, p=2)
                        # clip_image_embeds = clip_image_embeds / l2_norm

                    if not is_training or not self.config.train_image_encoder:
                        clip_image_embeds = clip_image_embeds.detach()

                    if self.adapter_type == 'te_augmenter':
                        clip_image_embeds = self.te_augmenter(clip_image_embeds)

                    if self.adapter_type == 'vision_direct':
                        clip_image_embeds = self.vd_adapter(clip_image_embeds)

                    # save them to the conditional and unconditional
                    try:
                        if skip_unconditional:
                            self.unconditional_embeds, self.conditional_embeds = None, clip_image_embeds
                        else:
                            self.unconditional_embeds, self.conditional_embeds = clip_image_embeds.chunk(2, dim=0)
                    except ValueError:
                        raise ValueError(f"could not split the clip image embeds into 2. Got shape: {clip_image_embeds.shape}")

    def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
        if self.config.train_only_image_encoder:
            yield from self.vision_encoder.parameters(recurse)
            return
        if self.config.type == 'photo_maker':
            yield from self.fuse_module.parameters(recurse)
            if self.config.train_image_encoder:
                yield from self.vision_encoder.parameters(recurse)
        elif self.config.type == 'clip_fusion':
            yield from self.clip_fusion_module.parameters(recurse)
            if self.config.train_image_encoder:
                yield from self.vision_encoder.parameters(recurse)
        elif self.config.type == 'ilora':
            yield from self.ilora_module.parameters(recurse)
            if self.config.train_image_encoder:
                yield from self.vision_encoder.parameters(recurse)
        elif self.config.type == 'text_encoder':
            for attn_processor in self.te_adapter.adapter_modules:
                yield from attn_processor.parameters(recurse)
        elif self.config.type == 'llm_adapter':
            yield from self.llm_adapter.parameters(recurse)
        elif self.config.type == 'vision_direct':
            if self.config.train_scaler:
                # only yield the self.block_scaler = torch.nn.Parameter(torch.tensor([1.0] * num_modules)
                yield self.vd_adapter.block_scaler
            else:
                for attn_processor in self.vd_adapter.adapter_modules:
                    yield from attn_processor.parameters(recurse)
                if self.config.train_image_encoder:
                    yield from self.vision_encoder.parameters(recurse)
                if self.vd_adapter.resampler is not None:
                    yield from self.vd_adapter.resampler.parameters(recurse)
                if self.vd_adapter.pool is not None:
                    yield from self.vd_adapter.pool.parameters(recurse)
                if self.vd_adapter.sparse_autoencoder is not None:
                    yield from self.vd_adapter.sparse_autoencoder.parameters(recurse)
        elif self.config.type == 'te_augmenter':
            yield from self.te_augmenter.parameters(recurse)
            if self.config.train_image_encoder:
                yield from self.vision_encoder.parameters(recurse)
        elif self.config.type == 'single_value':
            yield from self.single_value_adapter.parameters(recurse)
        elif self.config.type == 'redux':
            yield from self.redux_adapter.parameters(recurse)
        elif self.config.type == 'control_lora':
            param_list = self.control_lora.get_params()
            for param in param_list:
                yield param
        elif self.config.type == 'i2v':
            param_list = self.i2v_adapter.get_params()
            for param in param_list:
                yield param
        elif self.config.type == 'subpixel':
            param_list = self.subpixel_adapter.get_params()
            for param in param_list:
                yield param
        else:
            raise NotImplementedError

    def enable_gradient_checkpointing(self):
        if hasattr(self.vision_encoder, "enable_gradient_checkpointing"):
            self.vision_encoder.enable_gradient_checkpointing()
        elif hasattr(self.vision_encoder, 'gradient_checkpointing'):
            self.vision_encoder.gradient_checkpointing = True

    def get_additional_save_metadata(self) -> Dict[str, Any]:
        additional = {}
        if self.config.type == 'ilora':
            extra = self.ilora_module.get_additional_save_metadata()
            for k, v in extra.items():
                additional[k] = v
            additional['clip_layer'] = self.config.clip_layer
            additional['image_encoder_arch'] = self.config.head_dim
        return additional

    def post_weight_update(self):
        # do any kind of updates after the weight update
        if self.config.type == 'vision_direct':
            self.vd_adapter.post_weight_update()
        pass