Spaces:

kfirgold99
/

Piece-it-Together-Space

Running on Zero

App Files Files Community

kfirbria commited on Apr 15

Commit

779c9ab

1 Parent(s): 14940b9

add demo

Browse files

Files changed (36) hide show

.gradio/certificate.pem +31 -0
.python-version +1 -0
app.py +69 -0
assets/characters_parts/part_a.jpg +0 -0
assets/characters_parts/part_b.jpg +0 -0
assets/characters_parts/part_c.jpg +0 -0
ip_adapter/__init__.py +9 -0
ip_adapter/__pycache__/__init__.cpython-312.pyc +0 -0
ip_adapter/__pycache__/attention_processor.cpython-312.pyc +0 -0
ip_adapter/__pycache__/ip_adapter.cpython-312.pyc +0 -0
ip_adapter/__pycache__/resampler.cpython-312.pyc +0 -0
ip_adapter/__pycache__/utils.cpython-312.pyc +0 -0
ip_adapter/attention_processor.py +568 -0
ip_adapter/attention_processor_faceid.py +433 -0
ip_adapter/custom_pipelines.py +394 -0
ip_adapter/ip_adapter.py +457 -0
ip_adapter/ip_adapter_faceid.py +542 -0
ip_adapter/ip_adapter_faceid_separate.py +556 -0
ip_adapter/resampler.py +158 -0
ip_adapter/sd3_attention_processor.py +179 -0
ip_adapter/test_resampler.py +44 -0
ip_adapter/utils.py +93 -0
model/__init__.py +0 -0
model/__pycache__/__init__.cpython-312.pyc +0 -0
model/__pycache__/dit.cpython-312.pyc +0 -0
model/__pycache__/pipeline_pit.cpython-312.pyc +0 -0
model/dit.py +313 -0
model/pipeline_pit.py +106 -0
pit.py +161 -0
requirements.txt +6 -0
training/__init__.py +0 -0
training/__pycache__/__init__.cpython-312.pyc +0 -0
training/__pycache__/train_config.cpython-312.pyc +0 -0
training/coach.py +409 -0
training/dataset.py +182 -0
training/train_config.py +67 -0

.gradio/certificate.pem ADDED Viewed

	@@ -0,0 +1,31 @@

+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.12

app.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import gradio as gr
+import spaces
+from pit import PiTDemoPipeline
+BLOCK_WIDTH = 300
+BLOCK_HEIGHT = 360
+FONT_SIZE = 3.5
+pit_pipeline = PiTDemoPipeline(
+    prior_repo="kfirgold99/Piece-it-Together", prior_path="models/characters_ckpt/prior.ckpt"
+)
+@spaces.GPU
+def run_character_generation(part_1, part_2, part_3, seed=None):
+    crops_paths = [part_1, part_2, part_3]
+    image = pit_pipeline.run(crops_paths=crops_paths, seed=seed, n_images=1)[0]
+    return image
+with gr.Blocks(css="style.css") as demo:
+    gr.HTML(
+        """<div style="text-align: center;"><h1>Piece it Together: Part-Based Concepting with IP-Priors</h1></div>"""
+    )
+    gr.HTML(
+        '<div style="text-align: center;"><h3><a href="https://eladrich.github.io/PiT/">https://eladrich.github.io/PiT/</a></h3></div>'
+    )
+    gr.HTML(
+        '<div style="text-align: center;">Piece it Together (PiT) combines different input parts to generate a complete concept in a prior domain.</div>'
+    )
+    with gr.Row(equal_height=True, elem_classes="justified-element"):
+        with gr.Column(scale=0, min_width=BLOCK_WIDTH):
+            part_1 = gr.Image(
+                label="Upload part 1 (or keep empty)", type="filepath", width=BLOCK_WIDTH, height=BLOCK_HEIGHT
+            )
+        with gr.Column(scale=0, min_width=BLOCK_WIDTH):
+            part_2 = gr.Image(
+                label="Upload part 2 (or keep empty)", type="filepath", width=BLOCK_WIDTH, height=BLOCK_HEIGHT
+            )
+        with gr.Column(scale=0, min_width=BLOCK_WIDTH):
+            part_3 = gr.Image(
+                label="Upload part 3 (or keep empty)", type="filepath", width=BLOCK_WIDTH, height=BLOCK_HEIGHT
+            )
+        with gr.Column(scale=0, min_width=BLOCK_WIDTH):
+            output_eq_1 = gr.Image(label="Output", width=BLOCK_WIDTH, height=BLOCK_HEIGHT)
+    with gr.Row(equal_height=True, elem_classes="justified-element"):
+        run_button = gr.Button("Create your character!", elem_classes="small-elem")
+        run_button.click(fn=run_character_generation, inputs=[part_1, part_2, part_3], outputs=[output_eq_1])
+    with gr.Row(equal_height=True, elem_classes="justified-element"):
+        pass
+    with gr.Row(equal_height=True, elem_classes="justified-element"):
+        with gr.Column(scale=1):
+            examples = [
+                [
+                    "assets/characters_parts/part_a.jpg",
+                    "assets/characters_parts/part_b.jpg",
+                    "assets/characters_parts/part_c.jpg",
+                ]
+            ]
+            gr.Examples(
+                examples=examples,
+                inputs=[part_1, part_2, part_3],
+                outputs=[output_eq_1],
+                fn=run_character_generation,
+                cache_examples=False,
+            )
+demo.queue().launch(share=True)

assets/characters_parts/part_a.jpg ADDED Viewed

assets/characters_parts/part_b.jpg ADDED Viewed

assets/characters_parts/part_c.jpg ADDED Viewed

ip_adapter/__init__.py ADDED Viewed

	@@ -0,0 +1,9 @@

+from .ip_adapter import IPAdapter, IPAdapterPlus, IPAdapterPlusXL, IPAdapterXL, IPAdapterFull
+__all__ = [
+    "IPAdapter",
+    "IPAdapterPlus",
+    "IPAdapterPlusXL",
+    "IPAdapterXL",
+    "IPAdapterFull",
+]

ip_adapter/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (362 Bytes). View file

ip_adapter/__pycache__/attention_processor.cpython-312.pyc ADDED Viewed

Binary file (21 kB). View file

ip_adapter/__pycache__/ip_adapter.cpython-312.pyc ADDED Viewed

Binary file (22.3 kB). View file

ip_adapter/__pycache__/resampler.cpython-312.pyc ADDED Viewed

Binary file (7.82 kB). View file

ip_adapter/__pycache__/utils.cpython-312.pyc ADDED Viewed

Binary file (4.69 kB). View file

ip_adapter/attention_processor.py ADDED Viewed

	@@ -0,0 +1,568 @@

+# modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class AttnProcessor(nn.Module):
+    r"""
+    Default processor for performing attention-related computations.
+    """
+    def __init__(
+        self,
+        hidden_size=None,
+        cross_attention_dim=None,
+    ):
+        super().__init__()
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+        *args,
+        **kwargs,
+    ):
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+class IPAttnProcessor(nn.Module):
+    r"""
+    Attention processor for IP-Adapater.
+    Args:
+        hidden_size (`int`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`):
+            The number of channels in the `encoder_hidden_states`.
+        scale (`float`, defaults to 1.0):
+            the weight scale of image prompt.
+        num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
+            The context length of the image features.
+    """
+    def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.scale = scale
+        self.num_tokens = num_tokens
+        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+        *args,
+        **kwargs,
+    ):
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            # get encoder_hidden_states, ip_hidden_states
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states, ip_hidden_states = (
+                encoder_hidden_states[:, :end_pos, :],
+                encoder_hidden_states[:, end_pos:, :],
+            )
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # for ip-adapter
+        ip_key = self.to_k_ip(ip_hidden_states)
+        ip_value = self.to_v_ip(ip_hidden_states)
+        ip_key = attn.head_to_batch_dim(ip_key)
+        ip_value = attn.head_to_batch_dim(ip_value)
+        ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
+        self.attn_map = ip_attention_probs
+        ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
+        ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states)
+        hidden_states = hidden_states + self.scale * ip_hidden_states
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+class AttnProcessor2_0(torch.nn.Module):
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    """
+    def __init__(
+        self,
+        hidden_size=None,
+        cross_attention_dim=None,
+    ):
+        super().__init__()
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+        *args,
+        **kwargs,
+    ):
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+class IPAttnProcessor2_0(torch.nn.Module):
+    r"""
+    Attention processor for IP-Adapater for PyTorch 2.0.
+    Args:
+        hidden_size (`int`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`):
+            The number of channels in the `encoder_hidden_states`.
+        scale (`float`, defaults to 1.0):
+            the weight scale of image prompt.
+        num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
+            The context length of the image features.
+    """
+    def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4):
+        super().__init__()
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.scale = scale
+        self.num_tokens = num_tokens
+        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+        *args,
+        **kwargs,
+    ):
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            # get encoder_hidden_states, ip_hidden_states
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states, ip_hidden_states = (
+                encoder_hidden_states[:, :end_pos, :],
+                encoder_hidden_states[:, end_pos:, :],
+            )
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        # for ip-adapter
+        ip_key = self.to_k_ip(ip_hidden_states)
+        ip_value = self.to_v_ip(ip_hidden_states)
+        ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        ip_hidden_states = F.scaled_dot_product_attention(
+            query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
+        )
+        with torch.no_grad():
+            self.attn_map = query @ ip_key.transpose(-2, -1).softmax(dim=-1)
+            #print(self.attn_map.shape)
+        ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        ip_hidden_states = ip_hidden_states.to(query.dtype)
+        hidden_states = hidden_states + self.scale * ip_hidden_states
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+## for controlnet
+class CNAttnProcessor:
+    r"""
+    Default processor for performing attention-related computations.
+    """
+    def __init__(self, num_tokens=4):
+        self.num_tokens = num_tokens
+    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, temb=None, *args, **kwargs,):
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states = encoder_hidden_states[:, :end_pos]  # only use text
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+class CNAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    """
+    def __init__(self, num_tokens=4):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+        self.num_tokens = num_tokens
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+        *args,
+        **kwargs,
+    ):
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states = encoder_hidden_states[:, :end_pos]  # only use text
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states

ip_adapter/attention_processor_faceid.py ADDED Viewed

	@@ -0,0 +1,433 @@

+# modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from diffusers.models.lora import LoRALinearLayer
+class LoRAAttnProcessor(nn.Module):
+    r"""
+    Default processor for performing attention-related computations.
+    """
+    def __init__(
+        self,
+        hidden_size=None,
+        cross_attention_dim=None,
+        rank=4,
+        network_alpha=None,
+        lora_scale=1.0,
+    ):
+        super().__init__()
+        self.rank = rank
+        self.lora_scale = lora_scale
+        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+        *args,
+        **kwargs,
+    ):
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+class LoRAIPAttnProcessor(nn.Module):
+    r"""
+    Attention processor for IP-Adapater.
+    Args:
+        hidden_size (`int`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`):
+            The number of channels in the `encoder_hidden_states`.
+        scale (`float`, defaults to 1.0):
+            the weight scale of image prompt.
+        num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
+            The context length of the image features.
+    """
+    def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None, lora_scale=1.0, scale=1.0, num_tokens=4):
+        super().__init__()
+        self.rank = rank
+        self.lora_scale = lora_scale
+        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.scale = scale
+        self.num_tokens = num_tokens
+        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+        *args,
+        **kwargs,
+    ):
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            # get encoder_hidden_states, ip_hidden_states
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states, ip_hidden_states = (
+                encoder_hidden_states[:, :end_pos, :],
+                encoder_hidden_states[:, end_pos:, :],
+            )
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # for ip-adapter
+        ip_key = self.to_k_ip(ip_hidden_states)
+        ip_value = self.to_v_ip(ip_hidden_states)
+        ip_key = attn.head_to_batch_dim(ip_key)
+        ip_value = attn.head_to_batch_dim(ip_value)
+        ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
+        self.attn_map = ip_attention_probs
+        ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
+        ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states)
+        hidden_states = hidden_states + self.scale * ip_hidden_states
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+class LoRAAttnProcessor2_0(nn.Module):
+    r"""
+    Default processor for performing attention-related computations.
+    """
+    def __init__(
+        self,
+        hidden_size=None,
+        cross_attention_dim=None,
+        rank=4,
+        network_alpha=None,
+        lora_scale=1.0,
+    ):
+        super().__init__()
+        self.rank = rank
+        self.lora_scale = lora_scale
+        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+        *args,
+        **kwargs,
+    ):
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+class LoRAIPAttnProcessor2_0(nn.Module):
+    r"""
+    Processor for implementing the LoRA attention mechanism.
+    Args:
+        hidden_size (`int`, *optional*):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`, *optional*):
+            The number of channels in the `encoder_hidden_states`.
+        rank (`int`, defaults to 4):
+            The dimension of the LoRA update matrices.
+        network_alpha (`int`, *optional*):
+            Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs.
+    """
+    def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None, lora_scale=1.0, scale=1.0, num_tokens=4):
+        super().__init__()
+        self.rank = rank
+        self.lora_scale = lora_scale
+        self.num_tokens = num_tokens
+        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.scale = scale
+        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+    def __call__(
+        self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0, temb=None, *args, **kwargs,
+    ):
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)
+        #query = attn.head_to_batch_dim(query)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        else:
+            # get encoder_hidden_states, ip_hidden_states
+            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+            encoder_hidden_states, ip_hidden_states = (
+                encoder_hidden_states[:, :end_pos, :],
+                encoder_hidden_states[:, end_pos:, :],
+            )
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        # for text
+        key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        # for ip
+        ip_key = self.to_k_ip(ip_hidden_states)
+        ip_value = self.to_v_ip(ip_hidden_states)
+        ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        ip_hidden_states = F.scaled_dot_product_attention(
+            query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
+        )
+        ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        ip_hidden_states = ip_hidden_states.to(query.dtype)
+        hidden_states = hidden_states + self.scale * ip_hidden_states
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states

ip_adapter/custom_pipelines.py ADDED Viewed

	@@ -0,0 +1,394 @@

+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import torch
+from diffusers import StableDiffusionXLPipeline
+from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
+from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import rescale_noise_cfg
+from .utils import is_torch2_available
+if is_torch2_available():
+    from .attention_processor import IPAttnProcessor2_0 as IPAttnProcessor
+else:
+    from .attention_processor import IPAttnProcessor
+class StableDiffusionXLCustomPipeline(StableDiffusionXLPipeline):
+    def set_scale(self, scale):
+        for attn_processor in self.unet.attn_processors.values():
+            if isinstance(attn_processor, IPAttnProcessor):
+                attn_processor.scale = scale
+    @torch.no_grad()
+    def __call__(  # noqa: C901
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        control_guidance_start: float = 0.0,
+        control_guidance_end: float = 1.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.7):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            control_guidance_start (`float`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+        Examples:
+        Returns:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        )
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # 3. Encode input prompt
+        text_encoder_lora_scale = (
+            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+        )
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+        # 8. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        # 7.1 Apply denoising_end
+        if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1:
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+            timesteps = timesteps[:num_inference_steps]
+        # get init conditioning scale
+        for attn_processor in self.unet.attn_processors.values():
+            if isinstance(attn_processor, IPAttnProcessor):
+                conditioning_scale = attn_processor.scale
+                break
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if (i / len(timesteps) < control_guidance_start) or ((i + 1) / len(timesteps) > control_guidance_end):
+                    self.set_scale(0.0)
+                else:
+                    self.set_scale(conditioning_scale)
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                # predict the noise residual
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                if do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+        if output_type != "latent":
+            # apply watermark if available
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+            image = self.image_processor.postprocess(image, output_type=output_type)
+        # Offload all models
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (image,)
+        return StableDiffusionXLPipelineOutput(images=image)

ip_adapter/ip_adapter.py ADDED Viewed

	@@ -0,0 +1,457 @@

+import os
+from typing import List
+import torch
+from diffusers import StableDiffusionPipeline
+from diffusers.pipelines.controlnet import MultiControlNetModel
+from PIL import Image
+from safetensors import safe_open
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+from .utils import is_torch2_available, get_generator
+if is_torch2_available():
+    from .attention_processor import (
+        AttnProcessor2_0 as AttnProcessor,
+    )
+    from .attention_processor import (
+        CNAttnProcessor2_0 as CNAttnProcessor,
+    )
+    from .attention_processor import (
+        IPAttnProcessor2_0 as IPAttnProcessor,
+    )
+else:
+    from .attention_processor import AttnProcessor, CNAttnProcessor, IPAttnProcessor
+from .resampler import Resampler
+class ImageProjModel(torch.nn.Module):
+    """Projection Model"""
+    def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4):
+        super().__init__()
+        self.generator = None
+        self.cross_attention_dim = cross_attention_dim
+        self.clip_extra_context_tokens = clip_extra_context_tokens
+        self.proj = torch.nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim)
+        self.norm = torch.nn.LayerNorm(cross_attention_dim)
+    def forward(self, image_embeds):
+        embeds = image_embeds
+        clip_extra_context_tokens = self.proj(embeds).reshape(
+            -1, self.clip_extra_context_tokens, self.cross_attention_dim
+        )
+        clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
+        return clip_extra_context_tokens
+class MLPProjModel(torch.nn.Module):
+    """SD model with image prompt"""
+    def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024):
+        super().__init__()
+        self.proj = torch.nn.Sequential(
+            torch.nn.Linear(clip_embeddings_dim, clip_embeddings_dim),
+            torch.nn.GELU(),
+            torch.nn.Linear(clip_embeddings_dim, cross_attention_dim),
+            torch.nn.LayerNorm(cross_attention_dim),
+        )
+    def forward(self, image_embeds):
+        clip_extra_context_tokens = self.proj(image_embeds)
+        return clip_extra_context_tokens
+class IPAdapter:
+    def __init__(self, sd_pipe, image_encoder_path, ip_ckpt, device, num_tokens=4):
+        self.device = device
+        self.image_encoder_path = image_encoder_path
+        self.ip_ckpt = ip_ckpt
+        self.num_tokens = num_tokens
+        self.pipe = sd_pipe.to(self.device)
+        self.set_ip_adapter()
+        # load image encoder
+        # self.image_encoder = CLIPVisionModelWithProjection.from_pretrained("h94/IP-Adapter", subfolder="models/image_encoder").to(
+        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            "h94/IP-Adapter", subfolder=image_encoder_path
+        ).to(self.device, dtype=torch.float16)
+        self.clip_image_processor = CLIPImageProcessor()
+        # image proj model
+        self.image_proj_model = self.init_proj()
+        self.load_ip_adapter()
+    def init_proj(self):
+        image_proj_model = ImageProjModel(
+            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
+            clip_embeddings_dim=self.image_encoder.config.projection_dim,
+            clip_extra_context_tokens=self.num_tokens,
+        ).to(self.device, dtype=torch.float16)
+        return image_proj_model
+    def set_ip_adapter(self):
+        unet = self.pipe.unet
+        attn_procs = {}
+        for name in unet.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]
+            if cross_attention_dim is None:
+                attn_procs[name] = AttnProcessor()
+            else:
+                attn_procs[name] = IPAttnProcessor(
+                    hidden_size=hidden_size,
+                    cross_attention_dim=cross_attention_dim,
+                    scale=1.0,
+                    num_tokens=self.num_tokens,
+                ).to(self.device, dtype=torch.float16)
+        unet.set_attn_processor(attn_procs)
+        if hasattr(self.pipe, "controlnet"):
+            if isinstance(self.pipe.controlnet, MultiControlNetModel):
+                for controlnet in self.pipe.controlnet.nets:
+                    controlnet.set_attn_processor(CNAttnProcessor(num_tokens=self.num_tokens))
+            else:
+                self.pipe.controlnet.set_attn_processor(CNAttnProcessor(num_tokens=self.num_tokens))
+    def load_ip_adapter(self):
+        if os.path.splitext(self.ip_ckpt)[-1] == ".safetensors":
+            state_dict = {"image_proj": {}, "ip_adapter": {}}
+            with safe_open(self.ip_ckpt, framework="pt", device="cpu") as f:
+                for key in f.keys():
+                    if key.startswith("image_proj."):
+                        state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
+                    elif key.startswith("ip_adapter."):
+                        state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
+        else:
+            state_dict = torch.load(self.ip_ckpt, map_location="cpu")
+        self.image_proj_model.load_state_dict(state_dict["image_proj"])
+        ip_layers = torch.nn.ModuleList(self.pipe.unet.attn_processors.values())
+        ip_layers.load_state_dict(state_dict["ip_adapter"])
+    def save_ip_adapter(self, save_path):
+        state_dict = {
+            "image_proj": self.image_proj_model.state_dict(),
+            "ip_adapter": torch.nn.ModuleList(self.pipe.unet.attn_processors.values()).state_dict(),
+        }
+        torch.save(state_dict, save_path)
+    @torch.inference_mode()
+    def get_image_embeds(self, pil_image=None, clip_image_embeds=None):
+        if pil_image is not None:
+            if isinstance(pil_image, Image.Image):
+                pil_image = [pil_image]
+            clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
+            clip_image_embeds = self.image_encoder(clip_image.to(self.device, dtype=torch.float16)).image_embeds
+        else:
+            clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.float16)
+        image_prompt_embeds = self.image_proj_model(clip_image_embeds)
+        uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(clip_image_embeds))
+        return image_prompt_embeds, uncond_image_prompt_embeds
+    def set_scale(self, scale):
+        for attn_processor in self.pipe.unet.attn_processors.values():
+            if isinstance(attn_processor, IPAttnProcessor):
+                attn_processor.scale = scale
+    def generate(
+        self,
+        pil_image=None,
+        clip_image_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        guidance_scale=7.5,
+        num_inference_steps=30,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+        if pil_image is not None:
+            num_prompts = 1 if isinstance(pil_image, Image.Image) else len(pil_image)
+        else:
+            num_prompts = clip_image_embeds.size(0)
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(
+            pil_image=pil_image, clip_image_embeds=clip_image_embeds
+        )
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        with torch.inference_mode():
+            prompt_embeds_, negative_prompt_embeds_ = self.pipe.encode_prompt(
+                prompt,
+                device=self.device,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds_, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds_, uncond_image_prompt_embeds], dim=1)
+        generator = get_generator(seed, self.device)
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            **kwargs,
+        ).images
+        return images
+class IPAdapterXL(IPAdapter):
+    """SDXL"""
+    def generate(
+        self,
+        pil_image,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        num_inference_steps=30,
+        image_prompt_embeds=None,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+        if pil_image is not None:
+            num_prompts = 1 if isinstance(pil_image, Image.Image) else len(pil_image)
+        else:
+            num_prompts = 1
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+        if pil_image is None:
+            assert image_prompt_embeds is not None
+            clip_image = self.clip_image_processor(
+                images=[Image.new("RGB", (128, 128))], return_tensors="pt"
+            ).pixel_values
+            clip_image = clip_image.to(self.device, dtype=torch.float16)
+            uncond_clip_image_embeds = self.image_encoder(torch.zeros_like(clip_image)).image_embeds
+            uncond_image_prompt_embeds = self.image_proj_model(uncond_clip_image_embeds)
+        else:
+            image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(pil_image)
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        with torch.inference_mode():
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+                pooled_prompt_embeds,
+                negative_pooled_prompt_embeds,
+            ) = self.pipe.encode_prompt(
+                prompt,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1)
+        self.generator = get_generator(seed, self.device)
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            num_inference_steps=num_inference_steps,
+            generator=self.generator,
+            **kwargs,
+        ).images
+        return images
+class IPAdapterPlus(IPAdapter):
+    """IP-Adapter with fine-grained features"""
+    def init_proj(self):
+        image_proj_model = Resampler(
+            dim=self.pipe.unet.config.cross_attention_dim,
+            depth=4,
+            dim_head=64,
+            heads=12,
+            num_queries=self.num_tokens,
+            embedding_dim=self.image_encoder.config.hidden_size,
+            output_dim=self.pipe.unet.config.cross_attention_dim,
+            ff_mult=4,
+        ).to(self.device, dtype=torch.float16)
+        return image_proj_model
+    @torch.inference_mode()
+    def get_image_embeds(self, pil_image=None, clip_image_embeds=None):
+        if isinstance(pil_image, Image.Image):
+            pil_image = [pil_image]
+        clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
+        clip_image = clip_image.to(self.device, dtype=torch.float16)
+        clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
+        image_prompt_embeds = self.image_proj_model(clip_image_embeds)
+        uncond_clip_image_embeds = self.image_encoder(
+            torch.zeros_like(clip_image), output_hidden_states=True
+        ).hidden_states[-2]
+        uncond_image_prompt_embeds = self.image_proj_model(uncond_clip_image_embeds)
+        return image_prompt_embeds, uncond_image_prompt_embeds
+class IPAdapterFull(IPAdapterPlus):
+    """IP-Adapter with full features"""
+    def init_proj(self):
+        image_proj_model = MLPProjModel(
+            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
+            clip_embeddings_dim=self.image_encoder.config.hidden_size,
+        ).to(self.device, dtype=torch.float16)
+        return image_proj_model
+class IPAdapterPlusXL(IPAdapter):
+    """SDXL"""
+    def init_proj(self):
+        image_proj_model = Resampler(
+            dim=1280,
+            depth=4,
+            dim_head=64,
+            heads=20,
+            num_queries=self.num_tokens,
+            embedding_dim=self.image_encoder.config.hidden_size,
+            output_dim=self.pipe.unet.config.cross_attention_dim,
+            ff_mult=4,
+        ).to(self.device, dtype=torch.float16)
+        return image_proj_model
+    @torch.inference_mode()
+    def get_image_embeds(self, pil_image, skip_uncond=False):
+        if isinstance(pil_image, Image.Image):
+            pil_image = [pil_image]
+            clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
+        else:
+            clip_image = pil_image
+        clip_image = clip_image.to(self.device, dtype=torch.float16)
+        clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
+        image_prompt_embeds = self.image_proj_model(clip_image_embeds)
+        if skip_uncond:
+            return image_prompt_embeds
+        uncond_clip_image_embeds = self.image_encoder(
+            torch.zeros_like(clip_image), output_hidden_states=True
+        ).hidden_states[-2]
+        uncond_image_prompt_embeds = self.image_proj_model(uncond_clip_image_embeds)
+        return image_prompt_embeds, uncond_image_prompt_embeds
+    def get_uncond_embeds(self):
+        clip_image = self.clip_image_processor(images=[Image.new("RGB", (128, 128))], return_tensors="pt").pixel_values
+        clip_image = clip_image.to(self.device, dtype=torch.float16)
+        uncond_clip_image_embeds = self.image_encoder(
+            torch.zeros_like(clip_image), output_hidden_states=True
+        ).hidden_states[-2]
+        uncond_image_prompt_embeds = self.image_proj_model(uncond_clip_image_embeds)
+        return uncond_image_prompt_embeds
+    def generate(
+        self,
+        pil_image=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        num_inference_steps=30,
+        image_prompt_embeds=None,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+        num_prompts = 1  # if isinstance(pil_image, Image.Image) else len(pil_image)
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+        if image_prompt_embeds is None:
+            image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(pil_image)
+        else:
+            uncond_image_prompt_embeds = self.get_uncond_embeds()
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        with torch.inference_mode():
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+                pooled_prompt_embeds,
+                negative_pooled_prompt_embeds,
+            ) = self.pipe.encode_prompt(
+                prompt,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1)
+        generator = get_generator(seed, self.device)
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            **kwargs,
+        ).images
+        return images

ip_adapter/ip_adapter_faceid.py ADDED Viewed

	@@ -0,0 +1,542 @@

+import os
+from typing import List
+import torch
+from diffusers import StableDiffusionPipeline
+from diffusers.pipelines.controlnet import MultiControlNetModel
+from PIL import Image
+from safetensors import safe_open
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+from .attention_processor_faceid import LoRAAttnProcessor, LoRAIPAttnProcessor
+from .utils import is_torch2_available, get_generator
+USE_DAFAULT_ATTN = False # should be True for visualization_attnmap
+if is_torch2_available() and (not USE_DAFAULT_ATTN):
+    from .attention_processor_faceid import (
+        LoRAAttnProcessor2_0 as LoRAAttnProcessor,
+    )
+    from .attention_processor_faceid import (
+        LoRAIPAttnProcessor2_0 as LoRAIPAttnProcessor,
+    )
+else:
+    from .attention_processor_faceid import LoRAAttnProcessor, LoRAIPAttnProcessor
+from .resampler import PerceiverAttention, FeedForward
+class FacePerceiverResampler(torch.nn.Module):
+    def __init__(
+        self,
+        *,
+        dim=768,
+        depth=4,
+        dim_head=64,
+        heads=16,
+        embedding_dim=1280,
+        output_dim=768,
+        ff_mult=4,
+    ):
+        super().__init__()
+        self.proj_in = torch.nn.Linear(embedding_dim, dim)
+        self.proj_out = torch.nn.Linear(dim, output_dim)
+        self.norm_out = torch.nn.LayerNorm(output_dim)
+        self.layers = torch.nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                torch.nn.ModuleList(
+                    [
+                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                        FeedForward(dim=dim, mult=ff_mult),
+                    ]
+                )
+            )
+    def forward(self, latents, x):
+        x = self.proj_in(x)
+        for attn, ff in self.layers:
+            latents = attn(x, latents) + latents
+            latents = ff(latents) + latents
+        latents = self.proj_out(latents)
+        return self.norm_out(latents)
+class MLPProjModel(torch.nn.Module):
+    def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, num_tokens=4):
+        super().__init__()
+        self.cross_attention_dim = cross_attention_dim
+        self.num_tokens = num_tokens
+        self.proj = torch.nn.Sequential(
+            torch.nn.Linear(id_embeddings_dim, id_embeddings_dim*2),
+            torch.nn.GELU(),
+            torch.nn.Linear(id_embeddings_dim*2, cross_attention_dim*num_tokens),
+        )
+        self.norm = torch.nn.LayerNorm(cross_attention_dim)
+    def forward(self, id_embeds):
+        x = self.proj(id_embeds)
+        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
+        x = self.norm(x)
+        return x
+class ProjPlusModel(torch.nn.Module):
+    def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, clip_embeddings_dim=1280, num_tokens=4):
+        super().__init__()
+        self.cross_attention_dim = cross_attention_dim
+        self.num_tokens = num_tokens
+        self.proj = torch.nn.Sequential(
+            torch.nn.Linear(id_embeddings_dim, id_embeddings_dim*2),
+            torch.nn.GELU(),
+            torch.nn.Linear(id_embeddings_dim*2, cross_attention_dim*num_tokens),
+        )
+        self.norm = torch.nn.LayerNorm(cross_attention_dim)
+        self.perceiver_resampler = FacePerceiverResampler(
+            dim=cross_attention_dim,
+            depth=4,
+            dim_head=64,
+            heads=cross_attention_dim // 64,
+            embedding_dim=clip_embeddings_dim,
+            output_dim=cross_attention_dim,
+            ff_mult=4,
+        )
+    def forward(self, id_embeds, clip_embeds, shortcut=False, scale=1.0):
+        x = self.proj(id_embeds)
+        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
+        x = self.norm(x)
+        out = self.perceiver_resampler(x, clip_embeds)
+        if shortcut:
+            out = x + scale * out
+        return out
+class IPAdapterFaceID:
+    def __init__(self, sd_pipe, ip_ckpt, device, lora_rank=128, num_tokens=4, torch_dtype=torch.float16):
+        self.device = device
+        self.ip_ckpt = ip_ckpt
+        self.lora_rank = lora_rank
+        self.num_tokens = num_tokens
+        self.torch_dtype = torch_dtype
+        self.pipe = sd_pipe.to(self.device)
+        self.set_ip_adapter()
+        # image proj model
+        self.image_proj_model = self.init_proj()
+        self.load_ip_adapter()
+    def init_proj(self):
+        image_proj_model = MLPProjModel(
+            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
+            id_embeddings_dim=512,
+            num_tokens=self.num_tokens,
+        ).to(self.device, dtype=self.torch_dtype)
+        return image_proj_model
+    def set_ip_adapter(self):
+        unet = self.pipe.unet
+        attn_procs = {}
+        for name in unet.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]
+            if cross_attention_dim is None:
+                attn_procs[name] = LoRAAttnProcessor(
+                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=self.lora_rank,
+                ).to(self.device, dtype=self.torch_dtype)
+            else:
+                attn_procs[name] = LoRAIPAttnProcessor(
+                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0, rank=self.lora_rank, num_tokens=self.num_tokens,
+                ).to(self.device, dtype=self.torch_dtype)
+        unet.set_attn_processor(attn_procs)
+    def load_ip_adapter(self):
+        if os.path.splitext(self.ip_ckpt)[-1] == ".safetensors":
+            state_dict = {"image_proj": {}, "ip_adapter": {}}
+            with safe_open(self.ip_ckpt, framework="pt", device="cpu") as f:
+                for key in f.keys():
+                    if key.startswith("image_proj."):
+                        state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
+                    elif key.startswith("ip_adapter."):
+                        state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
+        else:
+            state_dict = torch.load(self.ip_ckpt, map_location="cpu")
+        self.image_proj_model.load_state_dict(state_dict["image_proj"])
+        ip_layers = torch.nn.ModuleList(self.pipe.unet.attn_processors.values())
+        ip_layers.load_state_dict(state_dict["ip_adapter"])
+    @torch.inference_mode()
+    def get_image_embeds(self, faceid_embeds):
+        faceid_embeds = faceid_embeds.to(self.device, dtype=self.torch_dtype)
+        image_prompt_embeds = self.image_proj_model(faceid_embeds)
+        uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(faceid_embeds))
+        return image_prompt_embeds, uncond_image_prompt_embeds
+    def set_scale(self, scale):
+        for attn_processor in self.pipe.unet.attn_processors.values():
+            if isinstance(attn_processor, LoRAIPAttnProcessor):
+                attn_processor.scale = scale
+    def generate(
+        self,
+        faceid_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        guidance_scale=7.5,
+        num_inference_steps=30,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+        num_prompts = faceid_embeds.size(0)
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds)
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        with torch.inference_mode():
+            prompt_embeds_, negative_prompt_embeds_ = self.pipe.encode_prompt(
+                prompt,
+                device=self.device,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds_, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds_, uncond_image_prompt_embeds], dim=1)
+        generator = get_generator(seed, self.device)
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            **kwargs,
+        ).images
+        return images
+class IPAdapterFaceIDPlus:
+    def __init__(self, sd_pipe, image_encoder_path, ip_ckpt, device, lora_rank=128, num_tokens=4, torch_dtype=torch.float16):
+        self.device = device
+        self.image_encoder_path = image_encoder_path
+        self.ip_ckpt = ip_ckpt
+        self.lora_rank = lora_rank
+        self.num_tokens = num_tokens
+        self.torch_dtype = torch_dtype
+        self.pipe = sd_pipe.to(self.device)
+        self.set_ip_adapter()
+        # load image encoder
+        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path).to(
+            self.device, dtype=self.torch_dtype
+        )
+        self.clip_image_processor = CLIPImageProcessor()
+        # image proj model
+        self.image_proj_model = self.init_proj()
+        self.load_ip_adapter()
+    def init_proj(self):
+        image_proj_model = ProjPlusModel(
+            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
+            id_embeddings_dim=512,
+            clip_embeddings_dim=self.image_encoder.config.hidden_size,
+            num_tokens=self.num_tokens,
+        ).to(self.device, dtype=self.torch_dtype)
+        return image_proj_model
+    def set_ip_adapter(self):
+        unet = self.pipe.unet
+        attn_procs = {}
+        for name in unet.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]
+            if cross_attention_dim is None:
+                attn_procs[name] = LoRAAttnProcessor(
+                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=self.lora_rank,
+                ).to(self.device, dtype=self.torch_dtype)
+            else:
+                attn_procs[name] = LoRAIPAttnProcessor(
+                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0, rank=self.lora_rank, num_tokens=self.num_tokens,
+                ).to(self.device, dtype=self.torch_dtype)
+        unet.set_attn_processor(attn_procs)
+    def load_ip_adapter(self):
+        if os.path.splitext(self.ip_ckpt)[-1] == ".safetensors":
+            state_dict = {"image_proj": {}, "ip_adapter": {}}
+            with safe_open(self.ip_ckpt, framework="pt", device="cpu") as f:
+                for key in f.keys():
+                    if key.startswith("image_proj."):
+                        state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
+                    elif key.startswith("ip_adapter."):
+                        state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
+        else:
+            state_dict = torch.load(self.ip_ckpt, map_location="cpu")
+        self.image_proj_model.load_state_dict(state_dict["image_proj"])
+        ip_layers = torch.nn.ModuleList(self.pipe.unet.attn_processors.values())
+        ip_layers.load_state_dict(state_dict["ip_adapter"])
+    @torch.inference_mode()
+    def get_image_embeds(self, faceid_embeds, face_image, s_scale, shortcut):
+        if isinstance(face_image, Image.Image):
+            pil_image = [face_image]
+        clip_image = self.clip_image_processor(images=face_image, return_tensors="pt").pixel_values
+        clip_image = clip_image.to(self.device, dtype=self.torch_dtype)
+        clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
+        uncond_clip_image_embeds = self.image_encoder(
+            torch.zeros_like(clip_image), output_hidden_states=True
+        ).hidden_states[-2]
+        faceid_embeds = faceid_embeds.to(self.device, dtype=self.torch_dtype)
+        image_prompt_embeds = self.image_proj_model(faceid_embeds, clip_image_embeds, shortcut=shortcut, scale=s_scale)
+        uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(faceid_embeds), uncond_clip_image_embeds, shortcut=shortcut, scale=s_scale)
+        return image_prompt_embeds, uncond_image_prompt_embeds
+    def set_scale(self, scale):
+        for attn_processor in self.pipe.unet.attn_processors.values():
+            if isinstance(attn_processor, LoRAIPAttnProcessor):
+                attn_processor.scale = scale
+    def generate(
+        self,
+        face_image=None,
+        faceid_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        guidance_scale=7.5,
+        num_inference_steps=30,
+        s_scale=1.0,
+        shortcut=False,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+        num_prompts = faceid_embeds.size(0)
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds, face_image, s_scale, shortcut)
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        with torch.inference_mode():
+            prompt_embeds_, negative_prompt_embeds_ = self.pipe.encode_prompt(
+                prompt,
+                device=self.device,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds_, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds_, uncond_image_prompt_embeds], dim=1)
+        generator = get_generator(seed, self.device)
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            **kwargs,
+        ).images
+        return images
+class IPAdapterFaceIDXL(IPAdapterFaceID):
+    """SDXL"""
+    def generate(
+        self,
+        faceid_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        num_inference_steps=30,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+        num_prompts = faceid_embeds.size(0)
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds)
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        with torch.inference_mode():
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+                pooled_prompt_embeds,
+                negative_pooled_prompt_embeds,
+            ) = self.pipe.encode_prompt(
+                prompt,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1)
+        generator = get_generator(seed, self.device)
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            **kwargs,
+        ).images
+        return images
+class IPAdapterFaceIDPlusXL(IPAdapterFaceIDPlus):
+    """SDXL"""
+    def generate(
+        self,
+        face_image=None,
+        faceid_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        guidance_scale=7.5,
+        num_inference_steps=30,
+        s_scale=1.0,
+        shortcut=True,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+        num_prompts = faceid_embeds.size(0)
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds, face_image, s_scale, shortcut)
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        with torch.inference_mode():
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+                pooled_prompt_embeds,
+                negative_pooled_prompt_embeds,
+            ) = self.pipe.encode_prompt(
+                prompt,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1)
+        generator = get_generator(seed, self.device)
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            guidance_scale=guidance_scale,
+            **kwargs,
+        ).images
+        return images

ip_adapter/ip_adapter_faceid_separate.py ADDED Viewed

	@@ -0,0 +1,556 @@

+import os
+from typing import List
+import torch
+from diffusers import StableDiffusionPipeline
+from diffusers.pipelines.controlnet import MultiControlNetModel
+from PIL import Image
+from safetensors import safe_open
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+from .utils import is_torch2_available, get_generator
+USE_DAFAULT_ATTN = False # should be True for visualization_attnmap
+if is_torch2_available() and (not USE_DAFAULT_ATTN):
+    from .attention_processor import (
+        AttnProcessor2_0 as AttnProcessor,
+    )
+    from .attention_processor import (
+        IPAttnProcessor2_0 as IPAttnProcessor,
+    )
+else:
+    from .attention_processor import AttnProcessor, IPAttnProcessor
+from .resampler import PerceiverAttention, FeedForward
+class FacePerceiverResampler(torch.nn.Module):
+    def __init__(
+        self,
+        *,
+        dim=768,
+        depth=4,
+        dim_head=64,
+        heads=16,
+        embedding_dim=1280,
+        output_dim=768,
+        ff_mult=4,
+    ):
+        super().__init__()
+        self.proj_in = torch.nn.Linear(embedding_dim, dim)
+        self.proj_out = torch.nn.Linear(dim, output_dim)
+        self.norm_out = torch.nn.LayerNorm(output_dim)
+        self.layers = torch.nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                torch.nn.ModuleList(
+                    [
+                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                        FeedForward(dim=dim, mult=ff_mult),
+                    ]
+                )
+            )
+    def forward(self, latents, x):
+        x = self.proj_in(x)
+        for attn, ff in self.layers:
+            latents = attn(x, latents) + latents
+            latents = ff(latents) + latents
+        latents = self.proj_out(latents)
+        return self.norm_out(latents)
+class MLPProjModel(torch.nn.Module):
+    def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, num_tokens=4):
+        super().__init__()
+        self.cross_attention_dim = cross_attention_dim
+        self.num_tokens = num_tokens
+        self.proj = torch.nn.Sequential(
+            torch.nn.Linear(id_embeddings_dim, id_embeddings_dim*2),
+            torch.nn.GELU(),
+            torch.nn.Linear(id_embeddings_dim*2, cross_attention_dim*num_tokens),
+        )
+        self.norm = torch.nn.LayerNorm(cross_attention_dim)
+    def forward(self, id_embeds):
+        x = self.proj(id_embeds)
+        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
+        x = self.norm(x)
+        return x
+class ProjPlusModel(torch.nn.Module):
+    def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, clip_embeddings_dim=1280, num_tokens=4):
+        super().__init__()
+        self.cross_attention_dim = cross_attention_dim
+        self.num_tokens = num_tokens
+        self.proj = torch.nn.Sequential(
+            torch.nn.Linear(id_embeddings_dim, id_embeddings_dim*2),
+            torch.nn.GELU(),
+            torch.nn.Linear(id_embeddings_dim*2, cross_attention_dim*num_tokens),
+        )
+        self.norm = torch.nn.LayerNorm(cross_attention_dim)
+        self.perceiver_resampler = FacePerceiverResampler(
+            dim=cross_attention_dim,
+            depth=4,
+            dim_head=64,
+            heads=cross_attention_dim // 64,
+            embedding_dim=clip_embeddings_dim,
+            output_dim=cross_attention_dim,
+            ff_mult=4,
+        )
+    def forward(self, id_embeds, clip_embeds, shortcut=False, scale=1.0):
+        x = self.proj(id_embeds)
+        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
+        x = self.norm(x)
+        out = self.perceiver_resampler(x, clip_embeds)
+        if shortcut:
+            out = x + scale * out
+        return out
+class IPAdapterFaceID:
+    def __init__(self, sd_pipe, ip_ckpt, device, num_tokens=4, n_cond=1, torch_dtype=torch.float16):
+        self.device = device
+        self.ip_ckpt = ip_ckpt
+        self.num_tokens = num_tokens
+        self.n_cond = n_cond
+        self.torch_dtype = torch_dtype
+        self.pipe = sd_pipe.to(self.device)
+        self.set_ip_adapter()
+        # image proj model
+        self.image_proj_model = self.init_proj()
+        self.load_ip_adapter()
+    def init_proj(self):
+        image_proj_model = MLPProjModel(
+            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
+            id_embeddings_dim=512,
+            num_tokens=self.num_tokens,
+        ).to(self.device, dtype=self.torch_dtype)
+        return image_proj_model
+    def set_ip_adapter(self):
+        unet = self.pipe.unet
+        attn_procs = {}
+        for name in unet.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]
+            if cross_attention_dim is None:
+                attn_procs[name] = AttnProcessor()
+            else:
+                attn_procs[name] = IPAttnProcessor(
+                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0, num_tokens=self.num_tokens*self.n_cond,
+                ).to(self.device, dtype=self.torch_dtype)
+        unet.set_attn_processor(attn_procs)
+    def load_ip_adapter(self):
+        if os.path.splitext(self.ip_ckpt)[-1] == ".safetensors":
+            state_dict = {"image_proj": {}, "ip_adapter": {}}
+            with safe_open(self.ip_ckpt, framework="pt", device="cpu") as f:
+                for key in f.keys():
+                    if key.startswith("image_proj."):
+                        state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
+                    elif key.startswith("ip_adapter."):
+                        state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
+        else:
+            state_dict = torch.load(self.ip_ckpt, map_location="cpu")
+        self.image_proj_model.load_state_dict(state_dict["image_proj"])
+        ip_layers = torch.nn.ModuleList(self.pipe.unet.attn_processors.values())
+        ip_layers.load_state_dict(state_dict["ip_adapter"], strict=False)
+    @torch.inference_mode()
+    def get_image_embeds(self, faceid_embeds):
+        multi_face = False
+        if faceid_embeds.dim() == 3:
+            multi_face = True
+            b, n, c = faceid_embeds.shape
+            faceid_embeds = faceid_embeds.reshape(b*n, c)
+        faceid_embeds = faceid_embeds.to(self.device, dtype=self.torch_dtype)
+        image_prompt_embeds = self.image_proj_model(faceid_embeds)
+        uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(faceid_embeds))
+        if multi_face:
+            c = image_prompt_embeds.size(-1)
+            image_prompt_embeds = image_prompt_embeds.reshape(b, -1, c)
+            uncond_image_prompt_embeds = uncond_image_prompt_embeds.reshape(b, -1, c)
+        return image_prompt_embeds, uncond_image_prompt_embeds
+    def set_scale(self, scale):
+        for attn_processor in self.pipe.unet.attn_processors.values():
+            if isinstance(attn_processor, IPAttnProcessor):
+                attn_processor.scale = scale
+    def generate(
+        self,
+        faceid_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        guidance_scale=7.5,
+        num_inference_steps=30,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+        num_prompts = faceid_embeds.size(0)
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        else:
+            faceid_embeds = faceid_embeds.repeat(num_samples, 1, 1)
+            num_samples = 1
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds)
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        with torch.inference_mode():
+            prompt_embeds_, negative_prompt_embeds_ = self.pipe.encode_prompt(
+                prompt,
+                device=self.device,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds_, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds_, uncond_image_prompt_embeds], dim=1)
+        generator = get_generator(seed, self.device)
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            num_images_per_prompt=num_samples,
+            **kwargs,
+        ).images
+        return images
+class IPAdapterFaceIDPlus:
+    def __init__(self, sd_pipe, image_encoder_path, ip_ckpt, device, num_tokens=4, torch_dtype=torch.float16):
+        self.device = device
+        self.image_encoder_path = image_encoder_path
+        self.ip_ckpt = ip_ckpt
+        self.num_tokens = num_tokens
+        self.torch_dtype = torch_dtype
+        self.pipe = sd_pipe.to(self.device)
+        self.set_ip_adapter()
+        # load image encoder
+        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path).to(
+            self.device, dtype=self.torch_dtype
+        )
+        self.clip_image_processor = CLIPImageProcessor()
+        # image proj model
+        self.image_proj_model = self.init_proj()
+        self.load_ip_adapter()
+    def init_proj(self):
+        image_proj_model = ProjPlusModel(
+            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
+            id_embeddings_dim=512,
+            clip_embeddings_dim=self.image_encoder.config.hidden_size,
+            num_tokens=self.num_tokens,
+        ).to(self.device, dtype=self.torch_dtype)
+        return image_proj_model
+    def set_ip_adapter(self):
+        unet = self.pipe.unet
+        attn_procs = {}
+        for name in unet.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]
+            if cross_attention_dim is None:
+                attn_procs[name] = AttnProcessor()
+            else:
+                attn_procs[name] = IPAttnProcessor(
+                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0, num_tokens=self.num_tokens,
+                ).to(self.device, dtype=self.torch_dtype)
+        unet.set_attn_processor(attn_procs)
+    def load_ip_adapter(self):
+        if os.path.splitext(self.ip_ckpt)[-1] == ".safetensors":
+            state_dict = {"image_proj": {}, "ip_adapter": {}}
+            with safe_open(self.ip_ckpt, framework="pt", device="cpu") as f:
+                for key in f.keys():
+                    if key.startswith("image_proj."):
+                        state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
+                    elif key.startswith("ip_adapter."):
+                        state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
+        else:
+            state_dict = torch.load(self.ip_ckpt, map_location="cpu")
+        self.image_proj_model.load_state_dict(state_dict["image_proj"])
+        ip_layers = torch.nn.ModuleList(self.pipe.unet.attn_processors.values())
+        ip_layers.load_state_dict(state_dict["ip_adapter"], strict=False)
+    @torch.inference_mode()
+    def get_image_embeds(self, faceid_embeds, face_image, s_scale, shortcut):
+        if isinstance(face_image, Image.Image):
+            pil_image = [face_image]
+        clip_image = self.clip_image_processor(images=face_image, return_tensors="pt").pixel_values
+        clip_image = clip_image.to(self.device, dtype=self.torch_dtype)
+        clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
+        uncond_clip_image_embeds = self.image_encoder(
+            torch.zeros_like(clip_image), output_hidden_states=True
+        ).hidden_states[-2]
+        faceid_embeds = faceid_embeds.to(self.device, dtype=self.torch_dtype)
+        image_prompt_embeds = self.image_proj_model(faceid_embeds, clip_image_embeds, shortcut=shortcut, scale=s_scale)
+        uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(faceid_embeds), uncond_clip_image_embeds, shortcut=shortcut, scale=s_scale)
+        return image_prompt_embeds, uncond_image_prompt_embeds
+    def set_scale(self, scale):
+        for attn_processor in self.pipe.unet.attn_processors.values():
+            if isinstance(attn_processor, LoRAIPAttnProcessor):
+                attn_processor.scale = scale
+    def generate(
+        self,
+        face_image=None,
+        faceid_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        guidance_scale=7.5,
+        num_inference_steps=30,
+        s_scale=1.0,
+        shortcut=False,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+        num_prompts = faceid_embeds.size(0)
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds, face_image, s_scale, shortcut)
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        with torch.inference_mode():
+            prompt_embeds_, negative_prompt_embeds_ = self.pipe.encode_prompt(
+                prompt,
+                device=self.device,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds_, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds_, uncond_image_prompt_embeds], dim=1)
+        generator = get_generator(seed, self.device)
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            **kwargs,
+        ).images
+        return images
+class IPAdapterFaceIDXL(IPAdapterFaceID):
+    """SDXL"""
+    def generate(
+        self,
+        faceid_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        num_inference_steps=30,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+        num_prompts = faceid_embeds.size(0)
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        else:
+            faceid_embeds = faceid_embeds.repeat(num_samples, 1, 1)
+            num_samples = 1
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds)
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        with torch.inference_mode():
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+                pooled_prompt_embeds,
+                negative_pooled_prompt_embeds,
+            ) = self.pipe.encode_prompt(
+                prompt,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1)
+        generator = get_generator(seed, self.device)
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            num_images_per_prompt=num_samples,
+            **kwargs,
+        ).images
+        return images
+class IPAdapterFaceIDPlusXL(IPAdapterFaceIDPlus):
+    """SDXL"""
+    def generate(
+        self,
+        face_image=None,
+        faceid_embeds=None,
+        prompt=None,
+        negative_prompt=None,
+        scale=1.0,
+        num_samples=4,
+        seed=None,
+        guidance_scale=7.5,
+        num_inference_steps=30,
+        s_scale=1.0,
+        shortcut=True,
+        **kwargs,
+    ):
+        self.set_scale(scale)
+        num_prompts = faceid_embeds.size(0)
+        if prompt is None:
+            prompt = "best quality, high quality"
+        if negative_prompt is None:
+            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+        if not isinstance(prompt, List):
+            prompt = [prompt] * num_prompts
+        if not isinstance(negative_prompt, List):
+            negative_prompt = [negative_prompt] * num_prompts
+        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds, face_image, s_scale, shortcut)
+        bs_embed, seq_len, _ = image_prompt_embeds.shape
+        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
+        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
+        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
+        with torch.inference_mode():
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+                pooled_prompt_embeds,
+                negative_pooled_prompt_embeds,
+            ) = self.pipe.encode_prompt(
+                prompt,
+                num_images_per_prompt=num_samples,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+            prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1)
+        generator = get_generator(seed, self.device)
+        images = self.pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            guidance_scale=guidance_scale,
+            **kwargs,
+        ).images
+        return images

ip_adapter/resampler.py ADDED Viewed

	@@ -0,0 +1,158 @@

+# modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
+# and https://github.com/lucidrains/imagen-pytorch/blob/main/imagen_pytorch/imagen_pytorch.py
+import math
+import torch
+import torch.nn as nn
+from einops import rearrange
+from einops.layers.torch import Rearrange
+# FFN
+def FeedForward(dim, mult=4):
+    inner_dim = int(dim * mult)
+    return nn.Sequential(
+        nn.LayerNorm(dim),
+        nn.Linear(dim, inner_dim, bias=False),
+        nn.GELU(),
+        nn.Linear(inner_dim, dim, bias=False),
+    )
+def reshape_tensor(x, heads):
+    bs, length, width = x.shape
+    # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
+    x = x.view(bs, length, heads, -1)
+    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
+    x = x.transpose(1, 2)
+    # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
+    x = x.reshape(bs, heads, length, -1)
+    return x
+class PerceiverAttention(nn.Module):
+    def __init__(self, *, dim, dim_head=64, heads=8):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.dim_head = dim_head
+        self.heads = heads
+        inner_dim = dim_head * heads
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+    def forward(self, x, latents):
+        """
+        Args:
+            x (torch.Tensor): image features
+                shape (b, n1, D)
+            latent (torch.Tensor): latent features
+                shape (b, n2, D)
+        """
+        x = self.norm1(x)
+        latents = self.norm2(latents)
+        b, l, _ = latents.shape
+        q = self.to_q(latents)
+        kv_input = torch.cat((x, latents), dim=-2)
+        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+        q = reshape_tensor(q, self.heads)
+        k = reshape_tensor(k, self.heads)
+        v = reshape_tensor(v, self.heads)
+        # attention
+        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        out = weight @ v
+        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
+        return self.to_out(out)
+class Resampler(nn.Module):
+    def __init__(
+        self,
+        dim=1024,
+        depth=8,
+        dim_head=64,
+        heads=16,
+        num_queries=8,
+        embedding_dim=768,
+        output_dim=1024,
+        ff_mult=4,
+        max_seq_len: int = 257,  # CLIP tokens + CLS token
+        apply_pos_emb: bool = False,
+        num_latents_mean_pooled: int = 0,  # number of latents derived from mean pooled representation of the sequence
+    ):
+        super().__init__()
+        self.pos_emb = nn.Embedding(max_seq_len, embedding_dim) if apply_pos_emb else None
+        self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5)
+        self.proj_in = nn.Linear(embedding_dim, dim)
+        self.proj_out = nn.Linear(dim, output_dim)
+        self.norm_out = nn.LayerNorm(output_dim)
+        self.to_latents_from_mean_pooled_seq = (
+            nn.Sequential(
+                nn.LayerNorm(dim),
+                nn.Linear(dim, dim * num_latents_mean_pooled),
+                Rearrange("b (n d) -> b n d", n=num_latents_mean_pooled),
+            )
+            if num_latents_mean_pooled > 0
+            else None
+        )
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                nn.ModuleList(
+                    [
+                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                        FeedForward(dim=dim, mult=ff_mult),
+                    ]
+                )
+            )
+    def forward(self, x):
+        if self.pos_emb is not None:
+            n, device = x.shape[1], x.device
+            pos_emb = self.pos_emb(torch.arange(n, device=device))
+            x = x + pos_emb
+        latents = self.latents.repeat(x.size(0), 1, 1)
+        x = self.proj_in(x)
+        if self.to_latents_from_mean_pooled_seq:
+            meanpooled_seq = masked_mean(x, dim=1, mask=torch.ones(x.shape[:2], device=x.device, dtype=torch.bool))
+            meanpooled_latents = self.to_latents_from_mean_pooled_seq(meanpooled_seq)
+            latents = torch.cat((meanpooled_latents, latents), dim=-2)
+        for attn, ff in self.layers:
+            latents = attn(x, latents) + latents
+            latents = ff(latents) + latents
+        latents = self.proj_out(latents)
+        return self.norm_out(latents)
+def masked_mean(t, *, dim, mask=None):
+    if mask is None:
+        return t.mean(dim=dim)
+    denom = mask.sum(dim=dim, keepdim=True)
+    mask = rearrange(mask, "b n -> b n 1")
+    masked_t = t.masked_fill(~mask, 0.0)
+    return masked_t.sum(dim=dim) / denom.clamp(min=1e-5)

ip_adapter/sd3_attention_processor.py ADDED Viewed

	@@ -0,0 +1,179 @@

+from typing import Callable, List, Optional, Union
+import torch
+import torch.nn.functional as F
+from torch import nn
+from diffusers.models.attention_processor import Attention
+class JointAttnProcessor2_0:
+    """Attention processor used typically in processing the SD3-like self-attention projections."""
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.FloatTensor:
+        residual = hidden_states
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        context_input_ndim = encoder_hidden_states.ndim
+        if context_input_ndim == 4:
+            batch_size, channel, height, width = encoder_hidden_states.shape
+            encoder_hidden_states = encoder_hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        batch_size = encoder_hidden_states.shape[0]
+        # `sample` projections.
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+        # `context` projections.
+        encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
+        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+        # attention
+        query = torch.cat([query, encoder_hidden_states_query_proj], dim=1)
+        key = torch.cat([key, encoder_hidden_states_key_proj], dim=1)
+        value = torch.cat([value, encoder_hidden_states_value_proj], dim=1)
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        # Split the attention outputs.
+        hidden_states, encoder_hidden_states = (
+            hidden_states[:, : residual.shape[1]],
+            hidden_states[:, residual.shape[1] :],
+        )
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if not attn.context_pre_only:
+            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if context_input_ndim == 4:
+            encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        return hidden_states, encoder_hidden_states
+class IPJointAttnProcessor2_0(torch.nn.Module):
+    """Attention processor used typically in processing the SD3-like self-attention projections."""
+    def __init__(self, context_dim, hidden_dim, scale=1.0):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+        super().__init__()
+        self.scale = scale
+        self.add_k_proj_ip = nn.Linear(context_dim, hidden_dim)
+        self.add_v_proj_ip = nn.Linear(context_dim, hidden_dim)
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        ip_hidden_states: torch.FloatTensor = None,
+        *args,
+        **kwargs,
+    ) -> torch.FloatTensor:
+        residual = hidden_states
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        context_input_ndim = encoder_hidden_states.ndim
+        if context_input_ndim == 4:
+            batch_size, channel, height, width = encoder_hidden_states.shape
+            encoder_hidden_states = encoder_hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        batch_size = encoder_hidden_states.shape[0]
+        # `sample` projections.
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+        sample_query = query # latent query
+        # `context` projections.
+        encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
+        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+        # attention
+        query = torch.cat([query, encoder_hidden_states_query_proj], dim=1)
+        key = torch.cat([key, encoder_hidden_states_key_proj], dim=1)
+        value = torch.cat([value, encoder_hidden_states_value_proj], dim=1)
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        # Split the attention outputs.
+        hidden_states, encoder_hidden_states = (
+            hidden_states[:, : residual.shape[1]],
+            hidden_states[:, residual.shape[1] :],
+        )
+        # for ip-adapter
+        ip_key = self.add_k_proj_ip(ip_hidden_states)
+        ip_value = self.add_v_proj_ip(ip_hidden_states)
+        ip_query = sample_query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        ip_hidden_states = F.scaled_dot_product_attention(ip_query, ip_key, ip_value, dropout_p=0.0, is_causal=False)
+        ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        ip_hidden_states = ip_hidden_states.to(ip_query.dtype)
+        hidden_states = hidden_states + self.scale * ip_hidden_states
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if not attn.context_pre_only:
+            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if context_input_ndim == 4:
+            encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        return hidden_states, encoder_hidden_states

ip_adapter/test_resampler.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import torch
+from resampler import Resampler
+from transformers import CLIPVisionModel
+BATCH_SIZE = 2
+OUTPUT_DIM = 1280
+NUM_QUERIES = 8
+NUM_LATENTS_MEAN_POOLED = 4  # 0 for no mean pooling (previous behavior)
+APPLY_POS_EMB = True  # False for no positional embeddings (previous behavior)
+IMAGE_ENCODER_NAME_OR_PATH = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
+def main():
+    image_encoder = CLIPVisionModel.from_pretrained(IMAGE_ENCODER_NAME_OR_PATH)
+    embedding_dim = image_encoder.config.hidden_size
+    print(f"image_encoder hidden size: ", embedding_dim)
+    image_proj_model = Resampler(
+        dim=1024,
+        depth=2,
+        dim_head=64,
+        heads=16,
+        num_queries=NUM_QUERIES,
+        embedding_dim=embedding_dim,
+        output_dim=OUTPUT_DIM,
+        ff_mult=2,
+        max_seq_len=257,
+        apply_pos_emb=APPLY_POS_EMB,
+        num_latents_mean_pooled=NUM_LATENTS_MEAN_POOLED,
+    )
+    dummy_images = torch.randn(BATCH_SIZE, 3, 224, 224)
+    with torch.no_grad():
+        image_embeds = image_encoder(dummy_images, output_hidden_states=True).hidden_states[-2]
+    print("image_embds shape: ", image_embeds.shape)
+    with torch.no_grad():
+        ip_tokens = image_proj_model(image_embeds)
+    print("ip_tokens shape:", ip_tokens.shape)
+    assert ip_tokens.shape == (BATCH_SIZE, NUM_QUERIES + NUM_LATENTS_MEAN_POOLED, OUTPUT_DIM)
+if __name__ == "__main__":
+    main()

ip_adapter/utils.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import torch
+import torch.nn.functional as F
+import numpy as np
+from PIL import Image
+attn_maps = {}
+def hook_fn(name):
+    def forward_hook(module, input, output):
+        if hasattr(module.processor, "attn_map"):
+            attn_maps[name] = module.processor.attn_map
+            del module.processor.attn_map
+    return forward_hook
+def register_cross_attention_hook(unet):
+    for name, module in unet.named_modules():
+        if name.split('.')[-1].startswith('attn2'):
+            module.register_forward_hook(hook_fn(name))
+    return unet
+def upscale(attn_map, target_size):
+    attn_map = torch.mean(attn_map, dim=0)
+    attn_map = attn_map.permute(1,0)
+    temp_size = None
+    for i in range(0,5):
+        scale = 2 ** i
+        if ( target_size[0] // scale ) * ( target_size[1] // scale) == attn_map.shape[1]*64:
+            temp_size = (target_size[0]//(scale*8), target_size[1]//(scale*8))
+            break
+    assert temp_size is not None, "temp_size cannot is None"
+    attn_map = attn_map.view(attn_map.shape[0], *temp_size)
+    attn_map = F.interpolate(
+        attn_map.unsqueeze(0).to(dtype=torch.float32),
+        size=target_size,
+        mode='bilinear',
+        align_corners=False
+    )[0]
+    attn_map = torch.softmax(attn_map, dim=0)
+    return attn_map
+def get_net_attn_map(image_size, batch_size=2, instance_or_negative=False, detach=True):
+    idx = 0 if instance_or_negative else 1
+    net_attn_maps = []
+    for name, attn_map in attn_maps.items():
+        attn_map = attn_map.cpu() if detach else attn_map
+        attn_map = torch.chunk(attn_map, batch_size)[idx].squeeze()
+        attn_map = upscale(attn_map, image_size)
+        net_attn_maps.append(attn_map)
+    net_attn_maps = torch.mean(torch.stack(net_attn_maps,dim=0),dim=0)
+    return net_attn_maps
+def attnmaps2images(net_attn_maps):
+    #total_attn_scores = 0
+    images = []
+    for attn_map in net_attn_maps:
+        attn_map = attn_map.cpu().numpy()
+        #total_attn_scores += attn_map.mean().item()
+        normalized_attn_map = (attn_map - np.min(attn_map)) / (np.max(attn_map) - np.min(attn_map)) * 255
+        normalized_attn_map = normalized_attn_map.astype(np.uint8)
+        #print("norm: ", normalized_attn_map.shape)
+        image = Image.fromarray(normalized_attn_map)
+        #image = fix_save_attn_map(attn_map)
+        images.append(image)
+    #print(total_attn_scores)
+    return images
+def is_torch2_available():
+    return hasattr(F, "scaled_dot_product_attention")
+def get_generator(seed, device):
+    if seed is not None:
+        if isinstance(seed, list):
+            generator = [torch.Generator(device).manual_seed(seed_item) for seed_item in seed]
+        else:
+            generator = torch.Generator(device).manual_seed(seed)
+    else:
+        generator = None
+    return generator

model/__init__.py ADDED Viewed

File without changes

model/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (187 Bytes). View file

model/__pycache__/dit.cpython-312.pyc ADDED Viewed

Binary file (19 kB). View file

model/__pycache__/pipeline_pit.cpython-312.pyc ADDED Viewed

Binary file (4.85 kB). View file

model/dit.py ADDED Viewed

	@@ -0,0 +1,313 @@

+# From the great https://github.com/cloneofsimo/minRF/blob/main/dit.py
+# Code heavily based on https://github.com/Alpha-VLLM/LLaMA2-Accessory
+# this is modeling code for DiT-LLaMA model
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from diffusers import ModelMixin, ConfigMixin
+from diffusers.configuration_utils import register_to_config
+def modulate(x, shift, scale):
+    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+class TimestepEmbedder(nn.Module):
+    def __init__(self, hidden_size, frequency_embedding_size=256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10000):
+        half = dim // 2
+        freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half) / half).to(t.device)
+        args = t[:, None] * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+        return embedding
+    def forward(self, t):
+        t_freq = self.timestep_embedding(t, self.frequency_embedding_size).to(dtype=next(self.parameters()).dtype)
+        t_emb = self.mlp(t_freq)
+        return t_emb
+class LabelEmbedder(nn.Module):
+    def __init__(self, num_classes, hidden_size, dropout_prob):
+        super().__init__()
+        use_cfg_embedding = int(dropout_prob > 0)
+        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
+        self.num_classes = num_classes
+        self.dropout_prob = dropout_prob
+    def token_drop(self, labels, force_drop_ids=None):
+        if force_drop_ids is None:
+            drop_ids = torch.rand(labels.shape[0]) < self.dropout_prob
+            drop_ids = drop_ids.cuda()
+            drop_ids = drop_ids.to(labels.device)
+        else:
+            drop_ids = force_drop_ids == 1
+        labels = torch.where(drop_ids, self.num_classes, labels)
+        return labels
+    def forward(self, labels, train, force_drop_ids=None):
+        use_dropout = self.dropout_prob > 0
+        if (train and use_dropout) or (force_drop_ids is not None):
+            labels = self.token_drop(labels, force_drop_ids)
+        embeddings = self.embedding_table(labels)
+        return embeddings
+class Attention(nn.Module):
+    def __init__(self, dim, n_heads):
+        super().__init__()
+        self.n_heads = n_heads
+        self.n_rep = 1
+        self.head_dim = dim // n_heads
+        self.wq = nn.Linear(dim, n_heads * self.head_dim, bias=False)
+        self.wk = nn.Linear(dim, self.n_heads * self.head_dim, bias=False)
+        self.wv = nn.Linear(dim, self.n_heads * self.head_dim, bias=False)
+        self.wo = nn.Linear(n_heads * self.head_dim, dim, bias=False)
+        self.q_norm = nn.LayerNorm(self.n_heads * self.head_dim)
+        self.k_norm = nn.LayerNorm(self.n_heads * self.head_dim)
+    @staticmethod
+    def reshape_for_broadcast(freqs_cis, x):
+        ndim = x.ndim
+        assert 0 <= 1 < ndim
+        # assert freqs_cis.shape == (x.shape[1], x.shape[-1])
+        _freqs_cis = freqs_cis[: x.shape[1]]
+        shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
+        return _freqs_cis.view(*shape)
+    @staticmethod
+    def apply_rotary_emb(xq, xk, freqs_cis):
+        xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+        xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+        freqs_cis_xq = Attention.reshape_for_broadcast(freqs_cis, xq_)
+        freqs_cis_xk = Attention.reshape_for_broadcast(freqs_cis, xk_)
+        xq_out = torch.view_as_real(xq_ * freqs_cis_xq).flatten(3)
+        xk_out = torch.view_as_real(xk_ * freqs_cis_xk).flatten(3)
+        return xq_out, xk_out
+    def forward(self, x, freqs_cis):
+        bsz, seqlen, _ = x.shape
+        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
+        dtype = xq.dtype
+        xq = self.q_norm(xq)
+        xk = self.k_norm(xk)
+        xq = xq.view(bsz, seqlen, self.n_heads, self.head_dim)
+        xk = xk.view(bsz, seqlen, self.n_heads, self.head_dim)
+        xv = xv.view(bsz, seqlen, self.n_heads, self.head_dim)
+        xq, xk = self.apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
+        xq, xk = xq.to(dtype), xk.to(dtype)
+        output = F.scaled_dot_product_attention(
+            xq.permute(0, 2, 1, 3),
+            xk.permute(0, 2, 1, 3),
+            xv.permute(0, 2, 1, 3),
+            dropout_p=0.0,
+            is_causal=False,
+        ).permute(0, 2, 1, 3)
+        output = output.flatten(-2)
+        return self.wo(output)
+class FeedForward(nn.Module):
+    def __init__(self, dim, hidden_dim, multiple_of, ffn_dim_multiplier=None):
+        super().__init__()
+        hidden_dim = int(2 * hidden_dim / 3)
+        if ffn_dim_multiplier:
+            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
+        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+    def _forward_silu_gating(self, x1, x3):
+        return F.silu(x1) * x3
+    def forward(self, x):
+        return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x)))
+class TransformerBlock(nn.Module):
+    def __init__(
+        self,
+        layer_id,
+        dim,
+        n_heads,
+        multiple_of,
+        ffn_dim_multiplier,
+        norm_eps,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.head_dim = dim // n_heads
+        self.attention = Attention(dim, n_heads)
+        self.feed_forward = FeedForward(
+            dim=dim,
+            hidden_dim=4 * dim,
+            multiple_of=multiple_of,
+            ffn_dim_multiplier=ffn_dim_multiplier,
+        )
+        self.layer_id = layer_id
+        self.attention_norm = nn.LayerNorm(dim, eps=norm_eps)
+        self.ffn_norm = nn.LayerNorm(dim, eps=norm_eps)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(min(dim, 1024), 6 * dim, bias=True),
+        )
+    def forward(self, x, freqs_cis, adaln_input=None):
+        if adaln_input is not None:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).chunk(
+                6, dim=1
+            )
+            x = x + gate_msa.unsqueeze(1) * self.attention(
+                modulate(self.attention_norm(x), shift_msa, scale_msa), freqs_cis
+            )
+            x = x + gate_mlp.unsqueeze(1) * self.feed_forward(modulate(self.ffn_norm(x), shift_mlp, scale_mlp))
+        else:
+            x = x + self.attention(self.attention_norm(x), freqs_cis)
+            x = x + self.feed_forward(self.ffn_norm(x))
+        return x
+class FinalLayer(nn.Module):
+    def __init__(self, hidden_size, out_channels):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(hidden_size, out_channels, bias=True)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(min(hidden_size, 1024), 2 * hidden_size, bias=True),
+        )
+        # # init zero
+        nn.init.constant_(self.linear.weight, 0)
+        nn.init.constant_(self.linear.bias, 0)
+    def forward(self, x, c):
+        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
+        x = modulate(self.norm_final(x), shift, scale)
+        x = self.linear(x)
+        return x
+class DiT_Llama(ModelMixin, ConfigMixin):
+    @register_to_config
+    def __init__(
+        self,
+        embedding_dim=3,
+        hidden_dim=512,
+        n_layers=5,
+        n_heads=16,
+        multiple_of=256,
+        ffn_dim_multiplier=None,
+        norm_eps=1e-5,
+    ):
+        super().__init__()
+        self.in_channels = embedding_dim
+        self.out_channels = embedding_dim
+        self.x_embedder = nn.Linear(embedding_dim, hidden_dim, bias=True)
+        nn.init.constant_(self.x_embedder.bias, 0)
+        self.t_embedder = TimestepEmbedder(min(hidden_dim, 1024))
+        # self.y_embedder = LabelEmbedder(num_classes, min(dim, 1024), class_dropout_prob)
+        self.layers = nn.ModuleList(
+            [
+                TransformerBlock(
+                    layer_id,
+                    hidden_dim,
+                    n_heads,
+                    multiple_of,
+                    ffn_dim_multiplier,
+                    norm_eps,
+                )
+                for layer_id in range(n_layers)
+            ]
+        )
+        self.final_layer = FinalLayer(hidden_dim, self.out_channels)
+        self.freqs_cis = DiT_Llama.precompute_freqs_cis(hidden_dim // n_heads, 4096)
+    def forward(self, x, t, cond):
+        self.freqs_cis = self.freqs_cis.to(x.device)
+        x = torch.cat([x, cond], dim=1)
+        x = self.x_embedder(x)
+        t = self.t_embedder(t)  # (N, D)
+        adaln_input = t.to(x.dtype)
+        for layer in self.layers:
+            x = layer(x, self.freqs_cis[: x.size(1)], adaln_input=adaln_input)
+        x = self.final_layer(x, adaln_input)
+        # Drop the cond part
+        x = x[:, : -cond.size(1)]
+        return x
+    def forward_with_cfg(self, x, t, cond, cfg_scale):
+        half = x[: len(x) // 2]
+        combined = torch.cat([half, half], dim=0)
+        model_out = self.forward(combined, t, cond)
+        eps, rest = model_out[:, : self.in_channels], model_out[:, self.in_channels :]
+        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
+        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
+        eps = torch.cat([half_eps, half_eps], dim=0)
+        return torch.cat([eps, rest], dim=1)
+    @staticmethod
+    def precompute_freqs_cis(dim, end, theta=10000.0):
+        freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+        t = torch.arange(end)
+        freqs = torch.outer(t, freqs).float()
+        freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
+        return freqs_cis
+def DiT_base(**kwargs):
+    return DiT_Llama(in_dim=2048, hidden_dim=2048, n_layers=8, n_heads=32, **kwargs)
+if __name__ == "__main__":
+    model = DiT_Llama_600M_patch2()
+    model.eval()
+    x = torch.randn(2, 3, 32, 32)
+    t = torch.randint(0, 100, (2,))
+    y = torch.randint(0, 10, (2,))
+    with torch.no_grad():
+        out = model(x, t, y)
+        print(out.shape)
+        out = model.forward_with_cfg(x, t, y, 0.5)
+        print(out.shape)

model/pipeline_pit.py ADDED Viewed

	@@ -0,0 +1,106 @@

+import math
+from typing import List, Optional, Union
+import torch
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.utils import BaseOutput
+from diffusers.utils import (
+    logging,
+)
+from diffusers.utils.torch_utils import randn_tensor
+from dataclasses import dataclass
+from model.dit import DiT_Llama
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+@dataclass
+class PiTPipelineOutput(BaseOutput):
+    image_embeds: torch.Tensor
+class PiTPipeline(DiffusionPipeline):
+    def __init__(self, prior: DiT_Llama):
+        super().__init__()
+        self.register_modules(
+            prior=prior,
+        )
+    def prepare_latents(self, shape, dtype, device, generator, latents):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+        return latents
+    @torch.no_grad()
+    def __call__(
+        self,
+        cond_sequence: torch.FloatTensor,
+        negative_cond_sequence: torch.FloatTensor,
+        num_images_per_prompt: int = 1,
+        num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        init_latents: Optional[torch.FloatTensor] = None,
+        strength: Optional[float] = None,
+        guidance_scale: float = 1.0,
+        output_type: Optional[str] = "pt",  # pt only
+        return_dict: bool = True,
+    ):
+        do_classifier_free_guidance = guidance_scale > 1.0
+        device = self._execution_device
+        batch_size = cond_sequence.shape[0]
+        batch_size = batch_size * num_images_per_prompt
+        embedding_dim = self.prior.config.embedding_dim
+        latents = self.prepare_latents(
+            (batch_size, 16, embedding_dim),
+            self.prior.dtype,
+            device,
+            generator,
+            latents,
+        )
+        if init_latents is not None:
+            init_latents = init_latents.to(latents.device)
+            latents = (strength) * latents + (1 - strength) * init_latents
+        # Rectified Flow
+        dt = 1.0 / num_inference_steps
+        dt = torch.tensor([dt] * batch_size).to(latents.device).view([batch_size, *([1] * len(latents.shape[1:]))])
+        start_inference_step = (
+            math.ceil(num_inference_steps * (strength)) if strength is not None else num_inference_steps
+        )
+        for i in range(start_inference_step, 0, -1):
+            t = i / num_inference_steps
+            t = torch.tensor([t] * batch_size).to(latents.device)
+            vc = self.prior(latents, t, cond_sequence)
+            if do_classifier_free_guidance:
+                vu = self.prior(latents, t, negative_cond_sequence)
+                vc = vu + guidance_scale * (vc - vu)
+            latents = latents - dt * vc
+        image_embeddings = latents
+        if output_type not in ["pt", "np"]:
+            raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}")
+        if output_type == "np":
+            image_embeddings = image_embeddings.cpu().numpy()
+        if not return_dict:
+            return image_embeddings
+        return PiTPipelineOutput(image_embeds=image_embeddings)

pit.py ADDED Viewed

	@@ -0,0 +1,161 @@

+import random
+from pathlib import Path
+from typing import Optional
+import numpy as np
+import pyrallis
+import torch
+from diffusers import (
+    StableDiffusionXLPipeline,
+)
+from huggingface_hub import hf_hub_download
+from PIL import Image
+from ip_adapter import IPAdapterPlusXL
+from model.dit import DiT_Llama
+from model.pipeline_pit import PiTPipeline
+from training.train_config import TrainConfig
+def paste_on_background(image, background, min_scale=0.4, max_scale=0.8, scale=None):
+    # Calculate aspect ratio and determine resizing based on the smaller dimension of the background
+    aspect_ratio = image.width / image.height
+    scale = random.uniform(min_scale, max_scale) if scale is None else scale
+    new_width = int(min(background.width, background.height * aspect_ratio) * scale)
+    new_height = int(new_width / aspect_ratio)
+    # Resize image and calculate position
+    image = image.resize((new_width, new_height), resample=Image.LANCZOS)
+    pos_x = random.randint(0, background.width - new_width)
+    pos_y = random.randint(0, background.height - new_height)
+    # Paste the image using its alpha channel as mask if present
+    background.paste(image, (pos_x, pos_y), image if "A" in image.mode else None)
+    return background
+def set_seed(seed: int):
+    """Ensures reproducibility across multiple libraries."""
+    random.seed(seed)  # Python random module
+    np.random.seed(seed)  # NumPy random module
+    torch.manual_seed(seed)  # PyTorch CPU random seed
+    torch.cuda.manual_seed_all(seed)  # PyTorch GPU random seed
+    torch.backends.cudnn.deterministic = True  # Ensures deterministic behavior
+    torch.backends.cudnn.benchmark = False  # Disable benchmarking to avoid randomness
+class PiTDemoPipeline:
+    def __init__(self, prior_repo: str, prior_path: str):
+        # Download model and config
+        prior_ckpt_path = hf_hub_download(
+            repo_id=prior_repo,
+            filename=str(prior_path),
+            local_dir="pretrained_models",
+        )
+        prior_cfg_path = hf_hub_download(
+            repo_id=prior_repo, filename=str(Path(prior_path).parent / "cfg.yaml"), local_dir="pretrained_models"
+        )
+        self.model_cfg: TrainConfig = pyrallis.load(TrainConfig, open(prior_cfg_path, "r"))
+        self.weight_dtype = torch.float32
+        self.device = "cuda:0"
+        prior = DiT_Llama(
+            embedding_dim=2048,
+            hidden_dim=self.model_cfg.hidden_dim,
+            n_layers=self.model_cfg.num_layers,
+            n_heads=self.model_cfg.num_attention_heads,
+        )
+        prior.load_state_dict(torch.load(prior_ckpt_path))
+        image_pipe = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            torch_dtype=torch.float16,
+            add_watermarker=False,
+        )
+        ip_ckpt_path = hf_hub_download(
+            repo_id="h94/IP-Adapter",
+            filename="ip-adapter-plus_sdxl_vit-h.bin",
+            subfolder="sdxl_models",
+            local_dir="pretrained_models",
+        )
+        self.ip_model = IPAdapterPlusXL(
+            image_pipe,
+            "models/image_encoder",
+            ip_ckpt_path,
+            self.device,
+            num_tokens=16,
+        )
+        self.image_processor = self.ip_model.clip_image_processor
+        empty_image = Image.new("RGB", (256, 256), (255, 255, 255))
+        zero_image = torch.Tensor(self.image_processor(empty_image)["pixel_values"][0])
+        self.zero_image_embeds = self.ip_model.get_image_embeds(zero_image.unsqueeze(0), skip_uncond=True)
+        prior_pipeline = PiTPipeline(
+            prior=prior,
+        )
+        self.prior_pipeline = prior_pipeline.to(self.device)
+        set_seed(42)
+    def run(self, crops_paths: list[str], scale: float = 2.0, seed: Optional[int] = None, n_images: int = 1):
+        if seed is not None:
+            set_seed(seed)
+        processed_crops = []
+        input_images = []
+        crops_paths = [None] + crops_paths
+        # Extend to >3 with Nones
+        while len(crops_paths) < 3:
+            crops_paths.append(None)
+        for path_ind, path in enumerate(crops_paths):
+            if path is None:
+                image = Image.new("RGB", (224, 224), (255, 255, 255))
+            else:
+                image = Image.open(path).convert("RGB")
+                if path_ind > 0 or not self.model_cfg.use_ref:
+                    background = Image.new("RGB", (1024, 1024), (255, 255, 255))
+                    image = paste_on_background(image, background, scale=0.92)
+                else:
+                    image = image.resize((1024, 1024))
+                input_images.append(image)
+                # Name should be parent directory name
+            processed_image = (
+                torch.Tensor(self.image_processor(image)["pixel_values"][0])
+                .to(self.device)
+                .unsqueeze(0)
+                .to(self.weight_dtype)
+            )
+            processed_crops.append(processed_image)
+        image_embed_inputs = []
+        for crop_ind in range(len(processed_crops)):
+            image_embed_inputs.append(self.ip_model.get_image_embeds(processed_crops[crop_ind], skip_uncond=True))
+        crops_input_sequence = torch.cat(image_embed_inputs, dim=1)
+        generated_images = []
+        for _ in range(n_images):
+            seed = random.randint(0, 1000000)
+            for curr_scale in [scale]:
+                negative_cond_sequence = torch.zeros_like(crops_input_sequence)
+                embeds_len = self.zero_image_embeds.shape[1]
+                for i in range(0, negative_cond_sequence.shape[1], embeds_len):
+                    negative_cond_sequence[:, i : i + embeds_len] = self.zero_image_embeds.detach()
+                img_emb = self.prior_pipeline(
+                    cond_sequence=crops_input_sequence,
+                    negative_cond_sequence=negative_cond_sequence,
+                    num_inference_steps=25,
+                    num_images_per_prompt=1,
+                    guidance_scale=curr_scale,
+                    generator=torch.Generator(device="cuda").manual_seed(seed),
+                ).image_embeds
+                for seed_2 in range(1):
+                    images = self.ip_model.generate(
+                        image_prompt_embeds=img_emb,
+                        num_samples=1,
+                        num_inference_steps=50,
+                    )
+                    generated_images += images
+        return generated_images

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+accelerate
+diffusers
+einops
+pyrallis
+torch
+transformers

training/__init__.py ADDED Viewed

File without changes

training/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (190 Bytes). View file

training/__pycache__/train_config.cpython-312.pyc ADDED Viewed

Binary file (2.31 kB). View file

training/coach.py ADDED Viewed

	@@ -0,0 +1,409 @@

+import random
+import sys
+from pathlib import Path
+import diffusers
+import pyrallis
+import torch
+import torch.utils.checkpoint
+import transformers
+from PIL import Image
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from diffusers import StableDiffusionXLPipeline
+from huggingface_hub import hf_hub_download
+from torchvision import transforms
+from tqdm import tqdm
+from ip_adapter import IPAdapterPlusXL
+from model.dit import DiT_Llama
+from model.pipeline_pit import PiTPipeline
+from training.dataset import (
+    PartsDataset,
+)
+from training.train_config import TrainConfig
+from utils import vis_utils
+logger = get_logger(__name__, log_level="INFO")
+class Coach:
+    def __init__(self, config: TrainConfig):
+        self.cfg = config
+        self.cfg.output_dir.mkdir(exist_ok=True, parents=True)
+        (self.cfg.output_dir / "cfg.yaml").write_text(pyrallis.dump(self.cfg))
+        (self.cfg.output_dir / "run.sh").write_text(f'python {Path(__file__).name} {" ".join(sys.argv)}')
+        self.logging_dir = self.cfg.output_dir / "logs"
+        accelerator_project_config = ProjectConfiguration(
+            total_limit=2, project_dir=self.cfg.output_dir, logging_dir=self.logging_dir
+        )
+        self.accelerator = Accelerator(
+            gradient_accumulation_steps=self.cfg.gradient_accumulation_steps,
+            mixed_precision=self.cfg.mixed_precision,
+            log_with=self.cfg.report_to,
+            project_config=accelerator_project_config,
+        )
+        self.device = "cuda"
+        logger.info(self.accelerator.state, main_process_only=False)
+        if self.accelerator.is_local_main_process:
+            transformers.utils.logging.set_verbosity_warning()
+            diffusers.utils.logging.set_verbosity_info()
+        else:
+            transformers.utils.logging.set_verbosity_error()
+            diffusers.utils.logging.set_verbosity_error()
+        if self.cfg.seed is not None:
+            set_seed(self.cfg.seed)
+        if self.accelerator.is_main_process:
+            self.logging_dir.mkdir(exist_ok=True, parents=True)
+        self.weight_dtype = torch.float32
+        if self.accelerator.mixed_precision == "fp16":
+            self.weight_dtype = torch.float16
+        elif self.accelerator.mixed_precision == "bf16":
+            self.weight_dtype = torch.bfloat16
+        self.prior = DiT_Llama(
+            embedding_dim=2048,
+            hidden_dim=self.cfg.hidden_dim,
+            n_layers=self.cfg.num_layers,
+            n_heads=self.cfg.num_attention_heads,
+        )
+        # pretty print total number of parameters in Billions
+        num_params = sum(p.numel() for p in self.prior.parameters())
+        print(f"Number of parameters: {num_params / 1e9:.2f}B")
+        self.image_pipe = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            torch_dtype=torch.float16,
+            add_watermarker=False,
+        ).to(self.device)
+        ip_ckpt_path = hf_hub_download(
+            repo_id="h94/IP-Adapter",
+            filename="ip-adapter-plus_sdxl_vit-h.bin",
+            subfolder="sdxl_models",
+            local_dir="pretrained_models",
+        )
+        self.ip_model = IPAdapterPlusXL(
+            self.image_pipe,
+            "models/image_encoder",
+            ip_ckpt_path,
+            self.device,
+            num_tokens=16,
+        )
+        self.image_processor = self.ip_model.clip_image_processor
+        empty_image = Image.new("RGB", (256, 256), (255, 255, 255))
+        zero_image = torch.Tensor(self.image_processor(empty_image)["pixel_values"][0])
+        self.zero_image_embeds = self.ip_model.get_image_embeds(zero_image.unsqueeze(0), skip_uncond=True)
+        self.prior_pipeline = PiTPipeline(prior=self.prior)
+        self.prior_pipeline = self.prior_pipeline.to(self.accelerator.device)
+        params_to_optimize = list(self.prior.parameters())
+        self.optimizer = torch.optim.AdamW(
+            params_to_optimize,
+            lr=self.cfg.lr,
+            betas=(self.cfg.adam_beta1, self.cfg.adam_beta2),
+            weight_decay=self.cfg.adam_weight_decay,
+            eps=self.cfg.adam_epsilon,
+        )
+        self.train_dataloader, self.validation_dataloader = self.get_dataloaders()
+        self.prior, self.optimizer, self.train_dataloader = self.accelerator.prepare(
+            self.prior, self.optimizer, self.train_dataloader
+        )
+        self.train_step = 0 if self.cfg.resume_from_step is None else self.cfg.resume_from_step
+        print(self.train_step)
+        if self.cfg.resume_from_path is not None:
+            prior_state_dict = torch.load(self.cfg.resume_from_path, map_location=self.device)
+            msg = self.prior.load_state_dict(prior_state_dict, strict=False)
+            print(msg)
+    def save_model(self, save_path):
+        save_path.mkdir(exist_ok=True, parents=True)
+        prior_state_dict = self.prior.state_dict()
+        torch.save(prior_state_dict, save_path / "prior.ckpt")
+    def unnormalize_and_pil(self, tensor):
+        unnormed = tensor * torch.tensor(self.image_processor.image_std).view(3, 1, 1).to(tensor.device) + torch.tensor(
+            self.image_processor.image_mean
+        ).view(3, 1, 1).to(tensor.device)
+        return transforms.ToPILImage()(unnormed)
+    def save_images(self, image, conds, cond_sequence, target_embeds, label="", save_path=""):
+        self.prior.eval()
+        input_images = []
+        captions = []
+        for i in range(len(conds)):
+            pil_image = self.unnormalize_and_pil(conds[i]).resize((self.cfg.img_size, self.cfg.img_size))
+            input_images.append(pil_image)
+            captions.append("Condition")
+        if image is not None:
+            input_images.append(self.unnormalize_and_pil(image).resize((self.cfg.img_size, self.cfg.img_size)))
+            captions.append(f"Target {label}")
+        seeds = range(2)
+        output_images = []
+        embebds_to_vis = []
+        embeds_captions = []
+        embebds_to_vis += [target_embeds]
+        embeds_captions += ["Target Reconstruct" if image is not None else "Source Reconstruct"]
+        if self.cfg.use_ref:
+            embebds_to_vis += [cond_sequence[:, :16]]
+            embeds_captions += ["Grid Reconstruct"]
+        for embs in embebds_to_vis:
+            direct_from_emb = self.ip_model.generate(image_prompt_embeds=embs, num_samples=1, num_inference_steps=50)
+            output_images = output_images + direct_from_emb
+        captions += embeds_captions
+        for seed in seeds:
+            for scale in [1, 4]:
+                negative_cond_sequence = torch.zeros_like(cond_sequence)
+                embeds_len = self.zero_image_embeds.shape[1]
+                for i in range(0, negative_cond_sequence.shape[1], embeds_len):
+                    negative_cond_sequence[:, i : i + embeds_len] = self.zero_image_embeds.detach()
+                img_emb = self.prior_pipeline(
+                    cond_sequence=cond_sequence,
+                    negative_cond_sequence=negative_cond_sequence,
+                    num_inference_steps=25,
+                    num_images_per_prompt=1,
+                    guidance_scale=scale,
+                    generator=torch.Generator(device="cuda").manual_seed(seed),
+                ).image_embeds
+                for seed_2 in range(1):
+                    images = self.ip_model.generate(
+                        image_prompt_embeds=img_emb,
+                        num_samples=1,
+                        num_inference_steps=50,
+                    )
+                    output_images += images
+                    captions.append(f"prior_s {seed}, cfg {scale}, unet_s {seed_2}")
+        all_images = input_images + output_images
+        gen_images = vis_utils.create_table_plot(images=all_images, captions=captions)
+        gen_images.save(save_path)
+        self.prior.train()
+    def get_dataloaders(self) -> torch.utils.data.DataLoader:
+        dataset_path = self.cfg.dataset_path
+        if not isinstance(self.cfg.dataset_path, list):
+            dataset_path = [self.cfg.dataset_path]
+        datasets = []
+        for path in dataset_path:
+            datasets.append(
+                PartsDataset(
+                    dataset_dir=path,
+                    image_processor=self.image_processor,
+                    use_ref=self.cfg.use_ref,
+                    max_crops=self.cfg.max_crops,
+                    sketch_prob=self.cfg.sketch_prob,
+                )
+            )
+        dataset = torch.utils.data.ConcatDataset(datasets)
+        print(f"Total number of samples: {len(dataset)}")
+        dataset_weights = []
+        for single_dataset in datasets:
+            dataset_weights.extend([len(dataset) / len(single_dataset)] * len(single_dataset))
+        sampler_train = torch.utils.data.WeightedRandomSampler(
+            weights=dataset_weights, num_samples=len(dataset_weights)
+        )
+        validation_dataset = PartsDataset(
+            dataset_dir=self.cfg.val_dataset_path,
+            image_processor=self.image_processor,
+            use_ref=self.cfg.use_ref,
+            max_crops=self.cfg.max_crops,
+            sketch_prob=self.cfg.sketch_prob,
+        )
+        train_dataloader = torch.utils.data.DataLoader(
+            dataset,
+            batch_size=self.cfg.train_batch_size,
+            shuffle=sampler_train is None,
+            num_workers=self.cfg.num_workers,
+            sampler=sampler_train,
+        )
+        validation_dataloader = torch.utils.data.DataLoader(
+            validation_dataset,
+            batch_size=1,
+            shuffle=True,
+            num_workers=self.cfg.num_workers,
+        )
+        return train_dataloader, validation_dataloader
+    def train(self):
+        pbar = tqdm(range(self.train_step, self.cfg.max_train_steps + 1))
+        # self.log_validation()
+        while self.train_step < self.cfg.max_train_steps:
+            train_loss = 0.0
+            self.prior.train()
+            lossbin = {i: 0 for i in range(10)}
+            losscnt = {i: 1e-6 for i in range(10)}
+            for sample_idx, batch in enumerate(self.train_dataloader):
+                with self.accelerator.accumulate(self.prior):
+                    image, cond = batch
+                    image = image.to(self.weight_dtype).to(self.accelerator.device)
+                    if "crops" in cond:
+                        for crop_ind in range(len(cond["crops"])):
+                            cond["crops"][crop_ind] = (
+                                cond["crops"][crop_ind].to(self.weight_dtype).to(self.accelerator.device)
+                            )
+                    for key in cond.keys():
+                        if isinstance(cond[key], torch.Tensor):
+                            cond[key] = cond[key].to(self.accelerator.device)
+                    with torch.no_grad():
+                        image_embeds = self.ip_model.get_image_embeds(image, skip_uncond=True)
+                        b = image_embeds.size(0)
+                        nt = torch.randn((b,)).to(image_embeds.device)
+                        t = torch.sigmoid(nt)
+                        texp = t.view([b, *([1] * len(image_embeds.shape[1:]))])
+                        z_1 = torch.randn_like(image_embeds)
+                        noisy_latents = (1 - texp) * image_embeds + texp * z_1
+                        target = image_embeds
+                        # At some prob uniformly sample across the entire batch so the model also learns to work with unpadded inputs
+                        if random.random() < 0.5:
+                            crops_to_keep = random.randint(1, len(cond["crops"]))
+                            cond["crops"] = cond["crops"][:crops_to_keep]
+                        cond_crops = cond["crops"]
+                        image_embed_inputs = []
+                        for crop_ind in range(len(cond_crops)):
+                            image_embed_inputs.append(
+                                self.ip_model.get_image_embeds(cond_crops[crop_ind], skip_uncond=True)
+                            )
+                        input_sequence = torch.cat(image_embed_inputs, dim=1)
+                    loss = 0
+                    image_feat_seq = input_sequence
+                    model_pred = self.prior(
+                        noisy_latents,
+                        t=t,
+                        cond=image_feat_seq,
+                    )
+                    batchwise_prior_loss = ((z_1 - target.float() - model_pred.float()) ** 2).mean(
+                        dim=list(range(1, len(target.shape)))
+                    )
+                    tlist = batchwise_prior_loss.detach().cpu().reshape(-1).tolist()
+                    ttloss = [(tv, tloss) for tv, tloss in zip(t, tlist)]
+                    # count based on t
+                    for t, l in ttloss:
+                        lossbin[int(t * 10)] += l
+                        losscnt[int(t * 10)] += 1
+                    loss += batchwise_prior_loss.mean()
+                    # Gather the losses across all processes for logging (if we use distributed training).
+                    avg_loss = self.accelerator.gather(loss.repeat(self.cfg.train_batch_size)).mean()
+                    train_loss += avg_loss.item() / self.cfg.gradient_accumulation_steps
+                    # Backprop
+                    self.accelerator.backward(loss)
+                    if self.accelerator.sync_gradients:
+                        self.accelerator.clip_grad_norm_(self.prior.parameters(), self.cfg.max_grad_norm)
+                    self.optimizer.step()
+                    self.optimizer.zero_grad()
+                # Checks if the accelerator has performed an optimization step behind the scenes
+                if self.accelerator.sync_gradients:
+                    pbar.update(1)
+                    self.train_step += 1
+                    train_loss = 0.0
+                    if self.accelerator.is_main_process:
+                        if self.train_step % self.cfg.checkpointing_steps == 1:
+                            if self.accelerator.is_main_process:
+                                save_path = self.cfg.output_dir  # / f"learned_prior.pth"
+                                self.save_model(save_path)
+                                logger.info(f"Saved state to {save_path}")
+                        pbar.set_postfix(**{"loss": loss.cpu().detach().item()})
+                        if self.cfg.log_image_frequency > 0 and (self.train_step % self.cfg.log_image_frequency == 1):
+                            image_save_path = self.cfg.output_dir / "images" / f"{self.train_step}_step_images.jpg"
+                            image_save_path.parent.mkdir(exist_ok=True, parents=True)
+                            # Apply the full diffusion process
+                            conds_list = []
+                            for crop_ind in range(len(cond["crops"])):
+                                conds_list.append(cond["crops"][crop_ind][0])
+                            self.save_images(
+                                image=image[0],
+                                conds=conds_list,
+                                cond_sequence=image_feat_seq[:1],
+                                target_embeds=target[:1],
+                                save_path=image_save_path,
+                            )
+                    if self.cfg.log_validation > 0 and (self.train_step % self.cfg.log_validation == 0):
+                        # Run validation
+                        self.log_validation()
+                    if self.train_step >= self.cfg.max_train_steps:
+                        break
+            self.train_dataloader, self.validation_dataloader = self.get_dataloaders()
+        pbar.close()
+    def log_validation(self):
+        for sample_idx, batch in tqdm(enumerate(self.validation_dataloader)):
+            image, cond = batch
+            image = image.to(self.weight_dtype).to(self.accelerator.device)
+            if "crops" in cond:
+                for crop_ind in range(len(cond["crops"])):
+                    cond["crops"][crop_ind] = cond["crops"][crop_ind].to(self.weight_dtype).to(self.accelerator.device)
+            for key in cond.keys():
+                if isinstance(cond[key], torch.Tensor):
+                    cond[key] = cond[key].to(self.accelerator.device)
+            with torch.no_grad():
+                target_embeds = self.ip_model.get_image_embeds(image, skip_uncond=True)
+                crops_to_keep = random.randint(1, len(cond["crops"]))
+                cond["crops"] = cond["crops"][:crops_to_keep]
+                cond_crops = cond["crops"]
+                image_embed_inputs = []
+                for crop_ind in range(len(cond_crops)):
+                    image_embed_inputs.append(self.ip_model.get_image_embeds(cond_crops[crop_ind], skip_uncond=True))
+                input_sequence = torch.cat(image_embed_inputs, dim=1)
+            image_save_path = self.cfg.output_dir / "val_images" / f"{self.train_step}_step_{sample_idx}_images.jpg"
+            image_save_path.parent.mkdir(exist_ok=True, parents=True)
+            save_target_image = image[0]
+            conds_list = []
+            for crop_ind in range(len(cond["crops"])):
+                conds_list.append(cond["crops"][crop_ind][0])
+            # Apply the full diffusion process
+            self.save_images(
+                image=save_target_image,
+                conds=conds_list,
+                cond_sequence=input_sequence[:1],
+                target_embeds=target_embeds[:1],
+                save_path=image_save_path,
+            )
+            if sample_idx == self.cfg.n_val_images:
+                break

training/dataset.py ADDED Viewed

	@@ -0,0 +1,182 @@

+import random
+import traceback
+from pathlib import Path
+import einops
+import numpy as np
+import torchvision.transforms as T
+from PIL import Image
+from torch.utils.data import Dataset
+from tqdm import tqdm
+from utils import bezier_utils
+class PartsDataset(Dataset):
+    def __init__(
+        self,
+        dataset_dir: Path,
+        clip_image_size: int = 224,
+        image_processor=None,
+        max_crops=3,
+        use_ref: bool = True,
+        ref_as_grid: bool = True,
+        grid_size: int = 2,
+        sketch_prob: float = 0.0,
+    ):
+        subdirs = [d for d in dataset_dir.iterdir() if d.is_dir()]
+        all_paths = []
+        self.subdir_dict = {}
+        for subdir in tqdm(subdirs):
+            current_paths = list(subdir.glob("*.jpg"))
+            current_target_paths = [p for p in current_paths if len(str(p.name).split("_")) == 2]
+            if use_ref and len(current_target_paths) < 9:
+                # Skip if not enough target images
+                continue
+            all_paths.extend(current_paths)
+            self.subdir_dict[subdir] = current_target_paths
+        print(f"Percentile of valid subdirs: {len(self.subdir_dict) / len(subdirs)}")
+        self.target_paths = [p for p in all_paths if len(str(p.name).split("_")) == 2]
+        source_paths = [p for p in all_paths if len(str(p.name).split("_")) == 3]
+        self.source_target_mappings = {path: [] for path in self.target_paths}
+        for source_path in source_paths:
+            # Remove last part of the path
+            target_path = Path("_".join(str(source_path).split("_")[:-1]) + ".jpg")
+            if target_path in self.source_target_mappings:
+                self.source_target_mappings[target_path].append(source_path)
+        print(f"Loaded {len(self.target_paths)} target images")
+        self.clip_image_size = clip_image_size
+        self.image_processor = image_processor
+        self.max_crops = max_crops
+        self.use_ref = use_ref
+        self.ref_as_grid = ref_as_grid
+        self.grid_size = grid_size
+        self.sketch_prob = sketch_prob
+    def __len__(self):
+        return len(self.target_paths)
+    def paste_on_background(self, image, background, min_scale=0.4, max_scale=0.8):
+        # Calculate aspect ratio and determine resizing based on the smaller dimension of the background
+        aspect_ratio = image.width / image.height
+        scale = random.uniform(min_scale, max_scale)
+        new_width = int(min(background.width, background.height * aspect_ratio) * scale)
+        new_height = int(new_width / aspect_ratio)
+        # Resize image and calculate position
+        image = image.resize((new_width, new_height), resample=Image.LANCZOS)
+        pos_x = random.randint(0, background.width - new_width)
+        pos_y = random.randint(0, background.height - new_height)
+        # Paste the image using its alpha channel as mask if present
+        background.paste(image, (pos_x, pos_y), image if "A" in image.mode else None)
+        return background
+    def get_random_crop(self, image):
+        crop_percent_x = random.uniform(0.8, 1.0)
+        crop_percent_y = random.uniform(0.8, 1.0)
+        # crop_percent_y = random.uniform(0.1, 0.7)
+        crop_x = int(image.width * crop_percent_x)
+        crop_y = int(image.height * crop_percent_y)
+        x = random.randint(0, image.width - crop_x)
+        y = random.randint(0, image.height - crop_y)
+        return image.crop((x, y, x + crop_x, y + crop_y))
+    def get_empty_image(self):
+        empty_image = Image.new("RGB", (self.clip_image_size, self.clip_image_size), (255, 255, 255))
+        return self.image_processor(empty_image)["pixel_values"][0]
+    def __getitem__(self, i: int):
+        out_dict = {}
+        try:
+            target_path = self.target_paths[i]
+            image = Image.open(target_path).convert("RGB")
+            input_parts = []
+            source_paths = self.source_target_mappings[target_path]
+            n_samples = random.randint(1, len(source_paths))
+            n_samples = min(n_samples, self.max_crops)
+            source_paths = random.sample(source_paths, n_samples)
+            if random.random() < 0.1:
+                # Use empty image, but maybe still pass reference
+                source_paths = []
+            if self.use_ref:
+                subdir = target_path.parent
+                # Take something from same dir
+                potential_refs = list(set(self.subdir_dict[subdir]) - {target_path})
+                # Choose 4 refs
+                reference_paths = random.sample(potential_refs, self.grid_size**2)
+                reference_images = [
+                    np.array(Image.open(reference_path).convert("RGB")) for reference_path in reference_paths
+                ]
+                # Concat all images as grid of 2x2
+                reference_grid = np.stack(reference_images)
+                grid_image = einops.rearrange(
+                    reference_grid,
+                    "(h w) h1 w1 c -> (h h1) (w w1) c",
+                    h=self.grid_size,
+                )
+                reference_image = Image.fromarray(grid_image).resize((512, 512))
+                # Always add the reference image
+                input_parts.append(reference_image)
+            # Sample a subset
+            for source_path in source_paths:
+                source_image = Image.open(source_path).convert("RGB")
+                if random.random() < 0.2:
+                    # Instead of using the source image, use a random crop from the target
+                    source_image = self.get_random_crop(source_image)
+                if random.random() < 0.2:
+                    source_image = T.v2.RandomRotation(degrees=30, expand=True, fill=255)(source_image)
+                object_with_background = Image.new("RGB", image.size, (255, 255, 255))
+                self.paste_on_background(source_image, object_with_background, min_scale=0.8, max_scale=0.95)
+                if self.sketch_prob > 0 and random.random() < self.sketch_prob:
+                    num_lines = random.randint(8, 15)
+                    object_with_background = bezier_utils.get_sketch(
+                        object_with_background,
+                        total_curves=num_lines,
+                        drop_line_prob=0.1,
+                    )
+                input_parts.append(object_with_background)
+            # Always pad to three parts for now
+            actual_max_crops = self.max_crops + 1 if self.use_ref else self.max_crops
+            while len(input_parts) < actual_max_crops:
+                input_parts.append(
+                    Image.new(
+                        "RGB",
+                        (self.clip_image_size, self.clip_image_size),
+                        (255, 255, 255),
+                    )
+                )
+        except Exception as e:
+            print(f"Error processing image: {e}")
+            traceback.print_exc()
+            empty_image = Image.new("RGB", (self.clip_image_size, self.clip_image_size), (255, 255, 255))
+            image = empty_image
+            actual_max_crops = self.max_crops + 1 if self.use_ref else self.max_crops
+            input_parts = [empty_image] * (actual_max_crops)
+        clip_target_image = self.image_processor(image)["pixel_values"][0]
+        clip_parts = [self.image_processor(part)["pixel_values"][0] for part in input_parts]
+        out_dict["crops"] = clip_parts
+        return clip_target_image, out_dict

training/train_config.py ADDED Viewed

	@@ -0,0 +1,67 @@

+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import List, Optional, Union
+@dataclass
+class TrainConfig:
+    # Dataset path
+    dataset_path: Union[Path, List[Path]] = Path("datasets/generated/generated_things")
+    # Validation dataset path
+    val_dataset_path: Path = Path("datasets/generated/generated_things_val")
+    # The output directory where the model predictions and checkpoints will be written.
+    output_dir: Path = Path("results/my_pit_model")
+    # GPU device
+    device: str = "cuda:0"
+    # The resolution for input images, all the images will be resized to this size
+    img_size: int = 1024
+    # Batch size (per device) for the training dataloader
+    train_batch_size: int = 1
+    # Initial learning rate (after the potential warmup period) to use
+    lr: float = 1e-5
+    # Dataloader num workers.
+    num_workers: int = 8
+    # The beta1 parameter for the Adam optimizer.
+    adam_beta1: float = 0.9
+    # The beta2 parameter for the Adam optimizer
+    adam_beta2: float = 0.999
+    # Weight decay to use
+    adam_weight_decay: float = 0.0  # 1e-2
+    # Epsilon value for the Adam optimizer
+    adam_epsilon: float = 1e-08
+    # How often save images. Values less zero - disable saving
+    log_image_frequency: int = 500
+    # How often to run validation
+    log_validation: int = 5000
+    # The number of images to save during each validation
+    n_val_images: int = 10
+    # A seed for reproducible training
+    seed: Optional[int] = None
+    # The number of accumulation steps to use
+    gradient_accumulation_steps: int = 1
+    # Whether to use mixed precision training
+    mixed_precision: Optional[str] = "fp16"
+    # Log to wandb
+    report_to: Optional[str] = "wandb"
+    # The number of training steps to run
+    max_train_steps: int = 1000000
+    # Max grad for clipping
+    max_grad_norm: float = 1.0
+    # How often to save checkpoints
+    checkpointing_steps: int = 5000
+    # The path to resume from
+    resume_from_path: Optional[Path] = None
+    # The step to resume from, mainly for logging
+    resume_from_step: Optional[int] = None
+    # DiT number of layers
+    num_layers: int = 8
+    # DiT hidden dimensionality
+    hidden_dim: int = 2048
+    # DiT number of attention heads
+    num_attention_heads: int = 32
+    # Whether to use a reference grid
+    use_ref: bool = False
+    # Max number of crops
+    max_crops: int = 3
+    # Probability of converting to sketch
+    sketch_prob: float = 0.0