listen2you003 committed on
Commit 36de41f · 1 Parent(s): 53df0d6

init commit

.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,14 +1,14 @@
1
- ---
2
- title: Step1X Edit
3
- emoji: 💻
4
- colorFrom: blue
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.27.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- short_description: Edit an image based on the given instruction.
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: Step1X Edit
3
+ emoji: 💻
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.27.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ short_description: Edit an image based on the given instruction.
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,469 @@
1
+ import argparse
2
+ import datetime
3
+ import json
4
+ import itertools
5
+ import math
6
+ import os
7
+ import spaces
8
+ import time
9
+ from pathlib import Path
10
+
11
+
12
+ import gradio as gr
13
+ import numpy as np
14
+ import torch
15
+ from einops import rearrange, repeat
16
+ from huggingface_hub import snapshot_download
17
+ from PIL import Image, ImageOps
18
+ from safetensors.torch import load_file
19
+ from torchvision.transforms import functional as F
20
+ from tqdm import tqdm
21
+
22
+ import sampling
23
+ from modules.autoencoder import AutoEncoder
24
+ from modules.conditioner import Qwen25VL_7b_Embedder as Qwen2VLEmbedder
25
+ from modules.model_edit import Step1XParams, Step1XEdit
26
+
27
+ print("TORCH_CUDA", torch.cuda.is_available())
28
+
29
+ def load_state_dict(model, ckpt_path, device="cuda", strict=False, assign=True):
30
+ if Path(ckpt_path).suffix == ".safetensors":
31
+ state_dict = load_file(ckpt_path, device)
32
+ else:
33
+ state_dict = torch.load(ckpt_path, map_location="cpu")
34
+
35
+ missing, unexpected = model.load_state_dict(
36
+ state_dict, strict=strict, assign=assign
37
+ )
38
+ if len(missing) > 0 and len(unexpected) > 0:
39
+ print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
40
+ print("\n" + "-" * 79 + "\n")
41
+ print(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))
42
+ elif len(missing) > 0:
43
+ print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
44
+ elif len(unexpected) > 0:
45
+ print(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))
46
+ return model
47
+
48
+
49
+ def load_models(
50
+ dit_path=None,
51
+ ae_path=None,
52
+ qwen2vl_model_path=None,
53
+ device="cuda",
54
+ max_length=256,
55
+ dtype=torch.bfloat16,
56
+ ):
57
+ qwen2vl_encoder = Qwen2VLEmbedder(
58
+ qwen2vl_model_path,
59
+ device=device,
60
+ max_length=max_length,
61
+ dtype=dtype,
62
+ )
63
+
64
+ with torch.device("meta"):
65
+ ae = AutoEncoder(
66
+ resolution=256,
67
+ in_channels=3,
68
+ ch=128,
69
+ out_ch=3,
70
+ ch_mult=[1, 2, 4, 4],
71
+ num_res_blocks=2,
72
+ z_channels=16,
73
+ scale_factor=0.3611,
74
+ shift_factor=0.1159,
75
+ )
76
+
77
+ step1x_params = Step1XParams(
78
+ in_channels=64,
79
+ out_channels=64,
80
+ vec_in_dim=768,
81
+ context_in_dim=4096,
82
+ hidden_size=3072,
83
+ mlp_ratio=4.0,
84
+ num_heads=24,
85
+ depth=19,
86
+ depth_single_blocks=38,
87
+ axes_dim=[16, 56, 56],
88
+ theta=10_000,
89
+ qkv_bias=True,
90
+ )
91
+ dit = Step1XEdit(step1x_params)
92
+
93
+ ae = load_state_dict(ae, ae_path)
94
+ dit = load_state_dict(
95
+ dit, dit_path
96
+ )
97
+
98
+ dit = dit.to(device=device, dtype=dtype)
99
+ ae = ae.to(device=device, dtype=torch.float32)
100
+
101
+ return ae, dit, qwen2vl_encoder
102
+
103
+
104
+ class ImageGenerator:
105
+ def __init__(
106
+ self,
107
+ dit_path=None,
108
+ ae_path=None,
109
+ qwen2vl_model_path=None,
110
+ device="cuda",
111
+ max_length=640,
112
+ dtype=torch.bfloat16,
113
+ ) -> None:
114
+ self.device = torch.device(device)
115
+ self.ae, self.dit, self.llm_encoder = load_models(
116
+ dit_path=dit_path,
117
+ ae_path=ae_path,
118
+ qwen2vl_model_path=qwen2vl_model_path,
119
+ max_length=max_length,
120
+ dtype=dtype,
121
+ )
122
+
123
+ def prepare(self, prompt, img, ref_image, ref_image_raw):
124
+ bs, _, h, w = img.shape
125
+ bs, _, ref_h, ref_w = ref_image.shape
126
+
127
+ assert h == ref_h and w == ref_w
128
+
129
+ if bs == 1 and not isinstance(prompt, str):
130
+ bs = len(prompt)
131
+ elif bs >= 1 and isinstance(prompt, str):
132
+ prompt = [prompt] * bs
133
+
134
+ img = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
135
+ ref_img = rearrange(ref_image, "b c (ref_h ph) (ref_w pw) -> b (ref_h ref_w) (c ph pw)", ph=2, pw=2)
136
+ if img.shape[0] == 1 and bs > 1:
137
+ img = repeat(img, "1 ... -> bs ...", bs=bs)
138
+ ref_img = repeat(ref_img, "1 ... -> bs ...", bs=bs)
139
+
140
+ img_ids = torch.zeros(h // 2, w // 2, 3)
141
+
142
+ img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2)[:, None]
143
+ img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2)[None, :]
144
+ img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
145
+
146
+ ref_img_ids = torch.zeros(ref_h // 2, ref_w // 2, 3)
147
+
148
+ ref_img_ids[..., 1] = ref_img_ids[..., 1] + torch.arange(ref_h // 2)[:, None]
149
+ ref_img_ids[..., 2] = ref_img_ids[..., 2] + torch.arange(ref_w // 2)[None, :]
150
+ ref_img_ids = repeat(ref_img_ids, "ref_h ref_w c -> b (ref_h ref_w) c", b=bs)
151
+
152
+ if isinstance(prompt, str):
153
+ prompt = [prompt]
154
+
155
+ txt, mask = self.llm_encoder(prompt, ref_image_raw)
156
+
157
+ txt_ids = torch.zeros(bs, txt.shape[1], 3)
158
+
159
+ img = torch.cat([img, ref_img.to(device=img.device, dtype=img.dtype)], dim=-2)
160
+ img_ids = torch.cat([img_ids, ref_img_ids], dim=-2)
161
+
162
+
163
+ return {
164
+ "img": img,
165
+ "mask": mask,
166
+ "img_ids": img_ids.to(img.device),
167
+ "llm_embedding": txt.to(img.device),
168
+ "txt_ids": txt_ids.to(img.device),
169
+ }
170
+
171
+ @staticmethod
172
+ def process_diff_norm(diff_norm, k):
173
+ pow_result = torch.pow(diff_norm, k)
174
+
175
+ result = torch.where(
176
+ diff_norm > 1.0,
177
+ pow_result,
178
+ torch.where(diff_norm < 1.0, torch.ones_like(diff_norm), diff_norm),
179
+ )
180
+ return result
181
+
182
+ def denoise(
183
+ self,
184
+ img: torch.Tensor,
185
+ img_ids: torch.Tensor,
186
+ llm_embedding: torch.Tensor,
187
+ txt_ids: torch.Tensor,
188
+ timesteps: list[float],
189
+ cfg_guidance: float = 4.5,
190
+ mask=None,
191
+ show_progress=False,
192
+ timesteps_truncate=1.0,
193
+ ):
194
+ if show_progress:
195
+ pbar = tqdm(itertools.pairwise(timesteps), desc='denoising...')
196
+ else:
197
+ pbar = itertools.pairwise(timesteps)
198
+ for t_curr, t_prev in pbar:
199
+ if img.shape[0] == 1 and cfg_guidance != -1:
200
+ img = torch.cat([img, img], dim=0)
201
+ t_vec = torch.full(
202
+ (img.shape[0],), t_curr, dtype=img.dtype, device=img.device
203
+ )
204
+
205
+ txt, vec = self.dit.connector(llm_embedding, t_vec, mask)
206
+
207
+
208
+ pred = self.dit(
209
+ img=img,
210
+ img_ids=img_ids,
211
+ txt=txt,
212
+ txt_ids=txt_ids,
213
+ y=vec,
214
+ timesteps=t_vec,
215
+ )
216
+
217
+ if cfg_guidance != -1:
218
+ cond, uncond = (
219
+ pred[0 : pred.shape[0] // 2, :],
220
+ pred[pred.shape[0] // 2 :, :],
221
+ )
222
+ if t_curr > timesteps_truncate:
223
+ diff = cond - uncond
224
+ diff_norm = torch.norm(diff, dim=(2), keepdim=True)
225
+ pred = uncond + cfg_guidance * (
226
+ cond - uncond
227
+ ) / self.process_diff_norm(diff_norm, k=0.4)
228
+ else:
229
+ pred = uncond + cfg_guidance * (cond - uncond)
230
+ tem_img = img[0 : img.shape[0] // 2, :] + (t_prev - t_curr) * pred
231
+ img_input_length = img.shape[1] // 2
232
+ img = torch.cat(
233
+ [
234
+ tem_img[:, :img_input_length],
235
+ img[ : img.shape[0] // 2, img_input_length:],
236
+ ], dim=1
237
+ )
238
+
239
+ return img[:, :img.shape[1] // 2]
240
+
241
+ @staticmethod
242
+ def unpack(x: torch.Tensor, height: int, width: int) -> torch.Tensor:
243
+ return rearrange(
244
+ x,
245
+ "b (h w) (c ph pw) -> b c (h ph) (w pw)",
246
+ h=math.ceil(height / 16),
247
+ w=math.ceil(width / 16),
248
+ ph=2,
249
+ pw=2,
250
+ )
251
+
252
+ @staticmethod
253
+ def load_image(image):
254
+ from PIL import Image
255
+
256
+ if isinstance(image, np.ndarray):
257
+ image = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0
258
+ image = image.unsqueeze(0)
259
+ return image
260
+ elif isinstance(image, Image.Image):
261
+ image = F.to_tensor(image.convert("RGB"))
262
+ image = image.unsqueeze(0)
263
+ return image
264
+ elif isinstance(image, torch.Tensor):
265
+ return image
266
+ elif isinstance(image, str):
267
+ image = F.to_tensor(Image.open(image).convert("RGB"))
268
+ image = image.unsqueeze(0)
269
+ return image
270
+ else:
271
+ raise ValueError(f"Unsupported image type: {type(image)}")
272
+
273
+ def output_process_image(self, resize_img, image_size):
274
+ res_image = resize_img.resize(image_size)
275
+ return res_image
276
+
277
+ def input_process_image(self, img, img_size=512):
278
+ # 1. Open the image
279
+ w, h = img.size
280
+ r = w / h
281
+
282
+ if w > h:
283
+ w_new = math.ceil(math.sqrt(img_size * img_size * r))
284
+ h_new = math.ceil(w_new / r)
285
+ else:
286
+ h_new = math.ceil(math.sqrt(img_size * img_size / r))
287
+ w_new = math.ceil(h_new * r)
288
+ h_new = math.ceil(h_new) // 16 * 16
289
+ w_new = math.ceil(w_new) // 16 * 16
290
+
291
+ img_resized = img.resize((w_new, h_new))
292
+ return img_resized, img.size
293
+
294
+ @torch.inference_mode()
295
+ def generate_image(
296
+ self,
297
+ prompt,
298
+ negative_prompt,
299
+ ref_images,
300
+ num_steps,
301
+ cfg_guidance,
302
+ seed,
303
+ num_samples=1,
304
+ init_image=None,
305
+ image2image_strength=0.0,
306
+ show_progress=False,
307
+ size_level=512,
308
+ ):
309
+ assert num_samples == 1, "num_samples > 1 is not supported yet."
310
+ ref_images_raw, img_info = self.input_process_image(ref_images, img_size=size_level)
311
+
312
+ width, height = ref_images_raw.width, ref_images_raw.height
313
+
314
+
315
+ ref_images_raw = self.load_image(ref_images_raw)
316
+ ref_images_raw = ref_images_raw.to(self.device)
317
+ ref_images = self.ae.encode(ref_images_raw.to(self.device) * 2 - 1)
318
+
319
+ seed = int(seed)
320
+ seed = torch.Generator(device="cpu").seed() if seed < 0 else seed
321
+
322
+ t0 = time.perf_counter()
323
+
324
+ if init_image is not None:
325
+ init_image = self.load_image(init_image)
326
+ init_image = init_image.to(self.device)
327
+ init_image = torch.nn.functional.interpolate(init_image, (height, width))
328
+ init_image = self.ae.encode(init_image.to() * 2 - 1)
329
+
330
+ x = torch.randn(
331
+ num_samples,
332
+ 16,
333
+ height // 8,
334
+ width // 8,
335
+ device=self.device,
336
+ dtype=torch.bfloat16,
337
+ generator=torch.Generator(device=self.device).manual_seed(seed),
338
+ )
339
+
340
+ timesteps = sampling.get_schedule(
341
+ num_steps, x.shape[-1] * x.shape[-2] // 4, shift=True
342
+ )
343
+
344
+ if init_image is not None:
345
+ t_idx = int((1 - image2image_strength) * num_steps)
346
+ t = timesteps[t_idx]
347
+ timesteps = timesteps[t_idx:]
348
+ x = t * x + (1.0 - t) * init_image.to(x.dtype)
349
+
350
+ x = torch.cat([x, x], dim=0)
351
+ ref_images = torch.cat([ref_images, ref_images], dim=0)
352
+ ref_images_raw = torch.cat([ref_images_raw, ref_images_raw], dim=0)
353
+ inputs = self.prepare([prompt, negative_prompt], x, ref_image=ref_images, ref_image_raw=ref_images_raw)
354
+
355
+ x = self.denoise(
356
+ **inputs,
357
+ cfg_guidance=cfg_guidance,
358
+ timesteps=timesteps,
359
+ show_progress=show_progress,
360
+ timesteps_truncate=1.0,
361
+ )
362
+ x = self.unpack(x.float(), height, width)
363
+ with torch.autocast(device_type=self.device.type, dtype=torch.bfloat16):
364
+ x = self.ae.decode(x)
365
+ x = x.clamp(-1, 1)
366
+ x = x.mul(0.5).add(0.5)
367
+
368
+ t1 = time.perf_counter()
369
+ print(f"Done in {t1 - t0:.1f}s.")
370
+ images_list = []
371
+ for img in x.float():
372
+ images_list.append(self.output_process_image(F.to_pil_image(img), img_info))
373
+ return images_list
374
+
375
+
376
+ def prepare_infer_func():
377
+ # Model repo ID (e.g. "bert-base-uncased")
377
+ model_repo = "stepfun-ai/Step1X-Edit"
378
+ # Local path to save the weights
380
+ model_path = "./model_weights"
381
+ os.makedirs(model_path, exist_ok=True)
382
+
383
+
384
+ # Download the model (all files)
385
+ snapshot_download(
386
+ repo_id=model_repo,
387
+ local_dir=model_path,
388
+ local_dir_use_symlinks=False # avoid symlinks
389
+ )
390
+
391
+
392
+ image_edit = ImageGenerator(
393
+ ae_path=os.path.join(model_path, 'vae.safetensors'),
394
+ dit_path=os.path.join(model_path, "step1x-edit-i1258.safetensors"),
395
+ qwen2vl_model_path='Qwen/Qwen2.5-VL-7B-Instruct',
396
+ max_length=640,
397
+ )
398
+
399
+ return image_edit.generate_image
400
+
401
+ @spaces.GPU
402
+ def inference(prompt, ref_images, seed, size_level, infer_func=None):
403
+ start_time = time.time()
404
+
405
+ if seed == -1:
406
+ import random
407
+ random_seed = random.randint(0, 2**32 - 1)
408
+ else:
409
+ random_seed = seed
410
+
411
+ image = infer_func(
412
+ prompt,
413
+ negative_prompt="",
414
+ ref_images=ref_images.convert('RGB'),
415
+ num_samples=1,
416
+ num_steps=28,
417
+ cfg_guidance=6.0,
418
+ seed=random_seed,
419
+ show_progress=True,
420
+ size_level=size_level,
421
+ )[0]
422
+
423
+ print(f"Time taken: {time.time() - start_time:.2f} seconds")
424
+ return image, random_seed
425
+
426
+
427
+ def create_demo():
428
+ inference_func = prepare_infer_func()
429
+ with gr.Blocks() as demo:
430
+ gr.Markdown(
431
+ """
432
+ # Step1X-Edit
433
+ """
434
+ )
435
+ with gr.Row():
436
+ with gr.Column():
437
+ prompt = gr.Textbox(
438
+ label="Edit instruction",
439
+ value='Remove the person from the image.',
440
+ )
441
+ init_image = gr.Image(label="Input Image", type='pil')
442
+
443
+ random_seed = gr.Number(label="Random Seed", value=-1, minimum=-1)
444
+
445
+ size_level = gr.Number(label="size level (recommend 512, 768, 1024, min 512)", value=512, minimum=512)
446
+
447
+ generate_btn = gr.Button("Generate")
448
+
449
+ with gr.Column():
450
+ output_image = gr.Image(label="Generated Image",type='pil',image_mode='RGB')
451
+ output_random_seed = gr.Textbox(label="Used Seed", lines=5)
452
+ from functools import partial
453
+ generate_btn.click(
454
+ fn=partial(inference, infer_func=inference_func),
455
+ inputs=[
456
+ prompt,
457
+ init_image,
458
+ random_seed,
459
+ size_level,
460
+ ],
461
+ outputs=[output_image, output_random_seed],
462
+ )
463
+
464
+ return demo
465
+
466
+
467
+ if __name__ == "__main__":
468
+ demo = create_demo()
469
+ demo.launch(server_name='0.0.0.0',server_port=7860)
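
For reference, a minimal sketch of how the pieces defined in app.py above could be driven outside the Gradio UI. It assumes the weight download in prepare_infer_func() succeeds, a CUDA device is present, and the spaces.GPU decorator behaves as a pass-through outside a Space; "example.png" is a hypothetical input path.

    from PIL import Image

    from app import inference, prepare_infer_func  # functions defined in app.py above

    infer_func = prepare_infer_func()   # downloads the Step1X-Edit weights and builds the ImageGenerator
    src = Image.open("example.png")     # hypothetical input image

    edited, used_seed = inference(
        prompt="Remove the person from the image.",
        ref_images=src,
        seed=-1,                        # -1 -> a random seed is drawn and returned
        size_level=512,
        infer_func=infer_func,
    )
    edited.save("edited.png")
    print("seed used:", used_seed)
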
modules/__init__.py ADDED
File without changes
modules/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (128 Bytes).

modules/__pycache__/attention.cpython-310.pyc ADDED
Binary file (3.13 kB).

modules/__pycache__/autoencoder.cpython-310.pyc ADDED
Binary file (8.78 kB).

modules/__pycache__/conditioner.cpython-310.pyc ADDED
Binary file (4.94 kB).

modules/__pycache__/connector_edit.cpython-310.pyc ADDED
Binary file (11.8 kB).

modules/__pycache__/layers.cpython-310.pyc ADDED
Binary file (19.1 kB).

modules/__pycache__/model_edit.cpython-310.pyc ADDED
Binary file (4.21 kB).
modules/attention.py ADDED
@@ -0,0 +1,133 @@
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+
6
+
7
+ try:
8
+ import flash_attn
9
+ from flash_attn.flash_attn_interface import (
10
+ _flash_attn_forward,
11
+ flash_attn_func,
12
+ flash_attn_varlen_func,
13
+ )
14
+ except ImportError:
15
+ flash_attn = None
16
+ flash_attn_varlen_func = None
17
+ _flash_attn_forward = None
18
+ flash_attn_func = None
19
+
20
+ MEMORY_LAYOUT = {
21
+ # flash mode:
22
+ # pre-process: input [batch_size, seq_len, num_heads, head_dim]
23
+ # post-process: keep the shape unchanged
24
+ "flash": (
25
+ lambda x: x, # keep shape
26
+ lambda x: x, # keep shape
27
+ ),
28
+ # torch/vanilla mode:
29
+ # pre-process: swap the sequence and head dimensions [B,S,A,D] -> [B,A,S,D]
30
+ # post-process: swap back to the original layout [B,A,S,D] -> [B,S,A,D]
31
+ "torch": (
32
+ lambda x: x.transpose(1, 2), # (B,S,A,D) -> (B,A,S,D)
33
+ lambda x: x.transpose(1, 2), # (B,A,S,D) -> (B,S,A,D)
34
+ ),
35
+ "vanilla": (
36
+ lambda x: x.transpose(1, 2),
37
+ lambda x: x.transpose(1, 2),
38
+ ),
39
+ }
40
+
41
+
42
+ def attention(
43
+ q,
44
+ k,
45
+ v,
46
+ mode="torch",
47
+ drop_rate=0,
48
+ attn_mask=None,
49
+ causal=False,
50
+ ):
51
+ """
52
+ Compute QKV self-attention.
53
+
54
+ Args:
55
+ q (torch.Tensor): query tensor, shape [batch_size, seq_len, num_heads, head_dim]
56
+ k (torch.Tensor): key tensor, shape [batch_size, seq_len_kv, num_heads, head_dim]
57
+ v (torch.Tensor): value tensor, shape [batch_size, seq_len_kv, num_heads, head_dim]
58
+ mode (str): attention mode, one of 'flash', 'torch', 'vanilla'
59
+ drop_rate (float): dropout probability applied to the attention matrix
60
+ attn_mask (torch.Tensor): attention mask; its shape depends on the mode
61
+ causal (bool): whether to use causal attention (attend only to earlier positions)
62
+
63
+ Returns:
64
+ torch.Tensor: attention output, shape [batch_size, seq_len, num_heads * head_dim]
65
+ """
66
+ # Get the pre- and post-processing functions for this mode
67
+ pre_attn_layout, post_attn_layout = MEMORY_LAYOUT[mode]
68
+
69
+ # Apply the pre-processing transform
70
+ q = pre_attn_layout(q) # shape depends on the mode
71
+ k = pre_attn_layout(k)
72
+ v = pre_attn_layout(v)
73
+
74
+ if mode == "torch":
75
+ # Use PyTorch's native scaled_dot_product_attention
76
+ if attn_mask is not None and attn_mask.dtype != torch.bool:
77
+ attn_mask = attn_mask.to(q.dtype)
78
+ x = F.scaled_dot_product_attention(
79
+ q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal
80
+ )
81
+ elif mode == "flash":
82
+ assert flash_attn_func is not None, "flash_attn_func is not available"
83
+ assert attn_mask is None, "attn_mask is not supported in flash mode"
84
+ x: torch.Tensor = flash_attn_func(
85
+ q, k, v, dropout_p=drop_rate, causal=causal, softmax_scale=None
86
+ ) # type: ignore
87
+ elif mode == "vanilla":
88
+ # Manual (vanilla) attention implementation
89
+ scale_factor = 1 / math.sqrt(q.size(-1)) # scaling factor 1/sqrt(d_k)
90
+
91
+ b, a, s, _ = q.shape # unpack shape parameters
92
+ s1 = k.size(2) # key/value sequence length
93
+
94
+ # Initialize the attention bias
95
+ attn_bias = torch.zeros(b, a, s, s1, dtype=q.dtype, device=q.device)
96
+
97
+ # Handle the causal mask
98
+ if causal:
99
+ assert attn_mask is None, "causal mask and attn_mask cannot be used together"
100
+ # Build a lower-triangular causal mask
101
+ temp_mask = torch.ones(b, a, s, s, dtype=torch.bool, device=q.device).tril(
102
+ diagonal=0
103
+ )
104
+ attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
105
+ attn_bias = attn_bias.to(q.dtype)
106
+
107
+ # Handle a user-provided attention mask
108
+ if attn_mask is not None:
109
+ if attn_mask.dtype == torch.bool:
110
+ attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
111
+ else:
112
+ attn_bias += attn_mask # allows ALiBi-style positional bias
113
+
114
+ # Compute the attention matrix
115
+ attn = (q @ k.transpose(-2, -1)) * scale_factor # [B,A,S,S1]
116
+ attn += attn_bias
117
+
118
+ # softmax and dropout
119
+ attn = attn.softmax(dim=-1)
120
+ attn = torch.dropout(attn, p=drop_rate, train=True)
121
+
122
+ # Compute the output
123
+ x = attn @ v # [B,A,S,D]
124
+ else:
125
+ raise NotImplementedError(f"Unsupported attention mode: {mode}")
126
+
127
+ # Apply the post-processing transform
128
+ x = post_attn_layout(x) # restore the original dimension order
129
+
130
+ # Merge the attention-head dimension
131
+ b, s, a, d = x.shape
132
+ out = x.reshape(b, s, -1) # [B,S,A*D]
133
+ return out
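
A small shape-level sanity check for the attention() helper above; it exercises only the "torch" and "vanilla" paths, which need no flash-attn install, and the tensor sizes are arbitrary.

    import torch

    from modules.attention import attention

    # Q/K/V in the layout the helper expects: [batch, seq_len, num_heads, head_dim]
    q = torch.randn(2, 16, 8, 64)
    k = torch.randn(2, 16, 8, 64)
    v = torch.randn(2, 16, 8, 64)

    out_sdpa = attention(q, k, v, mode="torch")      # PyTorch scaled_dot_product_attention path
    out_manual = attention(q, k, v, mode="vanilla")  # manual softmax(QK^T / sqrt(d)) V path
    print(out_sdpa.shape, out_manual.shape)          # both torch.Size([2, 16, 512]); heads merged into the last dim
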
modules/autoencoder.py ADDED
@@ -0,0 +1,326 @@
1
+ # Modified from Flux
2
+ #
3
+ # Copyright 2024 Black Forest Labs
4
+
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+ # This source code is licensed under the license found in the
18
+ # LICENSE file in the root directory of this source tree.
19
+ import torch
20
+ from einops import rearrange
21
+ from torch import Tensor, nn
22
+
23
+
24
+ def swish(x: Tensor) -> Tensor:
25
+ return x * torch.sigmoid(x)
26
+
27
+
28
+ class AttnBlock(nn.Module):
29
+ def __init__(self, in_channels: int):
30
+ super().__init__()
31
+ self.in_channels = in_channels
32
+
33
+ self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
34
+
35
+ self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
36
+ self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
37
+ self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
38
+ self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
39
+
40
+ def attention(self, h_: Tensor) -> Tensor:
41
+ h_ = self.norm(h_)
42
+ q = self.q(h_)
43
+ k = self.k(h_)
44
+ v = self.v(h_)
45
+
46
+ b, c, h, w = q.shape
47
+ q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
48
+ k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
49
+ v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
50
+ h_ = nn.functional.scaled_dot_product_attention(q, k, v)
51
+
52
+ return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
53
+
54
+ def forward(self, x: Tensor) -> Tensor:
55
+ return x + self.proj_out(self.attention(x))
56
+
57
+
58
+ class ResnetBlock(nn.Module):
59
+ def __init__(self, in_channels: int, out_channels: int):
60
+ super().__init__()
61
+ self.in_channels = in_channels
62
+ out_channels = in_channels if out_channels is None else out_channels
63
+ self.out_channels = out_channels
64
+
65
+ self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
66
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
67
+ self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
68
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
69
+ if self.in_channels != self.out_channels:
70
+ self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
71
+
72
+ def forward(self, x):
73
+ h = x
74
+ h = self.norm1(h)
75
+ h = swish(h)
76
+ h = self.conv1(h)
77
+
78
+ h = self.norm2(h)
79
+ h = swish(h)
80
+ h = self.conv2(h)
81
+
82
+ if self.in_channels != self.out_channels:
83
+ x = self.nin_shortcut(x)
84
+
85
+ return x + h
86
+
87
+
88
+ class Downsample(nn.Module):
89
+ def __init__(self, in_channels: int):
90
+ super().__init__()
91
+ # no asymmetric padding in torch conv, must do it ourselves
92
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
93
+
94
+ def forward(self, x: Tensor):
95
+ pad = (0, 1, 0, 1)
96
+ x = nn.functional.pad(x, pad, mode="constant", value=0)
97
+ x = self.conv(x)
98
+ return x
99
+
100
+
101
+ class Upsample(nn.Module):
102
+ def __init__(self, in_channels: int):
103
+ super().__init__()
104
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
105
+
106
+ def forward(self, x: Tensor):
107
+ x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
108
+ x = self.conv(x)
109
+ return x
110
+
111
+
112
+ class Encoder(nn.Module):
113
+ def __init__(
114
+ self,
115
+ resolution: int,
116
+ in_channels: int,
117
+ ch: int,
118
+ ch_mult: list[int],
119
+ num_res_blocks: int,
120
+ z_channels: int,
121
+ ):
122
+ super().__init__()
123
+ self.ch = ch
124
+ self.num_resolutions = len(ch_mult)
125
+ self.num_res_blocks = num_res_blocks
126
+ self.resolution = resolution
127
+ self.in_channels = in_channels
128
+ # downsampling
129
+ self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
130
+
131
+ curr_res = resolution
132
+ in_ch_mult = (1, *tuple(ch_mult))
133
+ self.in_ch_mult = in_ch_mult
134
+ self.down = nn.ModuleList()
135
+ block_in = self.ch
136
+ for i_level in range(self.num_resolutions):
137
+ block = nn.ModuleList()
138
+ attn = nn.ModuleList()
139
+ block_in = ch * in_ch_mult[i_level]
140
+ block_out = ch * ch_mult[i_level]
141
+ for _ in range(self.num_res_blocks):
142
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
143
+ block_in = block_out
144
+ down = nn.Module()
145
+ down.block = block
146
+ down.attn = attn
147
+ if i_level != self.num_resolutions - 1:
148
+ down.downsample = Downsample(block_in)
149
+ curr_res = curr_res // 2
150
+ self.down.append(down)
151
+
152
+ # middle
153
+ self.mid = nn.Module()
154
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
155
+ self.mid.attn_1 = AttnBlock(block_in)
156
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
157
+
158
+ # end
159
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
160
+ self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
161
+
162
+ def forward(self, x: Tensor) -> Tensor:
163
+ # downsampling
164
+ hs = [self.conv_in(x)]
165
+ for i_level in range(self.num_resolutions):
166
+ for i_block in range(self.num_res_blocks):
167
+ h = self.down[i_level].block[i_block](hs[-1])
168
+ if len(self.down[i_level].attn) > 0:
169
+ h = self.down[i_level].attn[i_block](h)
170
+ hs.append(h)
171
+ if i_level != self.num_resolutions - 1:
172
+ hs.append(self.down[i_level].downsample(hs[-1]))
173
+
174
+ # middle
175
+ h = hs[-1]
176
+ h = self.mid.block_1(h)
177
+ h = self.mid.attn_1(h)
178
+ h = self.mid.block_2(h)
179
+ # end
180
+ h = self.norm_out(h)
181
+ h = swish(h)
182
+ h = self.conv_out(h)
183
+ return h
184
+
185
+
186
+ class Decoder(nn.Module):
187
+ def __init__(
188
+ self,
189
+ ch: int,
190
+ out_ch: int,
191
+ ch_mult: list[int],
192
+ num_res_blocks: int,
193
+ in_channels: int,
194
+ resolution: int,
195
+ z_channels: int,
196
+ ):
197
+ super().__init__()
198
+ self.ch = ch
199
+ self.num_resolutions = len(ch_mult)
200
+ self.num_res_blocks = num_res_blocks
201
+ self.resolution = resolution
202
+ self.in_channels = in_channels
203
+ self.ffactor = 2 ** (self.num_resolutions - 1)
204
+
205
+ # compute in_ch_mult, block_in and curr_res at lowest res
206
+ block_in = ch * ch_mult[self.num_resolutions - 1]
207
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
208
+ self.z_shape = (1, z_channels, curr_res, curr_res)
209
+
210
+ # z to block_in
211
+ self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
212
+
213
+ # middle
214
+ self.mid = nn.Module()
215
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
216
+ self.mid.attn_1 = AttnBlock(block_in)
217
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
218
+
219
+ # upsampling
220
+ self.up = nn.ModuleList()
221
+ for i_level in reversed(range(self.num_resolutions)):
222
+ block = nn.ModuleList()
223
+ attn = nn.ModuleList()
224
+ block_out = ch * ch_mult[i_level]
225
+ for _ in range(self.num_res_blocks + 1):
226
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
227
+ block_in = block_out
228
+ up = nn.Module()
229
+ up.block = block
230
+ up.attn = attn
231
+ if i_level != 0:
232
+ up.upsample = Upsample(block_in)
233
+ curr_res = curr_res * 2
234
+ self.up.insert(0, up) # prepend to get consistent order
235
+
236
+ # end
237
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
238
+ self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
239
+
240
+ def forward(self, z: Tensor) -> Tensor:
241
+ # z to block_in
242
+ h = self.conv_in(z)
243
+
244
+ # middle
245
+ h = self.mid.block_1(h)
246
+ h = self.mid.attn_1(h)
247
+ h = self.mid.block_2(h)
248
+
249
+ # upsampling
250
+ for i_level in reversed(range(self.num_resolutions)):
251
+ for i_block in range(self.num_res_blocks + 1):
252
+ h = self.up[i_level].block[i_block](h)
253
+ if len(self.up[i_level].attn) > 0:
254
+ h = self.up[i_level].attn[i_block](h)
255
+ if i_level != 0:
256
+ h = self.up[i_level].upsample(h)
257
+
258
+ # end
259
+ h = self.norm_out(h)
260
+ h = swish(h)
261
+ h = self.conv_out(h)
262
+ return h
263
+
264
+
265
+ class DiagonalGaussian(nn.Module):
266
+ def __init__(self, sample: bool = True, chunk_dim: int = 1):
267
+ super().__init__()
268
+ self.sample = sample
269
+ self.chunk_dim = chunk_dim
270
+
271
+ def forward(self, z: Tensor) -> Tensor:
272
+ mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
273
+ if self.sample:
274
+ std = torch.exp(0.5 * logvar)
275
+ return mean + std * torch.randn_like(mean)
276
+ else:
277
+ return mean
278
+
279
+
280
+ class AutoEncoder(nn.Module):
281
+ def __init__(
282
+ self,
283
+ resolution: int,
284
+ in_channels: int,
285
+ ch: int,
286
+ out_ch: int,
287
+ ch_mult: list[int],
288
+ num_res_blocks: int,
289
+ z_channels: int,
290
+ scale_factor: float,
291
+ shift_factor: float,
292
+ ):
293
+ super().__init__()
294
+ self.encoder = Encoder(
295
+ resolution=resolution,
296
+ in_channels=in_channels,
297
+ ch=ch,
298
+ ch_mult=ch_mult,
299
+ num_res_blocks=num_res_blocks,
300
+ z_channels=z_channels,
301
+ )
302
+ self.decoder = Decoder(
303
+ resolution=resolution,
304
+ in_channels=in_channels,
305
+ ch=ch,
306
+ out_ch=out_ch,
307
+ ch_mult=ch_mult,
308
+ num_res_blocks=num_res_blocks,
309
+ z_channels=z_channels,
310
+ )
311
+ self.reg = DiagonalGaussian()
312
+
313
+ self.scale_factor = scale_factor
314
+ self.shift_factor = shift_factor
315
+
316
+ def encode(self, x: Tensor) -> Tensor:
317
+ z = self.reg(self.encoder(x))
318
+ z = self.scale_factor * (z - self.shift_factor)
319
+ return z
320
+
321
+ def decode(self, z: Tensor) -> Tensor:
322
+ z = z / self.scale_factor + self.shift_factor
323
+ return self.decoder(z)
324
+
325
+ def forward(self, x: Tensor) -> Tensor:
326
+ return self.decode(self.encode(x))
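
A shape-level sketch of the AutoEncoder above with the same hyperparameters app.py passes to it; it runs on randomly initialized weights, so it only illustrates the 8x spatial compression into 16 latent channels, not reconstruction quality.

    import torch

    from modules.autoencoder import AutoEncoder

    ae = AutoEncoder(
        resolution=256, in_channels=3, ch=128, out_ch=3,
        ch_mult=[1, 2, 4, 4], num_res_blocks=2, z_channels=16,
        scale_factor=0.3611, shift_factor=0.1159,
    )

    x = torch.randn(1, 3, 512, 512)  # image tensor scaled to [-1, 1]
    z = ae.encode(x)                 # latent: [1, 16, 64, 64] (three downsampling stages -> 8x)
    rec = ae.decode(z)               # back to [1, 3, 512, 512]
    print(z.shape, rec.shape)
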
modules/conditioner.py ADDED
@@ -0,0 +1,216 @@
1
+ import torch
2
+ from qwen_vl_utils import process_vision_info
3
+ from transformers import (
4
+ AutoProcessor,
5
+ Qwen2VLForConditionalGeneration,
6
+ Qwen2_5_VLForConditionalGeneration,
7
+ )
8
+ from torchvision.transforms import ToPILImage
9
+
10
+ to_pil = ToPILImage()
11
+
12
+ Qwen25VL_7b_PREFIX = '''Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:
13
+ - If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.
14
+ - If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.\n
15
+ Here are examples of how to transform or refine prompts:
16
+ - User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.
17
+ - User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.\n
18
+ Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:
19
+ User Prompt:'''
20
+
21
+
22
+ def split_string(s):
23
+ # Replace Chinese quotation marks with English quotes
24
+ s = s.replace("“", '"').replace("”", '"') # use english quotes
25
+ result = []
26
+ # Flag: are we currently inside a quoted span?
27
+ in_quotes = False
28
+ temp = ""
29
+
30
+ # Iterate over each character and its index
31
+ for idx, char in enumerate(s):
32
+ # If the character is a quote and its index is greater than 155
33
+ if char == '"' and idx > 155:
34
+ # Append the quote to the temporary string
35
+ temp += char
36
+ # If we are not inside quotes
37
+ if not in_quotes:
38
+ # Append the temporary string to the result list
39
+ result.append(temp)
40
+ # Reset the temporary string
41
+ temp = ""
42
+
43
+ # Toggle the in-quotes state
44
+ in_quotes = not in_quotes
45
+ continue
46
+ # If we are inside quotes
47
+ if in_quotes:
48
+ # If the character is whitespace
49
+ if char.isspace():
50
+ pass # have space token
51
+
52
+ # Wrap the character in Chinese quotes and append it to the result list
53
+ result.append("“" + char + "”")
54
+ else:
55
+ # Append the character to the temporary string
56
+ temp += char
57
+
58
+ # If the temporary string is not empty
59
+ if temp:
60
+ # Append the temporary string to the result list
61
+ result.append(temp)
62
+
63
+ return result
64
+
65
+
66
+ class Qwen25VL_7b_Embedder(torch.nn.Module):
67
+ def __init__(self, model_path, max_length=640, dtype=torch.bfloat16, device="cuda"):
68
+ super(Qwen25VL_7b_Embedder, self).__init__()
69
+ self.max_length = max_length
70
+ self.dtype = dtype
71
+ self.device = device
72
+
73
+ self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
74
+ model_path,
75
+ torch_dtype=dtype,
76
+ attn_implementation="eager",
77
+ ).to(torch.cuda.current_device())
78
+
79
+ self.model.requires_grad_(False)
80
+ self.processor = AutoProcessor.from_pretrained(
81
+ model_path, min_pixels=256 * 28 * 28, max_pixels=324 * 28 * 28
82
+ )
83
+
84
+ self.prefix = Qwen25VL_7b_PREFIX
85
+
86
+ def forward(self, caption, ref_images):
87
+ text_list = caption
88
+ embs = torch.zeros(
89
+ len(text_list),
90
+ self.max_length,
91
+ self.model.config.hidden_size,
92
+ dtype=torch.bfloat16,
93
+ device=torch.cuda.current_device(),
94
+ )
95
+ hidden_states = torch.zeros(
96
+ len(text_list),
97
+ self.max_length,
98
+ self.model.config.hidden_size,
99
+ dtype=torch.bfloat16,
100
+ device=torch.cuda.current_device(),
101
+ )
102
+ masks = torch.zeros(
103
+ len(text_list),
104
+ self.max_length,
105
+ dtype=torch.long,
106
+ device=torch.cuda.current_device(),
107
+ )
108
+ input_ids_list = []
109
+ attention_mask_list = []
110
+ emb_list = []
111
+
112
+ def split_string(s):
113
+ s = s.replace("“", '"').replace("”", '"').replace("'", '''"''') # use english quotes
114
+ result = []
115
+ in_quotes = False
116
+ temp = ""
117
+
118
+ for idx,char in enumerate(s):
119
+ if char == '"' and idx>155:
120
+ temp += char
121
+ if not in_quotes:
122
+ result.append(temp)
123
+ temp = ""
124
+
125
+ in_quotes = not in_quotes
126
+ continue
127
+ if in_quotes:
128
+ if char.isspace():
129
+ pass # have space token
130
+
131
+ result.append("“" + char + "”")
132
+ else:
133
+ temp += char
134
+
135
+ if temp:
136
+ result.append(temp)
137
+
138
+ return result
139
+
140
+ for idx, (txt, imgs) in enumerate(zip(text_list, ref_images)):
141
+
142
+ messages = [{"role": "user", "content": []}]
143
+
144
+ messages[0]["content"].append({"type": "text", "text": f"{self.prefix}"})
145
+
146
+ messages[0]["content"].append({"type": "image", "image": to_pil(imgs)})
147
+
148
+ # Then append the text
149
+ messages[0]["content"].append({"type": "text", "text": f"{txt}"})
150
+
151
+ # Preparation for inference
152
+ text = self.processor.apply_chat_template(
153
+ messages, tokenize=False, add_generation_prompt=True, add_vision_id=True
154
+ )
155
+
156
+ image_inputs, video_inputs = process_vision_info(messages)
157
+
158
+ inputs = self.processor(
159
+ text=[text],
160
+ images=image_inputs,
161
+ padding=True,
162
+ return_tensors="pt",
163
+ )
164
+
165
+ old_inputs_ids = inputs.input_ids
166
+ text_split_list = split_string(text)
167
+
168
+ token_list = []
169
+ for text_each in text_split_list:
170
+ txt_inputs = self.processor(
171
+ text=text_each,
172
+ images=None,
173
+ videos=None,
174
+ padding=True,
175
+ return_tensors="pt",
176
+ )
177
+ token_each = txt_inputs.input_ids
178
+ if token_each[0][0] == 2073 and token_each[0][-1] == 854:
179
+ token_each = token_each[:, 1:-1]
180
+ token_list.append(token_each)
181
+ else:
182
+ token_list.append(token_each)
183
+
184
+ new_txt_ids = torch.cat(token_list, dim=1).to("cuda")
185
+
186
+ new_txt_ids = new_txt_ids.to(old_inputs_ids.device)
187
+
188
+ idx1 = (old_inputs_ids == 151653).nonzero(as_tuple=True)[1][0]
189
+ idx2 = (new_txt_ids == 151653).nonzero(as_tuple=True)[1][0]
190
+ inputs.input_ids = (
191
+ torch.cat([old_inputs_ids[0, :idx1], new_txt_ids[0, idx2:]], dim=0)
192
+ .unsqueeze(0)
193
+ .to("cuda")
194
+ )
195
+ inputs.attention_mask = (inputs.input_ids > 0).long().to("cuda")
196
+ outputs = self.model(
197
+ input_ids=inputs.input_ids,
198
+ attention_mask=inputs.attention_mask,
199
+ pixel_values=inputs.pixel_values.to("cuda"),
200
+ image_grid_thw=inputs.image_grid_thw.to("cuda"),
201
+ output_hidden_states=True,
202
+ )
203
+
204
+ emb = outputs["hidden_states"][-1]
205
+
206
+ embs[idx, : min(self.max_length, emb.shape[1] - 217)] = emb[0, 217:][
207
+ : self.max_length
208
+ ]
209
+
210
+ masks[idx, : min(self.max_length, emb.shape[1] - 217)] = torch.ones(
211
+ (min(self.max_length, emb.shape[1] - 217)),
212
+ dtype=torch.long,
213
+ device=torch.cuda.current_device(),
214
+ )
215
+
216
+ return embs, masks
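
A call-signature sketch for the embedder above. It assumes the Qwen2.5-VL-7B-Instruct weights can be downloaded and a CUDA device is present, since the module hard-codes torch.cuda.current_device(); the caption and tensor sizes are arbitrary.

    import torch

    from modules.conditioner import Qwen25VL_7b_Embedder

    embedder = Qwen25VL_7b_Embedder("Qwen/Qwen2.5-VL-7B-Instruct", max_length=640)

    # One caption per reference image; each reference image is a CHW tensor in [0, 1]
    captions = ["Remove the person from the image."]
    ref_images = torch.rand(1, 3, 512, 512)

    embs, masks = embedder(captions, ref_images)
    print(embs.shape, masks.shape)  # [1, 640, hidden_size] embedding, [1, 640] validity mask
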
modules/connector_edit.py ADDED
@@ -0,0 +1,486 @@
1
+ from typing import Optional
2
+
3
+ import torch
4
+ import torch.nn
5
+ from einops import rearrange
6
+ from torch import nn
7
+
8
+ from .layers import MLP, TextProjection, TimestepEmbedder, apply_gate, attention
9
+
10
+
11
+ class RMSNorm(nn.Module):
12
+ def __init__(
13
+ self,
14
+ dim: int,
15
+ elementwise_affine=True,
16
+ eps: float = 1e-6,
17
+ device=None,
18
+ dtype=None,
19
+ ):
20
+ """
21
+ Initialize the RMSNorm normalization layer.
22
+
23
+ Args:
24
+ dim (int): The dimension of the input tensor.
25
+ eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
26
+
27
+ Attributes:
28
+ eps (float): A small value added to the denominator for numerical stability.
29
+ weight (nn.Parameter): Learnable scaling parameter.
30
+
31
+ """
32
+ factory_kwargs = {"device": device, "dtype": dtype}
33
+ super().__init__()
34
+ self.eps = eps
35
+ if elementwise_affine:
36
+ self.weight = nn.Parameter(torch.ones(dim, **factory_kwargs))
37
+
38
+ def _norm(self, x):
39
+ """
40
+ Apply the RMSNorm normalization to the input tensor.
41
+
42
+ Args:
43
+ x (torch.Tensor): The input tensor.
44
+
45
+ Returns:
46
+ torch.Tensor: The normalized tensor.
47
+
48
+ """
49
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
50
+
51
+ def forward(self, x):
52
+ """
53
+ Forward pass through the RMSNorm layer.
54
+
55
+ Args:
56
+ x (torch.Tensor): The input tensor.
57
+
58
+ Returns:
59
+ torch.Tensor: The output tensor after applying RMSNorm.
60
+
61
+ """
62
+ output = self._norm(x.float()).type_as(x)
63
+ if hasattr(self, "weight"):
64
+ output = output * self.weight
65
+ return output
66
+
67
+
68
+ def get_norm_layer(norm_layer):
69
+ """
70
+ Get the normalization layer.
71
+
72
+ Args:
73
+ norm_layer (str): The type of normalization layer.
74
+
75
+ Returns:
76
+ norm_layer (nn.Module): The normalization layer.
77
+ """
78
+ if norm_layer == "layer":
79
+ return nn.LayerNorm
80
+ elif norm_layer == "rms":
81
+ return RMSNorm
82
+ else:
83
+ raise NotImplementedError(f"Norm layer {norm_layer} is not implemented")
84
+
85
+
86
+ def get_activation_layer(act_type):
87
+ """get activation layer
88
+
89
+ Args:
90
+ act_type (str): the activation type
91
+
92
+ Returns:
93
+ torch.nn.functional: the activation layer
94
+ """
95
+ if act_type == "gelu":
96
+ return lambda: nn.GELU()
97
+ elif act_type == "gelu_tanh":
98
+ return lambda: nn.GELU(approximate="tanh")
99
+ elif act_type == "relu":
100
+ return nn.ReLU
101
+ elif act_type == "silu":
102
+ return nn.SiLU
103
+ else:
104
+ raise ValueError(f"Unknown activation type: {act_type}")
105
+
106
+ class IndividualTokenRefinerBlock(torch.nn.Module):
107
+ def __init__(
108
+ self,
109
+ hidden_size,
110
+ heads_num,
111
+ mlp_width_ratio: str = 4.0,
112
+ mlp_drop_rate: float = 0.0,
113
+ act_type: str = "silu",
114
+ qk_norm: bool = False,
115
+ qk_norm_type: str = "layer",
116
+ qkv_bias: bool = True,
117
+ need_CA: bool = False,
118
+ dtype: Optional[torch.dtype] = None,
119
+ device: Optional[torch.device] = None,
120
+ ):
121
+ factory_kwargs = {"device": device, "dtype": dtype}
122
+ super().__init__()
123
+ self.need_CA = need_CA
124
+ self.heads_num = heads_num
125
+ head_dim = hidden_size // heads_num
126
+ mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
127
+
128
+ self.norm1 = nn.LayerNorm(
129
+ hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
130
+ )
131
+ self.self_attn_qkv = nn.Linear(
132
+ hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs
133
+ )
134
+ qk_norm_layer = get_norm_layer(qk_norm_type)
135
+ self.self_attn_q_norm = (
136
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
137
+ if qk_norm
138
+ else nn.Identity()
139
+ )
140
+ self.self_attn_k_norm = (
141
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
142
+ if qk_norm
143
+ else nn.Identity()
144
+ )
145
+ self.self_attn_proj = nn.Linear(
146
+ hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
147
+ )
148
+
149
+ self.norm2 = nn.LayerNorm(
150
+ hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
151
+ )
152
+ act_layer = get_activation_layer(act_type)
153
+ self.mlp = MLP(
154
+ in_channels=hidden_size,
155
+ hidden_channels=mlp_hidden_dim,
156
+ act_layer=act_layer,
157
+ drop=mlp_drop_rate,
158
+ **factory_kwargs,
159
+ )
160
+
161
+ self.adaLN_modulation = nn.Sequential(
162
+ act_layer(),
163
+ nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
164
+ )
165
+
166
+ if self.need_CA:
167
+ self.cross_attnblock=CrossAttnBlock(hidden_size=hidden_size,
168
+ heads_num=heads_num,
169
+ mlp_width_ratio=mlp_width_ratio,
170
+ mlp_drop_rate=mlp_drop_rate,
171
+ act_type=act_type,
172
+ qk_norm=qk_norm,
173
+ qk_norm_type=qk_norm_type,
174
+ qkv_bias=qkv_bias,
175
+ **factory_kwargs,)
176
+ # Zero-initialize the modulation
177
+ nn.init.zeros_(self.adaLN_modulation[1].weight)
178
+ nn.init.zeros_(self.adaLN_modulation[1].bias)
179
+
180
+ def forward(
181
+ self,
182
+ x: torch.Tensor,
183
+ c: torch.Tensor, # timestep_aware_representations + context_aware_representations
184
+ attn_mask: torch.Tensor = None,
185
+ y: torch.Tensor = None,
186
+ ):
187
+ gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1)
188
+
189
+ norm_x = self.norm1(x)
190
+ qkv = self.self_attn_qkv(norm_x)
191
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
192
+ # Apply QK-Norm if needed
193
+ q = self.self_attn_q_norm(q).to(v)
194
+ k = self.self_attn_k_norm(k).to(v)
195
+
196
+ # Self-Attention
197
+ attn = attention(q, k, v, mode="torch", attn_mask=attn_mask)
198
+
199
+ x = x + apply_gate(self.self_attn_proj(attn), gate_msa)
200
+
201
+ if self.need_CA:
202
+ x = self.cross_attnblock(x, c, attn_mask, y)
203
+
204
+ # FFN Layer
205
+ x = x + apply_gate(self.mlp(self.norm2(x)), gate_mlp)
206
+
207
+ return x
208
+
209
+
210
+
211
+
212
+ class CrossAttnBlock(torch.nn.Module):
213
+ def __init__(
214
+ self,
215
+ hidden_size,
216
+ heads_num,
217
+ mlp_width_ratio: str = 4.0,
218
+ mlp_drop_rate: float = 0.0,
219
+ act_type: str = "silu",
220
+ qk_norm: bool = False,
221
+ qk_norm_type: str = "layer",
222
+ qkv_bias: bool = True,
223
+ dtype: Optional[torch.dtype] = None,
224
+ device: Optional[torch.device] = None,
225
+ ):
226
+ factory_kwargs = {"device": device, "dtype": dtype}
227
+ super().__init__()
228
+ self.heads_num = heads_num
229
+ head_dim = hidden_size // heads_num
230
+
231
+ self.norm1 = nn.LayerNorm(
232
+ hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
233
+ )
234
+ self.norm1_2 = nn.LayerNorm(
235
+ hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
236
+ )
237
+ self.self_attn_q = nn.Linear(
238
+ hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
239
+ )
240
+ self.self_attn_kv = nn.Linear(
241
+ hidden_size, hidden_size*2, bias=qkv_bias, **factory_kwargs
242
+ )
243
+ qk_norm_layer = get_norm_layer(qk_norm_type)
244
+ self.self_attn_q_norm = (
245
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
246
+ if qk_norm
247
+ else nn.Identity()
248
+ )
249
+ self.self_attn_k_norm = (
250
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
251
+ if qk_norm
252
+ else nn.Identity()
253
+ )
254
+ self.self_attn_proj = nn.Linear(
255
+ hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
256
+ )
257
+
258
+ self.norm2 = nn.LayerNorm(
259
+ hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
260
+ )
261
+ act_layer = get_activation_layer(act_type)
262
+
263
+ self.adaLN_modulation = nn.Sequential(
264
+ act_layer(),
265
+ nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
266
+ )
267
+ # Zero-initialize the modulation
268
+ nn.init.zeros_(self.adaLN_modulation[1].weight)
269
+ nn.init.zeros_(self.adaLN_modulation[1].bias)
270
+
271
+ def forward(
272
+ self,
273
+ x: torch.Tensor,
274
+ c: torch.Tensor, # timestep_aware_representations + context_aware_representations
275
+ attn_mask: torch.Tensor = None,
276
+ y: torch.Tensor=None,
277
+
278
+ ):
279
+ gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1)
280
+
281
+ norm_x = self.norm1(x)
282
+ norm_y = self.norm1_2(y)
283
+ q = self.self_attn_q(norm_x)
284
+ q = rearrange(q, "B L (H D) -> B L H D", H=self.heads_num)
285
+ kv = self.self_attn_kv(norm_y)
286
+ k, v = rearrange(kv, "B L (K H D) -> K B L H D", K=2, H=self.heads_num)
287
+ # Apply QK-Norm if needed
288
+ q = self.self_attn_q_norm(q).to(v)
289
+ k = self.self_attn_k_norm(k).to(v)
290
+
291
+ # Self-Attention
292
+ attn = attention(q, k, v, mode="torch", attn_mask=attn_mask)
293
+
294
+ x = x + apply_gate(self.self_attn_proj(attn), gate_msa)
295
+
296
+ return x
297
+
298
+
299
+
300
+ class IndividualTokenRefiner(torch.nn.Module):
301
+ def __init__(
302
+ self,
303
+ hidden_size,
304
+ heads_num,
305
+ depth,
306
+ mlp_width_ratio: float = 4.0,
307
+ mlp_drop_rate: float = 0.0,
308
+ act_type: str = "silu",
309
+ qk_norm: bool = False,
310
+ qk_norm_type: str = "layer",
311
+ qkv_bias: bool = True,
312
+ need_CA:bool=False,
313
+ dtype: Optional[torch.dtype] = None,
314
+ device: Optional[torch.device] = None,
315
+ ):
316
+
317
+ factory_kwargs = {"device": device, "dtype": dtype}
318
+ super().__init__()
319
+ self.need_CA = need_CA
320
+ self.blocks = nn.ModuleList(
321
+ [
322
+ IndividualTokenRefinerBlock(
323
+ hidden_size=hidden_size,
324
+ heads_num=heads_num,
325
+ mlp_width_ratio=mlp_width_ratio,
326
+ mlp_drop_rate=mlp_drop_rate,
327
+ act_type=act_type,
328
+ qk_norm=qk_norm,
329
+ qk_norm_type=qk_norm_type,
330
+ qkv_bias=qkv_bias,
331
+ need_CA=self.need_CA,
332
+ **factory_kwargs,
333
+ )
334
+ for _ in range(depth)
335
+ ]
336
+ )
337
+
338
+
339
+ def forward(
340
+ self,
341
+ x: torch.Tensor,
342
+ c: torch.LongTensor,
343
+ mask: Optional[torch.Tensor] = None,
344
+ y:torch.Tensor=None,
345
+ ):
346
+ self_attn_mask = None
347
+ if mask is not None:
348
+ batch_size = mask.shape[0]
349
+ seq_len = mask.shape[1]
350
+ mask = mask.to(x.device)
351
+ # batch_size x 1 x seq_len x seq_len
352
+ self_attn_mask_1 = mask.view(batch_size, 1, 1, seq_len).repeat(
353
+ 1, 1, seq_len, 1
354
+ )
355
+ # batch_size x 1 x seq_len x seq_len
356
+ self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
357
+ # batch_size x 1 x seq_len x seq_len, 1 for broadcasting of heads_num
358
+ self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
359
+ # avoids self-attention weight being NaN for padding tokens
360
+ self_attn_mask[:, :, :, 0] = True
361
+
362
+
363
+ for block in self.blocks:
364
+ x = block(x, c, self_attn_mask,y)
365
+
366
+ return x
367
+
368
+
369
+ class SingleTokenRefiner(torch.nn.Module):
370
+ """
371
+ A single token refiner block for llm text embedding refine.
372
+ """
373
+ def __init__(
374
+ self,
375
+ in_channels,
376
+ hidden_size,
377
+ heads_num,
378
+ depth,
379
+ mlp_width_ratio: float = 4.0,
380
+ mlp_drop_rate: float = 0.0,
381
+ act_type: str = "silu",
382
+ qk_norm: bool = False,
383
+ qk_norm_type: str = "layer",
384
+ qkv_bias: bool = True,
385
+ need_CA:bool=False,
386
+ attn_mode: str = "torch",
387
+ dtype: Optional[torch.dtype] = None,
388
+ device: Optional[torch.device] = None,
389
+ ):
390
+ factory_kwargs = {"device": device, "dtype": dtype}
391
+ super().__init__()
392
+ self.attn_mode = attn_mode
393
+ self.need_CA = need_CA
394
+ assert self.attn_mode == "torch", "Only support 'torch' mode for token refiner."
395
+
396
+ self.input_embedder = nn.Linear(
397
+ in_channels, hidden_size, bias=True, **factory_kwargs
398
+ )
399
+ if self.need_CA:
400
+ self.input_embedder_CA = nn.Linear(
401
+ in_channels, hidden_size, bias=True, **factory_kwargs
402
+ )
403
+
404
+ act_layer = get_activation_layer(act_type)
405
+ # Build timestep embedding layer
406
+ self.t_embedder = TimestepEmbedder(hidden_size, act_layer, **factory_kwargs)
407
+ # Build context embedding layer
408
+ self.c_embedder = TextProjection(
409
+ in_channels, hidden_size, act_layer, **factory_kwargs
410
+ )
411
+
412
+ self.individual_token_refiner = IndividualTokenRefiner(
413
+ hidden_size=hidden_size,
414
+ heads_num=heads_num,
415
+ depth=depth,
416
+ mlp_width_ratio=mlp_width_ratio,
417
+ mlp_drop_rate=mlp_drop_rate,
418
+ act_type=act_type,
419
+ qk_norm=qk_norm,
420
+ qk_norm_type=qk_norm_type,
421
+ qkv_bias=qkv_bias,
422
+ need_CA=need_CA,
423
+ **factory_kwargs,
424
+ )
425
+
426
+ def forward(
427
+ self,
428
+ x: torch.Tensor,
429
+ t: torch.LongTensor,
430
+ mask: Optional[torch.LongTensor] = None,
431
+ y: torch.LongTensor=None,
432
+ ):
433
+ timestep_aware_representations = self.t_embedder(t)
434
+
435
+ if mask is None:
436
+ context_aware_representations = x.mean(dim=1)
437
+ else:
438
+ mask_float = mask.unsqueeze(-1) # [b, s1, 1]
439
+ context_aware_representations = (x * mask_float).sum(
440
+ dim=1
441
+ ) / mask_float.sum(dim=1)
442
+ context_aware_representations = self.c_embedder(context_aware_representations)
443
+ c = timestep_aware_representations + context_aware_representations
444
+
445
+ x = self.input_embedder(x)
446
+ if self.need_CA:
447
+ y = self.input_embedder_CA(y)
448
+ x = self.individual_token_refiner(x, c, mask, y)
449
+ else:
450
+ x = self.individual_token_refiner(x, c, mask)
451
+
452
+ return x
453
+
454
+
455
+
456
+ class Qwen2Connector(torch.nn.Module):
457
+ def __init__(
458
+ self,
459
+ # biclip_dim=1024,
460
+ in_channels=3584,
461
+ hidden_size=4096,
462
+ heads_num=32,
463
+ depth=2,
464
+ need_CA=False,
465
+ device=None,
466
+ dtype=torch.bfloat16,
467
+ ):
468
+ super().__init__()
469
+ factory_kwargs = {"device": device, "dtype": dtype}
470
+
471
+ self.S = SingleTokenRefiner(in_channels=in_channels, hidden_size=hidden_size, heads_num=heads_num, depth=depth, need_CA=need_CA, **factory_kwargs)
472
+ self.global_proj_out = nn.Linear(in_channels, 768)
473
+
474
+ self.scale_factor = nn.Parameter(torch.zeros(1))
475
+ with torch.no_grad():
476
+ self.scale_factor.data += -(1 - 0.09)
477
+
478
+ def forward(self, x, t, mask):
479
+ mask_float = mask.unsqueeze(-1) # [b, s1, 1]
480
+ x_mean = (x * mask_float).sum(
481
+ dim=1
482
+ ) / mask_float.sum(dim=1) * (1 + self.scale_factor)
483
+
484
+ global_out = self.global_proj_out(x_mean)
485
+ encoder_hidden_states = self.S(x, t, mask)
486
+ return encoder_hidden_states, global_out
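A minimal shape-level sketch of how this connector might be exercised, assuming the file lands at modules/connector_edit.py as in this commit; the batch size, token count, and timestep below are made up, and dtype=torch.float32 is requested only so the inputs match global_proj_out, which is built without the factory kwargs:

import torch
from modules.connector_edit import Qwen2Connector

# illustrative shapes: 2 prompts, 16 LLM tokens, 3584-dim Qwen2 hidden states (the in_channels default)
x = torch.randn(2, 16, 3584)
t = torch.full((2,), 500, dtype=torch.long)   # one timestep index per sample
mask = torch.ones(2, 16, dtype=torch.long)    # 1 = real token, 0 = padding

connector = Qwen2Connector(dtype=torch.float32)   # default is bfloat16
encoder_hidden_states, global_out = connector(x, t, mask)
print(encoder_hidden_states.shape)   # (2, 16, 4096): refined per-token embeddings
print(global_out.shape)              # (2, 768): pooled global conditioning vector

The per-token width (4096) and the pooled width (768) come directly from the hidden_size and global_proj_out defaults above.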
modules/layers.py ADDED
@@ -0,0 +1,640 @@
1
+ # Modified from Flux
2
+ #
3
+ # Copyright 2024 Black Forest Labs
4
+
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+ # This source code is licensed under the license found in the
18
+ # LICENSE file in the root directory of this source tree.
19
+
20
+ import math # noqa: I001
21
+ from dataclasses import dataclass
22
+ from functools import partial
23
+
24
+ import torch
25
+ import torch.nn.functional as F
26
+ from einops import rearrange
27
+ # from liger_kernel.ops.rms_norm import LigerRMSNormFunction
28
+ from torch import Tensor, nn
29
+
30
+
31
+ try:
32
+ import flash_attn
33
+ from flash_attn.flash_attn_interface import (
34
+ _flash_attn_forward,
35
+ flash_attn_varlen_func,
36
+ )
37
+ except ImportError:
38
+ flash_attn = None
39
+ flash_attn_varlen_func = None
40
+ _flash_attn_forward = None
41
+
42
+
43
+ MEMORY_LAYOUT = {
44
+ "flash": (
45
+ lambda x: x.view(x.shape[0] * x.shape[1], *x.shape[2:]),
46
+ lambda x: x,
47
+ ),
48
+ "torch": (
49
+ lambda x: x.transpose(1, 2),
50
+ lambda x: x.transpose(1, 2),
51
+ ),
52
+ "vanilla": (
53
+ lambda x: x.transpose(1, 2),
54
+ lambda x: x.transpose(1, 2),
55
+ ),
56
+ }
57
+
58
+
59
+ def attention(
60
+ q,
61
+ k,
62
+ v,
63
+ mode="torch",
64
+ drop_rate=0,
65
+ attn_mask=None,
66
+ causal=False,
67
+ cu_seqlens_q=None,
68
+ cu_seqlens_kv=None,
69
+ max_seqlen_q=None,
70
+ max_seqlen_kv=None,
71
+ batch_size=1,
72
+ ):
73
+ """
74
+ Perform QKV self attention.
75
+
76
+ Args:
77
+ q (torch.Tensor): Query tensor with shape [b, s, a, d], where a is the number of heads.
78
+ k (torch.Tensor): Key tensor with shape [b, s1, a, d]
79
+ v (torch.Tensor): Value tensor with shape [b, s1, a, d]
80
+ mode (str): Attention mode. Choose from 'flash', 'torch', and 'vanilla'.
81
+ drop_rate (float): Dropout rate in attention map. (default: 0)
82
+ attn_mask (torch.Tensor): Attention mask with shape [b, s1] (cross_attn), or [b, a, s, s1] (torch or vanilla).
83
+ (default: None)
84
+ causal (bool): Whether to use causal attention. (default: False)
85
+ cu_seqlens_q (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
86
+ used to index into q.
87
+ cu_seqlens_kv (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
88
+ used to index into kv.
89
+ max_seqlen_q (int): The maximum sequence length in the batch of q.
90
+ max_seqlen_kv (int): The maximum sequence length in the batch of k and v.
91
+
92
+ Returns:
93
+ torch.Tensor: Output tensor after self-attention with shape [b, s, a*d]
94
+ """
95
+ pre_attn_layout, post_attn_layout = MEMORY_LAYOUT[mode]
96
+ q = pre_attn_layout(q)
97
+ k = pre_attn_layout(k)
98
+ v = pre_attn_layout(v)
99
+
100
+ if mode == "torch":
101
+ if attn_mask is not None and attn_mask.dtype != torch.bool:
102
+ attn_mask = attn_mask.to(q.dtype)
103
+ x = F.scaled_dot_product_attention(
104
+ q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal
105
+ )
106
+ elif mode == "flash":
107
+ assert flash_attn_varlen_func is not None
108
+ x: torch.Tensor = flash_attn_varlen_func(
109
+ q,
110
+ k,
111
+ v,
112
+ cu_seqlens_q,
113
+ cu_seqlens_kv,
114
+ max_seqlen_q,
115
+ max_seqlen_kv,
116
+ ) # type: ignore
117
+ # x with shape [(bxs), a, d]
118
+ x = x.view(batch_size, max_seqlen_q, x.shape[-2], x.shape[-1]) # type: ignore # reshape x to [b, s, a, d]
119
+ elif mode == "vanilla":
120
+ scale_factor = 1 / math.sqrt(q.size(-1))
121
+
122
+ b, a, s, _ = q.shape
123
+ s1 = k.size(2)
124
+ attn_bias = torch.zeros(b, a, s, s1, dtype=q.dtype, device=q.device)
125
+ if causal:
126
+ # Only applied to self attention
127
+ assert attn_mask is None, (
128
+ "Causal mask and attn_mask cannot be used together"
129
+ )
130
+ temp_mask = torch.ones(b, a, s, s, dtype=torch.bool, device=q.device).tril(
131
+ diagonal=0
132
+ )
133
+ attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
134
+ attn_bias = attn_bias.to(q.dtype)
135
+
136
+ if attn_mask is not None:
137
+ if attn_mask.dtype == torch.bool:
138
+ attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
139
+ else:
140
+ attn_bias += attn_mask
141
+
142
+ # TODO: Maybe force q and k to be float32 to avoid numerical overflow
143
+ attn = (q @ k.transpose(-2, -1)) * scale_factor
144
+ attn += attn_bias
145
+ attn = attn.softmax(dim=-1)
146
+ attn = torch.dropout(attn, p=drop_rate, train=True)
147
+ x = attn @ v
148
+ else:
149
+ raise NotImplementedError(f"Unsupported attention mode: {mode}")
150
+
151
+ x = post_attn_layout(x)
152
+ b, s, a, d = x.shape
153
+ out = x.reshape(b, s, -1)
154
+ return out
155
+
156
+
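As a quick check of the helper above, here is a sketch in the default "torch" mode, the only path that needs no extra metadata ("flash" additionally requires flash-attn plus the cu_seqlens/max_seqlen arguments); all shapes are illustrative:

import torch
from modules.layers import attention

b, s, heads, head_dim = 2, 10, 8, 64
q = torch.randn(b, s, heads, head_dim)
k = torch.randn(b, s, heads, head_dim)
v = torch.randn(b, s, heads, head_dim)

out = attention(q, k, v, mode="torch")   # dispatches to F.scaled_dot_product_attention
print(out.shape)                         # (2, 10, 512) == [b, s, heads * head_dim]

# a boolean mask broadcast over heads: True = attend, False = ignore
attn_mask = torch.ones(b, 1, s, s, dtype=torch.bool)
out_masked = attention(q, k, v, mode="torch", attn_mask=attn_mask)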
157
+ def apply_gate(x, gate=None, tanh=False):
158
+ """Apply an optional gating tensor to the input.
159
+
160
+ Args:
161
+ x (torch.Tensor): input tensor.
162
+ gate (torch.Tensor, optional): gate tensor. Defaults to None.
163
+ tanh (bool, optional): whether to use tanh function. Defaults to False.
164
+
165
+ Returns:
166
+ torch.Tensor: the output tensor after applying the gate.
167
+ """
168
+ if gate is None:
169
+ return x
170
+ if tanh:
171
+ return x * gate.unsqueeze(1).tanh()
172
+ else:
173
+ return x * gate.unsqueeze(1)
174
+
175
+
176
+ class MLP(nn.Module):
177
+ """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
178
+
179
+ def __init__(
180
+ self,
181
+ in_channels,
182
+ hidden_channels=None,
183
+ out_features=None,
184
+ act_layer=nn.GELU,
185
+ norm_layer=None,
186
+ bias=True,
187
+ drop=0.0,
188
+ use_conv=False,
189
+ device=None,
190
+ dtype=None,
191
+ ):
192
+ super().__init__()
193
+ out_features = out_features or in_channels
194
+ hidden_channels = hidden_channels or in_channels
195
+ bias = (bias, bias)
196
+ drop_probs = (drop, drop)
197
+ linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
198
+
199
+ self.fc1 = linear_layer(
200
+ in_channels, hidden_channels, bias=bias[0], device=device, dtype=dtype
201
+ )
202
+ self.act = act_layer()
203
+ self.drop1 = nn.Dropout(drop_probs[0])
204
+ self.norm = (
205
+ norm_layer(hidden_channels, device=device, dtype=dtype)
206
+ if norm_layer is not None
207
+ else nn.Identity()
208
+ )
209
+ self.fc2 = linear_layer(
210
+ hidden_channels, out_features, bias=bias[1], device=device, dtype=dtype
211
+ )
212
+ self.drop2 = nn.Dropout(drop_probs[1])
213
+
214
+ def forward(self, x):
215
+ x = self.fc1(x)
216
+ x = self.act(x)
217
+ x = self.drop1(x)
218
+ x = self.norm(x)
219
+ x = self.fc2(x)
220
+ x = self.drop2(x)
221
+ return x
222
+
223
+
224
+ class TextProjection(nn.Module):
225
+ """
226
+ Projects text embeddings. Also handles dropout for classifier-free guidance.
227
+
228
+ Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
229
+ """
230
+
231
+ def __init__(self, in_channels, hidden_size, act_layer, dtype=None, device=None):
232
+ factory_kwargs = {"dtype": dtype, "device": device}
233
+ super().__init__()
234
+ self.linear_1 = nn.Linear(
235
+ in_features=in_channels,
236
+ out_features=hidden_size,
237
+ bias=True,
238
+ **factory_kwargs,
239
+ )
240
+ self.act_1 = act_layer()
241
+ self.linear_2 = nn.Linear(
242
+ in_features=hidden_size,
243
+ out_features=hidden_size,
244
+ bias=True,
245
+ **factory_kwargs,
246
+ )
247
+
248
+ def forward(self, caption):
249
+ hidden_states = self.linear_1(caption)
250
+ hidden_states = self.act_1(hidden_states)
251
+ hidden_states = self.linear_2(hidden_states)
252
+ return hidden_states
253
+
254
+
255
+ class TimestepEmbedder(nn.Module):
256
+ """
257
+ Embeds scalar timesteps into vector representations.
258
+ """
259
+
260
+ def __init__(
261
+ self,
262
+ hidden_size,
263
+ act_layer,
264
+ frequency_embedding_size=256,
265
+ max_period=10000,
266
+ out_size=None,
267
+ dtype=None,
268
+ device=None,
269
+ ):
270
+ factory_kwargs = {"dtype": dtype, "device": device}
271
+ super().__init__()
272
+ self.frequency_embedding_size = frequency_embedding_size
273
+ self.max_period = max_period
274
+ if out_size is None:
275
+ out_size = hidden_size
276
+
277
+ self.mlp = nn.Sequential(
278
+ nn.Linear(
279
+ frequency_embedding_size, hidden_size, bias=True, **factory_kwargs
280
+ ),
281
+ act_layer(),
282
+ nn.Linear(hidden_size, out_size, bias=True, **factory_kwargs),
283
+ )
284
+ nn.init.normal_(self.mlp[0].weight, std=0.02) # type: ignore
285
+ nn.init.normal_(self.mlp[2].weight, std=0.02) # type: ignore
286
+
287
+ @staticmethod
288
+ def timestep_embedding(t, dim, max_period=10000):
289
+ """
290
+ Create sinusoidal timestep embeddings.
291
+
292
+ Args:
293
+ t (torch.Tensor): a 1-D Tensor of N indices, one per batch element. These may be fractional.
294
+ dim (int): the dimension of the output.
295
+ max_period (int): controls the minimum frequency of the embeddings.
296
+
297
+ Returns:
298
+ embedding (torch.Tensor): An (N, D) Tensor of positional embeddings.
299
+
300
+ .. ref_link: https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
301
+ """
302
+ half = dim // 2
303
+ freqs = torch.exp(
304
+ -math.log(max_period)
305
+ * torch.arange(start=0, end=half, dtype=torch.float32)
306
+ / half
307
+ ).to(device=t.device)
308
+ args = t[:, None].float() * freqs[None]
309
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
310
+ if dim % 2:
311
+ embedding = torch.cat(
312
+ [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
313
+ )
314
+ return embedding
315
+
316
+ def forward(self, t):
317
+ t_freq = self.timestep_embedding(
318
+ t, self.frequency_embedding_size, self.max_period
319
+ ).type(self.mlp[0].weight.dtype) # type: ignore
320
+ t_emb = self.mlp(t_freq)
321
+ return t_emb
322
+
323
+
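A small worked example of the sinusoidal embedding above (values arbitrary): each scalar timestep is mapped to dim/2 cosine and dim/2 sine features whose frequencies fall off geometrically from 1 toward roughly 1/max_period, and only the MLP on top carries learnable weights:

import torch
from modules.layers import TimestepEmbedder

t = torch.tensor([0.0, 250.0, 999.0])                   # three example timesteps
emb = TimestepEmbedder.timestep_embedding(t, dim=256)   # static method, no parameters involved
print(emb.shape)        # (3, 256): 128 cosine features followed by 128 sine features
print(emb[0, :3])       # t = 0 -> all cosine terms are exactly 1.0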
324
+ class EmbedND(nn.Module):
325
+ def __init__(self, dim: int, theta: int, axes_dim: list[int]):
326
+ super().__init__()
327
+ self.dim = dim
328
+ self.theta = theta
329
+ self.axes_dim = axes_dim
330
+
331
+ def forward(self, ids: Tensor) -> Tensor:
332
+ n_axes = ids.shape[-1]
333
+ emb = torch.cat(
334
+ [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
335
+ dim=-3,
336
+ )
337
+
338
+ return emb.unsqueeze(1)
339
+
340
+
341
+ class MLPEmbedder(nn.Module):
342
+ def __init__(self, in_dim: int, hidden_dim: int):
343
+ super().__init__()
344
+ self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
345
+ self.silu = nn.SiLU()
346
+ self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
347
+
348
+ def forward(self, x: Tensor) -> Tensor:
349
+ return self.out_layer(self.silu(self.in_layer(x)))
350
+
351
+
352
+ def rope(pos, dim: int, theta: int):
353
+ assert dim % 2 == 0
354
+ scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
355
+ omega = 1.0 / (theta**scale)
356
+ out = torch.einsum("...n,d->...nd", pos, omega)
357
+ out = torch.stack(
358
+ [torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1
359
+ )
360
+ out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
361
+ return out.float()
362
+
363
+
364
+ def attention_after_rope(q, k, v, pe):
365
+ q, k = apply_rope(q, k, pe)
366
+
367
+ from .attention import attention
368
+
369
+ x = attention(q, k, v, mode="torch")
370
+ return x
371
+
372
+
373
+ @torch.compile(mode="max-autotune-no-cudagraphs", dynamic=True)
374
+ def apply_rope(xq, xk, freqs_cis):
375
+ # swap the num_heads and seq_len dimensions back to the ordering the original function expects
376
+ xq = xq.transpose(1, 2) # [batch, num_heads, seq_len, head_dim]
377
+ xk = xk.transpose(1, 2)
378
+
379
+ # split head_dim into complex components (real and imaginary parts)
380
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
381
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
382
+
383
+ # apply the rotary position embedding (complex multiplication)
384
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
385
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
386
+
387
+ # restore the tensor shapes and transpose back to the target dimension order
388
+ xq_out = xq_out.reshape(*xq.shape).type_as(xq).transpose(1, 2)
389
+ xk_out = xk_out.reshape(*xk.shape).type_as(xk).transpose(1, 2)
390
+
391
+ return xq_out, xk_out
392
+
393
+
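A shape-level sketch of the RoPE helpers, with illustrative sizes: rope builds one 2x2 rotation matrix per (position, frequency pair), and EmbedND concatenates the per-axis rotations so apply_rope can broadcast them over the attention heads:

import torch
from modules.layers import EmbedND, rope

pos = torch.arange(12, dtype=torch.float32).unsqueeze(0)   # (1, 12) positions along one axis
r = rope(pos, dim=32, theta=10_000)
print(r.shape)   # (1, 12, 16, 2, 2): one rotation matrix per position and frequency pair

# EmbedND stacks several axes (e.g. text index, latent row, latent column) into one table
pe_embedder = EmbedND(dim=64, theta=10_000, axes_dim=[16, 24, 24])   # axes_dim sums to head_dim
ids = torch.zeros(1, 12, 3)                                          # (batch, seq, n_axes) position ids
pe = pe_embedder(ids)
print(pe.shape)   # (1, 1, 12, 32, 2, 2), broadcastable over the head dimension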
394
+ @torch.compile(mode="max-autotune-no-cudagraphs", dynamic=True)
395
+ def scale_add_residual(
396
+ x: torch.Tensor, scale: torch.Tensor, residual: torch.Tensor
397
+ ) -> torch.Tensor:
398
+ return x * scale + residual
399
+
400
+
401
+ @torch.compile(mode="max-autotune-no-cudagraphs", dynamic=True)
402
+ def layernorm_and_scale_shift(
403
+ x: torch.Tensor, scale: torch.Tensor, shift: torch.Tensor
404
+ ) -> torch.Tensor:
405
+ return torch.nn.functional.layer_norm(x, (x.size(-1),)) * (scale + 1) + shift
406
+
407
+
408
+ class SelfAttention(nn.Module):
409
+ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
410
+ super().__init__()
411
+ self.num_heads = num_heads
412
+ head_dim = dim // num_heads
413
+
414
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
415
+ self.norm = QKNorm(head_dim)
416
+ self.proj = nn.Linear(dim, dim)
417
+
418
+ def forward(self, x: Tensor, pe: Tensor) -> Tensor:
419
+ qkv = self.qkv(x)
420
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
421
+ q, k = self.norm(q, k, v)
422
+ x = attention_after_rope(q, k, v, pe=pe)
423
+ x = self.proj(x)
424
+ return x
425
+
426
+
427
+ @dataclass
428
+ class ModulationOut:
429
+ shift: Tensor
430
+ scale: Tensor
431
+ gate: Tensor
432
+
433
+
434
+ class RMSNorm(torch.nn.Module):
435
+ def __init__(self, dim: int):
436
+ super().__init__()
437
+ self.scale = nn.Parameter(torch.ones(dim))
438
+
439
+ # @staticmethod
440
+ # def rms_norm_fast(x, weight, eps):
441
+ # return LigerRMSNormFunction.apply(
442
+ # x,
443
+ # weight,
444
+ # eps,
445
+ # 0.0,
446
+ # "gemma",
447
+ # True,
448
+ # )
449
+
450
+ @staticmethod
451
+ def rms_norm(x, weight, eps):
452
+ x_dtype = x.dtype
453
+ x = x.float()
454
+ rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
455
+ return (x * rrms).to(dtype=x_dtype) * weight
456
+
457
+ def forward(self, x: Tensor):
458
+ # return self.rms_norm_fast(x, self.scale, 1e-6)
459
+ return self.rms_norm(x, self.scale, 1e-6)
460
+
461
+
462
+ class QKNorm(torch.nn.Module):
463
+ def __init__(self, dim: int):
464
+ super().__init__()
465
+ self.query_norm = RMSNorm(dim)
466
+ self.key_norm = RMSNorm(dim)
467
+
468
+ def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
469
+ q = self.query_norm(q)
470
+ k = self.key_norm(k)
471
+ return q.to(v), k.to(v)
472
+
473
+
474
+ class Modulation(nn.Module):
475
+ def __init__(self, dim: int, double: bool):
476
+ super().__init__()
477
+ self.is_double = double
478
+ self.multiplier = 6 if double else 3
479
+ self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
480
+
481
+ def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
482
+ out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(
483
+ self.multiplier, dim=-1
484
+ )
485
+
486
+ return (
487
+ ModulationOut(*out[:3]),
488
+ ModulationOut(*out[3:]) if self.is_double else None,
489
+ )
490
+
491
+
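A brief sketch of how the Modulation wrapper is consumed downstream (dimensions arbitrary): the conditioning vector is projected into three (single-stream) or six (double-stream) chunks that become the shift/scale/gate triplets applied around attention and the MLP:

import torch
from modules.layers import Modulation

dim = 128
mod = Modulation(dim, double=True)
vec = torch.randn(2, dim)        # pooled conditioning (timestep + global text vector)

mod1, mod2 = mod(vec)            # two ModulationOut triplets in the double-stream case
print(mod1.shift.shape, mod1.scale.shape, mod1.gate.shape)   # each (2, 1, 128)

x = torch.randn(2, 10, dim)      # a token sequence
x_mod = (1 + mod1.scale) * torch.nn.functional.layer_norm(x, (dim,)) + mod1.shift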
492
+ class DoubleStreamBlock(nn.Module):
493
+ def __init__(
494
+ self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False
495
+ ):
496
+ super().__init__()
497
+
498
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
499
+ self.num_heads = num_heads
500
+ self.hidden_size = hidden_size
501
+ self.img_mod = Modulation(hidden_size, double=True)
502
+ self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
503
+ self.img_attn = SelfAttention(
504
+ dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
505
+ )
506
+
507
+ self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
508
+ self.img_mlp = nn.Sequential(
509
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
510
+ nn.GELU(approximate="tanh"),
511
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
512
+ )
513
+
514
+ self.txt_mod = Modulation(hidden_size, double=True)
515
+ self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
516
+ self.txt_attn = SelfAttention(
517
+ dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
518
+ )
519
+
520
+ self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
521
+ self.txt_mlp = nn.Sequential(
522
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
523
+ nn.GELU(approximate="tanh"),
524
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
525
+ )
526
+
527
+ def forward(
528
+ self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor
529
+ ) -> tuple[Tensor, Tensor]:
530
+ img_mod1, img_mod2 = self.img_mod(vec)
531
+ txt_mod1, txt_mod2 = self.txt_mod(vec)
532
+
533
+ # prepare image for attention
534
+ img_modulated = self.img_norm1(img)
535
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
536
+ img_qkv = self.img_attn.qkv(img_modulated)
537
+ img_q, img_k, img_v = rearrange(
538
+ img_qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads
539
+ )
540
+ img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
541
+
542
+ # prepare txt for attention
543
+ txt_modulated = self.txt_norm1(txt)
544
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
545
+ txt_qkv = self.txt_attn.qkv(txt_modulated)
546
+ txt_q, txt_k, txt_v = rearrange(
547
+ txt_qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads
548
+ )
549
+ txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
550
+
551
+ # run actual attention
552
+ q = torch.cat((txt_q, img_q), dim=1)
553
+ k = torch.cat((txt_k, img_k), dim=1)
554
+ v = torch.cat((txt_v, img_v), dim=1)
555
+
556
+ attn = attention_after_rope(q, k, v, pe=pe)
557
+ txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
558
+
559
+ # calculate the img blocks
560
+ img = img + img_mod1.gate * self.img_attn.proj(img_attn)
561
+ img_mlp = self.img_mlp(
562
+ (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift
563
+ )
564
+ img = scale_add_residual(img_mlp, img_mod2.gate, img)
565
+
566
+ # calculate the txt bloks
567
+ txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
568
+ txt_mlp = self.txt_mlp(
569
+ (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift
570
+ )
571
+ txt = scale_add_residual(txt_mlp, txt_mod2.gate, txt)
572
+ return img, txt
573
+
574
+
575
+ class SingleStreamBlock(nn.Module):
576
+ """
577
+ A DiT block with parallel linear layers as described in
578
+ https://arxiv.org/abs/2302.05442 and an adapted modulation interface.
579
+ """
580
+
581
+ def __init__(
582
+ self,
583
+ hidden_size: int,
584
+ num_heads: int,
585
+ mlp_ratio: float = 4.0,
586
+ qk_scale: float | None = None,
587
+ ):
588
+ super().__init__()
589
+ self.hidden_dim = hidden_size
590
+ self.num_heads = num_heads
591
+ head_dim = hidden_size // num_heads
592
+ self.scale = qk_scale or head_dim**-0.5
593
+
594
+ self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
595
+ # qkv and mlp_in
596
+ self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
597
+ # proj and mlp_out
598
+ self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
599
+
600
+ self.norm = QKNorm(head_dim)
601
+
602
+ self.hidden_size = hidden_size
603
+ self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
604
+
605
+ self.mlp_act = nn.GELU(approximate="tanh")
606
+ self.modulation = Modulation(hidden_size, double=False)
607
+
608
+ def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
609
+ mod, _ = self.modulation(vec)
610
+ x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
611
+ qkv, mlp = torch.split(
612
+ self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1
613
+ )
614
+
615
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
616
+ q, k = self.norm(q, k, v)
617
+
618
+ # compute attention
619
+ attn = attention_after_rope(q, k, v, pe=pe)
620
+ # compute activation in mlp stream, cat again and run second linear layer
621
+ output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
622
+ return scale_add_residual(output, mod.gate, x)
623
+
624
+
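The "parallel linear layers" named in the docstring show up directly in the two fused projections; a constructor-only sketch with toy sizes makes the split visible (no forward pass is run here, since that would also need the rotary table and the attention helper):

from modules.layers import SingleStreamBlock

blk = SingleStreamBlock(hidden_size=256, num_heads=4)    # mlp_hidden_dim = 4.0 * 256 = 1024
print(blk.linear1)   # Linear(256 -> 3*256 + 1024): fused QKV projection and MLP input
print(blk.linear2)   # Linear(256 + 1024 -> 256): fused attention output and MLP output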
625
+ class LastLayer(nn.Module):
626
+ def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
627
+ super().__init__()
628
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
629
+ self.linear = nn.Linear(
630
+ hidden_size, patch_size * patch_size * out_channels, bias=True
631
+ )
632
+ self.adaLN_modulation = nn.Sequential(
633
+ nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)
634
+ )
635
+
636
+ def forward(self, x: Tensor, vec: Tensor) -> Tensor:
637
+ shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
638
+ x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
639
+ x = self.linear(x)
640
+ return x
modules/model_edit.py ADDED
@@ -0,0 +1,143 @@
1
+ import math
2
+ from dataclasses import dataclass
3
+
4
+ import numpy as np
5
+ import torch
6
+ from torch import Tensor, nn
7
+
8
+ from .connector_edit import Qwen2Connector
9
+ from .layers import DoubleStreamBlock, EmbedND, LastLayer, MLPEmbedder, SingleStreamBlock
10
+
11
+
12
+ @dataclass
13
+ class Step1XParams:
14
+ in_channels: int
15
+ out_channels: int
16
+ vec_in_dim: int
17
+ context_in_dim: int
18
+ hidden_size: int
19
+ mlp_ratio: float
20
+ num_heads: int
21
+ depth: int
22
+ depth_single_blocks: int
23
+ axes_dim: list[int]
24
+ theta: int
25
+ qkv_bias: bool
26
+
27
+
28
+ class Step1XEdit(nn.Module):
29
+ """
30
+ Transformer model for flow matching on sequences.
31
+ """
32
+
33
+ def __init__(self, params: Step1XParams):
34
+ super().__init__()
35
+
36
+ self.params = params
37
+ self.in_channels = params.in_channels
38
+ self.out_channels = params.out_channels
39
+ if params.hidden_size % params.num_heads != 0:
40
+ raise ValueError(
41
+ f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
42
+ )
43
+ pe_dim = params.hidden_size // params.num_heads
44
+ if sum(params.axes_dim) != pe_dim:
45
+ raise ValueError(
46
+ f"Got {params.axes_dim} but expected positional dim {pe_dim}"
47
+ )
48
+ self.hidden_size = params.hidden_size
49
+ self.num_heads = params.num_heads
50
+ self.pe_embedder = EmbedND(
51
+ dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim
52
+ )
53
+ self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
54
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
55
+ self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
56
+ self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)
57
+
58
+ self.double_blocks = nn.ModuleList(
59
+ [
60
+ DoubleStreamBlock(
61
+ self.hidden_size,
62
+ self.num_heads,
63
+ mlp_ratio=params.mlp_ratio,
64
+ qkv_bias=params.qkv_bias,
65
+ )
66
+ for _ in range(params.depth)
67
+ ]
68
+ )
69
+
70
+ self.single_blocks = nn.ModuleList(
71
+ [
72
+ SingleStreamBlock(
73
+ self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio
74
+ )
75
+ for _ in range(params.depth_single_blocks)
76
+ ]
77
+ )
78
+
79
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
80
+
81
+ self.connector = Qwen2Connector()
82
+
83
+ @staticmethod
84
+ def timestep_embedding(
85
+ t: Tensor, dim, max_period=10000, time_factor: float = 1000.0
86
+ ):
87
+ """
88
+ Create sinusoidal timestep embeddings.
89
+ :param t: a 1-D Tensor of N indices, one per batch element.
90
+ These may be fractional.
91
+ :param dim: the dimension of the output.
92
+ :param max_period: controls the minimum frequency of the embeddings.
93
+ :return: an (N, D) Tensor of positional embeddings.
94
+ """
95
+ t = time_factor * t
96
+ half = dim // 2
97
+ freqs = torch.exp(
98
+ -math.log(max_period)
99
+ * torch.arange(start=0, end=half, dtype=torch.float32)
100
+ / half
101
+ ).to(t.device)
102
+
103
+ args = t[:, None].float() * freqs[None]
104
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
105
+ if dim % 2:
106
+ embedding = torch.cat(
107
+ [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
108
+ )
109
+ if torch.is_floating_point(t):
110
+ embedding = embedding.to(t)
111
+ return embedding
112
+
113
+ def forward(
114
+ self,
115
+ img: Tensor,
116
+ img_ids: Tensor,
117
+ txt: Tensor,
118
+ txt_ids: Tensor,
119
+ timesteps: Tensor,
120
+ y: Tensor,
121
+ ) -> Tensor:
122
+ if img.ndim != 3 or txt.ndim != 3:
123
+ raise ValueError("Input img and txt tensors must have 3 dimensions.")
124
+
125
+ img = self.img_in(img)
126
+ vec = self.time_in(self.timestep_embedding(timesteps, 256))
127
+
128
+ vec = vec + self.vector_in(y)
129
+ txt = self.txt_in(txt)
130
+
131
+ ids = torch.cat((txt_ids, img_ids), dim=1)
132
+ pe = self.pe_embedder(ids)
133
+
134
+ for block in self.double_blocks:
135
+ img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
136
+
137
+ img = torch.cat((txt, img), 1)
138
+ for block in self.single_blocks:
139
+ img = block(img, vec=vec, pe=pe)
140
+ img = img[:, txt.shape[1] :, ...]
141
+
142
+ img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
143
+ return img
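The dataclass above only names the hyperparameters; the values below are an illustrative, Flux-style guess used to show how the consistency checks in __init__ fit together, and they are not the released Step1X-Edit configuration. The two widths this commit does pin are vec_in_dim (must match the 768-dim global_out of Qwen2Connector) and context_in_dim (must match the 4096-dim refined token embeddings):

from modules.model_edit import Step1XEdit, Step1XParams

params = Step1XParams(
    in_channels=64,              # e.g. 16 latent channels packed into 2x2 patches (illustrative)
    out_channels=64,
    vec_in_dim=768,              # width of Qwen2Connector.global_proj_out
    context_in_dim=4096,         # width of the refined LLM tokens
    hidden_size=768,             # toy size; must be divisible by num_heads
    mlp_ratio=4.0,
    num_heads=12,                # head_dim = 768 / 12 = 64
    depth=2,                     # toy double-stream depth
    depth_single_blocks=4,       # toy single-stream depth
    axes_dim=[16, 24, 24],       # must sum to head_dim (64)
    theta=10_000,
    qkv_bias=True,
)
model = Step1XEdit(params)       # note: this also builds the full Qwen2Connector defined above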
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ einops
2
+ transformers==4.49.0
3
+ qwen_vl_utils==0.0.10
4
+ safetensors==0.4.5
5
+ pillow==11.1.0
6
+ huggingface_hub
7
+ transformers
8
+ diffusers
9
+ peft
10
+ opencv-python
11
+ sentencepiece
12
+ boto3
13
+ torchvision
sampling.py ADDED
@@ -0,0 +1,47 @@
1
+ import math
2
+ from collections.abc import Callable
3
+
4
+ import torch
5
+ from torch import Tensor
6
+
7
+
8
+ def get_noise(num_samples: int, height: int, width: int, device: torch.device, dtype: torch.dtype, seed: int):
9
+ return torch.randn(
10
+ num_samples,
11
+ 16,
12
+ # allow for packing
13
+ 2 * math.ceil(height / 16),
14
+ 2 * math.ceil(width / 16),
15
+ device=device,
16
+ dtype=dtype,
17
+ generator=torch.Generator(device=device).manual_seed(seed),
18
+ )
19
+
20
+
21
+ def time_shift(mu: float, sigma: float, t: Tensor):
22
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
23
+
24
+
25
+ def get_lin_function(x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15) -> Callable[[float], float]:
26
+ m = (y2 - y1) / (x2 - x1)
27
+ b = y1 - m * x1
28
+ return lambda x: m * x + b
29
+
30
+
31
+ def get_schedule(
32
+ num_steps: int,
33
+ image_seq_len: int,
34
+ base_shift: float = 0.5,
35
+ max_shift: float = 1.15,
36
+ shift: bool = True,
37
+ ) -> list[float]:
38
+ # extra step for zero
39
+ timesteps = torch.linspace(1, 0, num_steps + 1)
40
+
41
+ # shifting the schedule to favor high timesteps for higher signal images
42
+ if shift:
43
+ # estimate mu based on linear estimation between two points
44
+ mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
45
+ timesteps = time_shift(mu, 1.0, timesteps)
46
+
47
+ return timesteps.tolist()
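A short sketch tying the helpers above together, with made-up sizes: get_noise draws a 16-channel latent at 1/8 resolution (rounded so it can later be packed into 2x2 patches), and get_schedule returns the shifted timestep list whose shift grows with the image token count:

import torch
from sampling import get_noise, get_schedule

height, width = 512, 512
noise = get_noise(1, height, width, device=torch.device("cpu"),
                  dtype=torch.float32, seed=42)
print(noise.shape)   # (1, 16, 64, 64) for a 512x512 image

# packed 2x2 latent patches -> image_seq_len = (64 / 2) * (64 / 2) = 1024
timesteps = get_schedule(num_steps=28, image_seq_len=1024)
print(len(timesteps), timesteps[0], timesteps[-1])   # 29 values running from 1.0 down to 0.0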