Barak1 committed on
Commit d48db0f · 2 Parent(s): 65ace7f dc77641

Merge branch 'main' of https://huggingface.co/spaces/barakmeiri/RNRI

app.py CHANGED
@@ -3,38 +3,50 @@ import numpy as np
  import random
  from diffusers import DiffusionPipeline
  import torch

  device = "cuda" if torch.cuda.is_available() else "cpu"

- if torch.cuda.is_available():
-     torch.cuda.max_memory_allocated(device=device)
-     pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16", use_safetensors=True)
-     pipe.enable_xformers_memory_efficient_attention()
-     pipe = pipe.to(device)
- else:
-     pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", use_safetensors=True)
-     pipe = pipe.to(device)

  MAX_SEED = np.iinfo(np.int32).max
  MAX_IMAGE_SIZE = 1024

- def infer(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps):

-     if randomize_seed:
-         seed = random.randint(0, MAX_SEED)
-
-     generator = torch.Generator().manual_seed(seed)
-
-     image = pipe(
-         prompt = prompt,
-         negative_prompt = negative_prompt,
-         guidance_scale = guidance_scale,
-         num_inference_steps = num_inference_steps,
-         width = width,
-         height = height,
-         generator = generator
-     ).images[0]

      return image

  examples = [
@@ -56,63 +68,38 @@ else:
      power_device = "CPU"

  with gr.Blocks(css=css) as demo:
-
      with gr.Column(elem_id="col-container"):
-         gr.Markdown(f"""
-         # Text-to-Image Gradio Template
-         Currently running on {power_device}.
-         """)

          with gr.Row():

-             prompt = gr.Text(
-                 label="Prompt",
                  show_label=False,
                  max_lines=1,
-                 placeholder="Enter your prompt",
                  container=False,
              )
-
-             run_button = gr.Button("Run", scale=0)

-         result = gr.Image(label="Result", show_label=False)
-
-         with gr.Accordion("Advanced Settings", open=False):
-
-             negative_prompt = gr.Text(
-                 label="Negative prompt",
                  max_lines=1,
-                 placeholder="Enter a negative prompt",
-                 visible=False,
-             )
-
-             seed = gr.Slider(
-                 label="Seed",
-                 minimum=0,
-                 maximum=MAX_SEED,
-                 step=1,
-                 value=0,
              )
-
-             randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-
-             with gr.Row():
-
-                 width = gr.Slider(
-                     label="Width",
-                     minimum=256,
-                     maximum=MAX_IMAGE_SIZE,
-                     step=32,
-                     value=512,
-                 )
-
-                 height = gr.Slider(
-                     label="Height",
-                     minimum=256,
-                     maximum=MAX_IMAGE_SIZE,
-                     step=32,
-                     value=512,
-                 )

              with gr.Row():

@@ -121,25 +108,33 @@ with gr.Blocks(css=css) as demo:
                      minimum=0.0,
                      maximum=10.0,
                      step=0.1,
-                     value=0.0,
                  )

                  num_inference_steps = gr.Slider(
-                     label="Number of inference steps",
                      minimum=1,
                      maximum=12,
                      step=1,
-                     value=2,
                  )

-     gr.Examples(
-         examples = examples,
-         inputs = [prompt]
-     )

      run_button.click(
          fn = infer,
-         inputs = [prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps],
          outputs = [result]
      )

 
  import random
  from diffusers import DiffusionPipeline
  import torch
+ from src.euler_scheduler import MyEulerAncestralDiscreteScheduler
+ from diffusers.pipelines.auto_pipeline import AutoPipelineForImage2Image
+ from src.sdxl_inversion_pipeline import SDXLDDIMPipeline
+ from src.config import RunConfig
+ from src.editor import ImageEditorDemo

  device = "cuda" if torch.cuda.is_available() else "cpu"

+ scheduler_class = MyEulerAncestralDiscreteScheduler
+
+ pipe_inversion = SDXLDDIMPipeline.from_pretrained("stabilityai/sdxl-turbo", use_safetensors=True).to(device)
+ pipe_inference = AutoPipelineForImage2Image.from_pretrained("stabilityai/sdxl-turbo", use_safetensors=True).to(device)
+ pipe_inference.scheduler = scheduler_class.from_config(pipe_inference.scheduler.config)
+ pipe_inversion.scheduler = scheduler_class.from_config(pipe_inversion.scheduler.config)
+ pipe_inversion.scheduler_inference = scheduler_class.from_config(pipe_inference.scheduler.config)
+
+ # if torch.cuda.is_available():
+ #     torch.cuda.max_memory_allocated(device=device)
+ #     pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16", use_safetensors=True)
+ #     pipe.enable_xformers_memory_efficient_attention()
+ #     pipe = pipe.to(device)
+ # else:
+ #     pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", use_safetensors=True)
+ #     pipe = pipe.to(device)

  MAX_SEED = np.iinfo(np.int32).max
  MAX_IMAGE_SIZE = 1024

+ def infer(input_image, description_prompt, target_prompt, guidance_scale, num_inference_steps=4, num_inversion_steps=4, inversion_max_step=0.6):
+     config = RunConfig(num_inference_steps=num_inference_steps,
+                        num_inversion_steps=num_inversion_steps,
+                        guidance_scale=guidance_scale,
+                        inversion_max_step=inversion_max_step)
+
+     editor = ImageEditorDemo(pipe_inversion, pipe_inference, input_image, description_prompt, config)
+
+     image = editor.edit(target_prompt)
      return image

  examples = [

      power_device = "CPU"

  with gr.Blocks(css=css) as demo:
+
+     gr.Markdown(f"""
+     # RNRI brief and links. Running on device: {power_device}.
+     """)
      with gr.Column(elem_id="col-container"):
+
+         with gr.Row():
+             input_image = gr.Image(label="Input image", sources=['upload', 'webcam', 'clipboard'], type="pil")

          with gr.Row():

+             description_prompt = gr.Text(
+                 label="Image description",
                  show_label=False,
                  max_lines=1,
+                 placeholder="Enter your image description",
                  container=False,
              )
+
+         with gr.Row():
+
+             target_prompt = gr.Text(
+                 label="Edit prompt",
+                 show_label=False,
                  max_lines=1,
+                 placeholder="Enter your edit prompt",
+                 container=False,
              )
+
+         with gr.Accordion("Advanced Settings", open=False):

              with gr.Row():

                      minimum=0.0,
                      maximum=10.0,
                      step=0.1,
+                     value=1.2,
                  )

                  num_inference_steps = gr.Slider(
+                     label="Number of RNRI iterations",
                      minimum=1,
                      maximum=12,
                      step=1,
+                     value=4,
                  )
+
+         with gr.Row():
+             run_button = gr.Button("Edit", scale=0)
+
+     with gr.Column(elem_id="col-container"):
+
+         result = gr.Image(label="Result", show_label=False)
+
+     # gr.Examples(
+     #     examples = examples,
+     #     inputs = [prompt]
+     # )

      run_button.click(
          fn = infer,
+         inputs = [input_image, description_prompt, target_prompt, guidance_scale, num_inference_steps, num_inference_steps],
          outputs = [result]
      )

requirements.txt CHANGED
@@ -1,6 +1,8 @@
- accelerate
- diffusers
  invisible_watermark
- torch
- transformers
- xformers
+ accelerate==0.25.0
+ diffusers==0.24.0
  invisible_watermark
+ torch==2.2.0
+ transformers==4.32.1
+ xformers
+ torchvision==0.17.0
+ pyrallis==0.3.1
src/config.py ADDED
@@ -0,0 +1,17 @@
+ # Code is based on ReNoise https://github.com/garibida/ReNoise-Inversion
+
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class RunConfig:
+     num_inference_steps: int = 4
+
+     num_inversion_steps: int = 100
+
+     guidance_scale: float = 0.0
+
+     inversion_max_step: float = 1.0
+
+     def __post_init__(self):
+         pass
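A minimal usage sketch; the specific values shown are the ones the Space passes in from its sliders, not extra defaults of the dataclass:

    config = RunConfig(num_inference_steps=4,     # denoising steps for the edit pass
                       num_inversion_steps=4,     # RNRI inversion steps
                       guidance_scale=1.2,
                       inversion_max_step=0.6)    # invert only part of the noise schedule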
src/editor.py ADDED
@@ -0,0 +1,88 @@
+ import torch
+ from src.config import RunConfig
+ import PIL
+ from src.euler_scheduler import MyEulerAncestralDiscreteScheduler
+ from diffusers.pipelines.auto_pipeline import AutoPipelineForImage2Image
+ from src.sdxl_inversion_pipeline import SDXLDDIMPipeline
+
+ from diffusers.utils.torch_utils import randn_tensor
+
+
+ def inversion_callback(pipe, step, timestep, callback_kwargs):
+     return callback_kwargs
+
+ def inference_callback(pipe, step, timestep, callback_kwargs):
+     return callback_kwargs
+
+ def center_crop(im):
+     width, height = im.size  # Get dimensions
+     min_dim = min(width, height)
+     left = (width - min_dim) / 2
+     top = (height - min_dim) / 2
+     right = (width + min_dim) / 2
+     bottom = (height + min_dim) / 2
+
+     # Crop the center of the image
+     im = im.crop((left, top, right, bottom))
+     return im
+
+
+ def load_im_into_format_from_path(im_path):
+     if isinstance(im_path, str):
+         return center_crop(PIL.Image.open(im_path)).resize((512, 512))
+     else:
+         return center_crop(im_path).resize((512, 512))
+
+
+ class ImageEditorDemo:
+     def __init__(self, pipe_inversion, pipe_inference, input_image, description_prompt, cfg):
+         self.pipe_inversion = pipe_inversion
+         self.pipe_inference = pipe_inference
+         self.original_image = load_im_into_format_from_path(input_image).convert("RGB")
+         self.load_image = True
+         g_cpu = torch.Generator().manual_seed(7865)
+         img_size = (512, 512)
+         VQAE_SCALE = 8
+         latents_size = (1, 4, img_size[0] // VQAE_SCALE, img_size[1] // VQAE_SCALE)
+         noise = [randn_tensor(latents_size, dtype=torch.float16, device=torch.device("cuda:0"), generator=g_cpu) for i
+                  in range(cfg.num_inversion_steps)]
+         pipe_inversion.scheduler.set_noise_list(noise)
+         pipe_inference.scheduler.set_noise_list(noise)
+         pipe_inversion.scheduler_inference.set_noise_list(noise)
+         pipe_inversion.set_progress_bar_config(disable=True)
+         pipe_inference.set_progress_bar_config(disable=True)
+         self.cfg = cfg
+         self.pipe_inversion.cfg = cfg
+         self.pipe_inference.cfg = cfg
+         self.inv_hp = [2, 0.1, 0.2]
+         self.edit_cfg = 1.2
+
+         self.pipe_inference.to("cuda")
+         self.pipe_inversion.to("cuda")
+
+         self.last_latent = self.invert(self.original_image, description_prompt)
+         self.original_latent = self.last_latent
+
+     def invert(self, init_image, base_prompt):
+         res = self.pipe_inversion(prompt=base_prompt,
+                                   num_inversion_steps=self.cfg.num_inversion_steps,
+                                   num_inference_steps=self.cfg.num_inference_steps,
+                                   image=init_image,
+                                   guidance_scale=self.cfg.guidance_scale,
+                                   callback_on_step_end=inversion_callback,
+                                   strength=self.cfg.inversion_max_step,
+                                   denoising_start=1.0 - self.cfg.inversion_max_step,
+                                   inv_hp=self.inv_hp)[0][0]
+         return res
+
+     def edit(self, target_prompt):
+         image = self.pipe_inference(prompt=target_prompt,
+                                     num_inference_steps=self.cfg.num_inference_steps,
+                                     negative_prompt="",
+                                     callback_on_step_end=inference_callback,
+                                     image=self.last_latent,
+                                     strength=self.cfg.inversion_max_step,
+                                     denoising_start=1.0 - self.cfg.inversion_max_step,
+                                     guidance_scale=self.edit_cfg).images[0]
+         return image
+
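As a rough illustration (assuming pipe_inversion and pipe_inference are prepared as in app.py; the file name and prompts are placeholders), the class front-loads the expensive inversion in __init__ so edit() can be called repeatedly against the cached latent:

    from src.config import RunConfig
    from src.editor import ImageEditorDemo

    config = RunConfig(num_inference_steps=4, num_inversion_steps=4,
                       guidance_scale=1.2, inversion_max_step=0.6)
    editor = ImageEditorDemo(pipe_inversion, pipe_inference,
                             "cat.jpg", "a photo of a cat", config)   # inversion runs once here
    dog = editor.edit("a photo of a dog")        # only the denoising pass runs
    tiger = editor.edit("a photo of a tiger")    # reuses the same inverted latent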
src/euler_scheduler.py ADDED
@@ -0,0 +1,584 @@
1
+ # Code is based on ReNoise https://github.com/garibida/ReNoise-Inversion
2
+
3
+ from diffusers import EulerAncestralDiscreteScheduler
4
+ from diffusers.utils import BaseOutput
5
+ import torch
6
+ from typing import List, Optional, Tuple, Union
7
+ import numpy as np
8
+
9
+ from src.eunms import Epsilon_Update_Type
10
+
11
+ class EulerAncestralDiscreteSchedulerOutput(BaseOutput):
12
+ """
13
+ Output class for the scheduler's `step` function output.
14
+
15
+ Args:
16
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
17
+ Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
18
+ denoising loop.
19
+ pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
20
+ The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
21
+ `pred_original_sample` can be used to preview progress or for guidance.
22
+ """
23
+
24
+ prev_sample: torch.FloatTensor
25
+ pred_original_sample: Optional[torch.FloatTensor] = None
26
+
27
+ class MyEulerAncestralDiscreteScheduler(EulerAncestralDiscreteScheduler):
28
+ def set_noise_list(self, noise_list):
29
+ self.noise_list = noise_list
30
+
31
+ def get_noise_to_remove(self):
32
+ sigma_from = self.sigmas[self.step_index]
33
+ sigma_to = self.sigmas[self.step_index + 1]
34
+ sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
35
+
36
+ return self.noise_list[self.step_index] * sigma_up\
37
+
38
+ def scale_model_input(
39
+ self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor]
40
+ ) -> torch.FloatTensor:
41
+ """
42
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
43
+ current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
44
+
45
+ Args:
46
+ sample (`torch.FloatTensor`):
47
+ The input sample.
48
+ timestep (`int`, *optional*):
49
+ The current timestep in the diffusion chain.
50
+
51
+ Returns:
52
+ `torch.FloatTensor`:
53
+ A scaled input sample.
54
+ """
55
+
56
+ self._init_step_index(timestep.view((1)))
57
+ return EulerAncestralDiscreteScheduler.scale_model_input(self, sample, timestep)
58
+
59
+
60
+ def step(
61
+ self,
62
+ model_output: torch.FloatTensor,
63
+ timestep: Union[float, torch.FloatTensor],
64
+ sample: torch.FloatTensor,
65
+ generator: Optional[torch.Generator] = None,
66
+ return_dict: bool = True,
67
+ ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]:
68
+ """
69
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
70
+ process from the learned model outputs (most often the predicted noise).
71
+
72
+ Args:
73
+ model_output (`torch.FloatTensor`):
74
+ The direct output from learned diffusion model.
75
+ timestep (`float`):
76
+ The current discrete timestep in the diffusion chain.
77
+ sample (`torch.FloatTensor`):
78
+ A current instance of a sample created by the diffusion process.
79
+ generator (`torch.Generator`, *optional*):
80
+ A random number generator.
81
+ return_dict (`bool`):
82
+ Whether or not to return a
83
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or tuple.
84
+
85
+ Returns:
86
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or `tuple`:
87
+ If return_dict is `True`,
88
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] is returned,
89
+ otherwise a tuple is returned where the first element is the sample tensor.
90
+
91
+ """
92
+
93
+ if (
94
+ isinstance(timestep, int)
95
+ or isinstance(timestep, torch.IntTensor)
96
+ or isinstance(timestep, torch.LongTensor)
97
+ ):
98
+ raise ValueError(
99
+ (
100
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
101
+ " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
102
+ " one of the `scheduler.timesteps` as a timestep."
103
+ ),
104
+ )
105
+
106
+ if not self.is_scale_input_called:
107
+ logger.warning(
108
+ "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
109
+ "See `StableDiffusionPipeline` for a usage example."
110
+ )
111
+
112
+ self._init_step_index(timestep.view((1)))
113
+
114
+ sigma = self.sigmas[self.step_index]
115
+
116
+ # Upcast to avoid precision issues when computing prev_sample
117
+ sample = sample.to(torch.float32)
118
+
119
+ # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
120
+ if self.config.prediction_type == "epsilon":
121
+ pred_original_sample = sample - sigma * model_output
122
+ elif self.config.prediction_type == "v_prediction":
123
+ # * c_out + input * c_skip
124
+ pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
125
+ elif self.config.prediction_type == "sample":
126
+ raise NotImplementedError("prediction_type not implemented yet: sample")
127
+ else:
128
+ raise ValueError(
129
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
130
+ )
131
+
132
+ sigma_from = self.sigmas[self.step_index]
133
+ sigma_to = self.sigmas[self.step_index + 1]
134
+ sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
135
+ sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
136
+
137
+ # 2. Convert to an ODE derivative
138
+ # derivative = (sample - pred_original_sample) / sigma
139
+ derivative = model_output
140
+
141
+ dt = sigma_down - sigma
142
+
143
+ prev_sample = sample + derivative * dt
144
+
145
+ device = model_output.device
146
+ # noise = randn_tensor(model_output.shape, dtype=model_output.dtype, device=device, generator=generator)
147
+ # prev_sample = prev_sample + noise * sigma_up
148
+
149
+ prev_sample = prev_sample + self.noise_list[self.step_index] * sigma_up
150
+
151
+ # Cast sample back to model compatible dtype
152
+ prev_sample = prev_sample.to(model_output.dtype)
153
+
154
+ # upon completion increase step index by one
155
+ self._step_index += 1
156
+
157
+ if not return_dict:
158
+ return (prev_sample,)
159
+
160
+ return EulerAncestralDiscreteSchedulerOutput(
161
+ prev_sample=prev_sample, pred_original_sample=pred_original_sample
162
+ )
163
+
164
+ def step_and_update_noise(
165
+ self,
166
+ model_output: torch.FloatTensor,
167
+ timestep: Union[float, torch.FloatTensor],
168
+ sample: torch.FloatTensor,
169
+ expected_prev_sample: torch.FloatTensor,
170
+ update_epsilon_type=Epsilon_Update_Type.OVERRIDE,
171
+ generator: Optional[torch.Generator] = None,
172
+ return_dict: bool = True,
173
+ ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]:
174
+ """
175
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
176
+ process from the learned model outputs (most often the predicted noise).
177
+
178
+ Args:
179
+ model_output (`torch.FloatTensor`):
180
+ The direct output from learned diffusion model.
181
+ timestep (`float`):
182
+ The current discrete timestep in the diffusion chain.
183
+ sample (`torch.FloatTensor`):
184
+ A current instance of a sample created by the diffusion process.
185
+ generator (`torch.Generator`, *optional*):
186
+ A random number generator.
187
+ return_dict (`bool`):
188
+ Whether or not to return a
189
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or tuple.
190
+
191
+ Returns:
192
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or `tuple`:
193
+ If return_dict is `True`,
194
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] is returned,
195
+ otherwise a tuple is returned where the first element is the sample tensor.
196
+
197
+ """
198
+
199
+ if (
200
+ isinstance(timestep, int)
201
+ or isinstance(timestep, torch.IntTensor)
202
+ or isinstance(timestep, torch.LongTensor)
203
+ ):
204
+ raise ValueError(
205
+ (
206
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
207
+ " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
208
+ " one of the `scheduler.timesteps` as a timestep."
209
+ ),
210
+ )
211
+
212
+ if not self.is_scale_input_called:
213
+ logger.warning(
214
+ "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
215
+ "See `StableDiffusionPipeline` for a usage example."
216
+ )
217
+
218
+ self._init_step_index(timestep.view((1)))
219
+
220
+ sigma = self.sigmas[self.step_index]
221
+
222
+ # Upcast to avoid precision issues when computing prev_sample
223
+ sample = sample.to(torch.float32)
224
+
225
+ # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
226
+ if self.config.prediction_type == "epsilon":
227
+ pred_original_sample = sample - sigma * model_output
228
+ elif self.config.prediction_type == "v_prediction":
229
+ # * c_out + input * c_skip
230
+ pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
231
+ elif self.config.prediction_type == "sample":
232
+ raise NotImplementedError("prediction_type not implemented yet: sample")
233
+ else:
234
+ raise ValueError(
235
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
236
+ )
237
+
238
+ sigma_from = self.sigmas[self.step_index]
239
+ sigma_to = self.sigmas[self.step_index + 1]
240
+ sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
241
+ sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
242
+
243
+ # 2. Convert to an ODE derivative
244
+ # derivative = (sample - pred_original_sample) / sigma
245
+ derivative = model_output
246
+
247
+ dt = sigma_down - sigma
248
+
249
+ prev_sample = sample + derivative * dt
250
+
251
+ device = model_output.device
252
+ # noise = randn_tensor(model_output.shape, dtype=model_output.dtype, device=device, generator=generator)
253
+ # prev_sample = prev_sample + noise * sigma_up
254
+
255
+ if sigma_up > 0:
256
+ req_noise = (expected_prev_sample - prev_sample) / sigma_up
257
+ if update_epsilon_type == Epsilon_Update_Type.OVERRIDE:
258
+ self.noise_list[self.step_index] = req_noise
259
+ else:
260
+ for i in range(10):
261
+ n = torch.autograd.Variable(self.noise_list[self.step_index].detach().clone(), requires_grad=True)
262
+ loss = torch.norm(n - req_noise.detach())
263
+ loss.backward()
264
+ self.noise_list[self.step_index] -= n.grad.detach() * 1.8
265
+
266
+
267
+ prev_sample = prev_sample + self.noise_list[self.step_index] * sigma_up
268
+
269
+ # Cast sample back to model compatible dtype
270
+ prev_sample = prev_sample.to(model_output.dtype)
271
+
272
+ # upon completion increase step index by one
273
+ self._step_index += 1
274
+
275
+ if not return_dict:
276
+ return (prev_sample,)
277
+
278
+ return EulerAncestralDiscreteSchedulerOutput(
279
+ prev_sample=prev_sample, pred_original_sample=pred_original_sample
280
+ )
281
+
282
+ def inv_step(
283
+ self,
284
+ model_output: torch.FloatTensor,
285
+ timestep: Union[float, torch.FloatTensor],
286
+ sample: torch.FloatTensor,
287
+ generator: Optional[torch.Generator] = None,
288
+ return_dict: bool = True,
289
+ ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]:
290
+ """
291
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
292
+ process from the learned model outputs (most often the predicted noise).
293
+
294
+ Args:
295
+ model_output (`torch.FloatTensor`):
296
+ The direct output from learned diffusion model.
297
+ timestep (`float`):
298
+ The current discrete timestep in the diffusion chain.
299
+ sample (`torch.FloatTensor`):
300
+ A current instance of a sample created by the diffusion process.
301
+ generator (`torch.Generator`, *optional*):
302
+ A random number generator.
303
+ return_dict (`bool`):
304
+ Whether or not to return a
305
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or tuple.
306
+
307
+ Returns:
308
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or `tuple`:
309
+ If return_dict is `True`,
310
+ [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] is returned,
311
+ otherwise a tuple is returned where the first element is the sample tensor.
312
+
313
+ """
314
+
315
+ if (
316
+ isinstance(timestep, int)
317
+ or isinstance(timestep, torch.IntTensor)
318
+ or isinstance(timestep, torch.LongTensor)
319
+ ):
320
+ raise ValueError(
321
+ (
322
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
323
+ " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
324
+ " one of the `scheduler.timesteps` as a timestep."
325
+ ),
326
+ )
327
+
328
+ if not self.is_scale_input_called:
329
+ logger.warning(
330
+ "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
331
+ "See `StableDiffusionPipeline` for a usage example."
332
+ )
333
+
334
+ self._init_step_index(timestep.view((1)))
335
+
336
+ sigma = self.sigmas[self.step_index]
337
+
338
+ # Upcast to avoid precision issues when computing prev_sample
339
+ sample = sample.to(torch.float32)
340
+
341
+ # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
342
+ if self.config.prediction_type == "epsilon":
343
+ pred_original_sample = sample - sigma * model_output
344
+ elif self.config.prediction_type == "v_prediction":
345
+ # * c_out + input * c_skip
346
+ pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
347
+ elif self.config.prediction_type == "sample":
348
+ raise NotImplementedError("prediction_type not implemented yet: sample")
349
+ else:
350
+ raise ValueError(
351
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
352
+ )
353
+
354
+ sigma_from = self.sigmas[self.step_index]
355
+ sigma_to = self.sigmas[self.step_index+1]
356
+ # sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
357
+ sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2).abs() / sigma_from**2) ** 0.5
358
+ # sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
359
+ sigma_down = sigma_to**2 / sigma_from
360
+
361
+ # 2. Convert to an ODE derivative
362
+ # derivative = (sample - pred_original_sample) / sigma
363
+ derivative = model_output
364
+
365
+ dt = sigma_down - sigma
366
+ # dt = sigma_down - sigma_from
367
+
368
+ prev_sample = sample - derivative * dt
369
+
370
+ device = model_output.device
371
+ # noise = randn_tensor(model_output.shape, dtype=model_output.dtype, device=device, generator=generator)
372
+ # prev_sample = prev_sample + noise * sigma_up
373
+
374
+ prev_sample = prev_sample - self.noise_list[self.step_index] * sigma_up
375
+
376
+ # Cast sample back to model compatible dtype
377
+ prev_sample = prev_sample.to(model_output.dtype)
378
+
379
+ # upon completion increase step index by one
380
+ self._step_index += 1
381
+
382
+ if not return_dict:
383
+ return (prev_sample,)
384
+
385
+ return EulerAncestralDiscreteSchedulerOutput(
386
+ prev_sample=prev_sample, pred_original_sample=pred_original_sample
387
+ )
388
+
389
+ def get_all_sigmas(self) -> torch.FloatTensor:
390
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
391
+ sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
392
+ return torch.from_numpy(sigmas)
393
+
394
+ def add_noise_off_schedule(
395
+ self,
396
+ original_samples: torch.FloatTensor,
397
+ noise: torch.FloatTensor,
398
+ timesteps: torch.FloatTensor,
399
+ ) -> torch.FloatTensor:
400
+ # Make sure sigmas and timesteps have the same device and dtype as original_samples
401
+ sigmas = self.get_all_sigmas()
402
+ sigmas = sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
403
+ if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
404
+ # mps does not support float64
405
+ timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
406
+ else:
407
+ timesteps = timesteps.to(original_samples.device)
408
+
409
+ step_indices = 1000 - int(timesteps.item())
410
+
411
+ sigma = sigmas[step_indices].flatten()
412
+ while len(sigma.shape) < len(original_samples.shape):
413
+ sigma = sigma.unsqueeze(-1)
414
+
415
+ noisy_samples = original_samples + noise * sigma
416
+ return noisy_samples
417
+
418
+ # def update_noise_for_friendly_inversion(
419
+ # self,
420
+ # model_output: torch.FloatTensor,
421
+ # timestep: Union[float, torch.FloatTensor],
422
+ # z_t: torch.FloatTensor,
423
+ # z_tp1: torch.FloatTensor,
424
+ # return_dict: bool = True,
425
+ # ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]:
426
+ # if (
427
+ # isinstance(timestep, int)
428
+ # or isinstance(timestep, torch.IntTensor)
429
+ # or isinstance(timestep, torch.LongTensor)
430
+ # ):
431
+ # raise ValueError(
432
+ # (
433
+ # "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
434
+ # " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
435
+ # " one of the `scheduler.timesteps` as a timestep."
436
+ # ),
437
+ # )
438
+
439
+ # if not self.is_scale_input_called:
440
+ # logger.warning(
441
+ # "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
442
+ # "See `StableDiffusionPipeline` for a usage example."
443
+ # )
444
+
445
+ # self._init_step_index(timestep.view((1)))
446
+
447
+ # sigma = self.sigmas[self.step_index]
448
+
449
+ # sigma_from = self.sigmas[self.step_index]
450
+ # sigma_to = self.sigmas[self.step_index+1]
451
+ # # sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
452
+ # sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2).abs() / sigma_from**2) ** 0.5
453
+ # # sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
454
+ # sigma_down = sigma_to**2 / sigma_from
455
+
456
+ # # 2. Conv = (sample - pred_original_sample) / sigma
457
+ # derivative = model_output
458
+
459
+ # dt = sigma_down - sigma
460
+ # # dt = sigma_down - sigma_from
461
+
462
+ # prev_sample = z_t - derivative * dt
463
+
464
+ # if sigma_up > 0:
465
+ # self.noise_list[self.step_index] = (prev_sample - z_tp1) / sigma_up
466
+
467
+ # prev_sample = prev_sample - self.noise_list[self.step_index] * sigma_up
468
+
469
+
470
+ # if not return_dict:
471
+ # return (prev_sample,)
472
+
473
+ # return EulerAncestralDiscreteSchedulerOutput(
474
+ # prev_sample=prev_sample, pred_original_sample=None
475
+ # )
476
+
477
+
478
+ # def step_friendly_inversion(
479
+ # self,
480
+ # model_output: torch.FloatTensor,
481
+ # timestep: Union[float, torch.FloatTensor],
482
+ # sample: torch.FloatTensor,
483
+ # generator: Optional[torch.Generator] = None,
484
+ # return_dict: bool = True,
485
+ # expected_next_sample: torch.FloatTensor = None,
486
+ # ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]:
487
+ # """
488
+ # Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
489
+ # process from the learned model outputs (most often the predicted noise).
490
+
491
+ # Args:
492
+ # model_output (`torch.FloatTensor`):
493
+ # The direct output from learned diffusion model.
494
+ # timestep (`float`):
495
+ # The current discrete timestep in the diffusion chain.
496
+ # sample (`torch.FloatTensor`):
497
+ # A current instance of a sample created by the diffusion process.
498
+ # generator (`torch.Generator`, *optional*):
499
+ # A random number generator.
500
+ # return_dict (`bool`):
501
+ # Whether or not to return a
502
+ # [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or tuple.
503
+
504
+ # Returns:
505
+ # [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or `tuple`:
506
+ # If return_dict is `True`,
507
+ # [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] is returned,
508
+ # otherwise a tuple is returned where the first element is the sample tensor.
509
+
510
+ # """
511
+
512
+ # if (
513
+ # isinstance(timestep, int)
514
+ # or isinstance(timestep, torch.IntTensor)
515
+ # or isinstance(timestep, torch.LongTensor)
516
+ # ):
517
+ # raise ValueError(
518
+ # (
519
+ # "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
520
+ # " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
521
+ # " one of the `scheduler.timesteps` as a timestep."
522
+ # ),
523
+ # )
524
+
525
+ # if not self.is_scale_input_called:
526
+ # logger.warning(
527
+ # "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
528
+ # "See `StableDiffusionPipeline` for a usage example."
529
+ # )
530
+
531
+ # self._init_step_index(timestep.view((1)))
532
+
533
+ # sigma = self.sigmas[self.step_index]
534
+
535
+ # # Upcast to avoid precision issues when computing prev_sample
536
+ # sample = sample.to(torch.float32)
537
+
538
+ # # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
539
+ # if self.config.prediction_type == "epsilon":
540
+ # pred_original_sample = sample - sigma * model_output
541
+ # elif self.config.prediction_type == "v_prediction":
542
+ # # * c_out + input * c_skip
543
+ # pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
544
+ # elif self.config.prediction_type == "sample":
545
+ # raise NotImplementedError("prediction_type not implemented yet: sample")
546
+ # else:
547
+ # raise ValueError(
548
+ # f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
549
+ # )
550
+
551
+ # sigma_from = self.sigmas[self.step_index]
552
+ # sigma_to = self.sigmas[self.step_index + 1]
553
+ # sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
554
+ # sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
555
+
556
+ # # 2. Convert to an ODE derivative
557
+ # # derivative = (sample - pred_original_sample) / sigma
558
+ # derivative = model_output
559
+
560
+ # dt = sigma_down - sigma
561
+
562
+ # prev_sample = sample + derivative * dt
563
+
564
+ # device = model_output.device
565
+ # # noise = randn_tensor(model_output.shape, dtype=model_output.dtype, device=device, generator=generator)
566
+ # # prev_sample = prev_sample + noise * sigma_up
567
+
568
+ # if sigma_up > 0:
569
+ # self.noise_list[self.step_index] = (expected_next_sample - prev_sample) / sigma_up
570
+
571
+ # prev_sample = prev_sample + self.noise_list[self.step_index] * sigma_up
572
+
573
+ # # Cast sample back to model compatible dtype
574
+ # prev_sample = prev_sample.to(model_output.dtype)
575
+
576
+ # # upon completion increase step index by one
577
+ # self._step_index += 1
578
+
579
+ # if not return_dict:
580
+ # return (prev_sample,)
581
+
582
+ # return EulerAncestralDiscreteSchedulerOutput(
583
+ # prev_sample=prev_sample, pred_original_sample=pred_original_sample
584
+ # )
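For orientation, a small sketch of how this scheduler is wired up elsewhere in the Space (mirroring editor.py; the latent shape assumes SDXL at 512x512, and pipe stands in for an already-loaded pipeline):

    import torch
    from diffusers.utils.torch_utils import randn_tensor
    from src.euler_scheduler import MyEulerAncestralDiscreteScheduler

    scheduler = MyEulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
    g_cpu = torch.Generator().manual_seed(7865)
    # One fixed noise tensor per step; sharing the same list between the inversion and
    # inference schedulers is what lets inv_step() and step() undo each other exactly.
    noise = [randn_tensor((1, 4, 64, 64), generator=g_cpu,
                          device=torch.device("cuda:0"), dtype=torch.float16)
             for _ in range(4)]
    scheduler.set_noise_list(noise)
    pipe.scheduler = scheduler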
src/eunms.py ADDED
@@ -0,0 +1,26 @@
+ from enum import Enum
+
+ class Scheduler_Type(Enum):
+     DDIM = 1
+     EULER = 2
+     LCM = 3
+     DDPM = 4
+
+ class Model_Type(Enum):
+     SDXL = 1
+     SDXL_Turbo = 2
+     LCM_SDXL = 3
+     SD15 = 4
+     SD21 = 5
+     SD21_Turbo = 6
+     SD14 = 7
+
+ class Gradient_Averaging_Type(Enum):
+     NONE = 1
+     EACH_ITER = 2
+     ON_END = 3
+
+ class Epsilon_Update_Type(Enum):
+     NONE = 1
+     OVERRIDE = 2
+     OPTIMIZE = 3
src/sdxl_inversion_pipeline.py ADDED
@@ -0,0 +1,375 @@
1
+ # Code is based on ReNoise https://github.com/garibida/ReNoise-Inversion
2
+
3
+ import torch
4
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
5
+
6
+ from diffusers import (
7
+ StableDiffusionXLImg2ImgPipeline,
8
+ )
9
+ from diffusers.utils.torch_utils import randn_tensor
10
+
11
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import (
12
+ StableDiffusionXLPipelineOutput,
13
+ retrieve_timesteps,
14
+ PipelineImageInput
15
+ )
16
+
17
+ from src.eunms import Epsilon_Update_Type
18
+
19
+
20
+ def _backward_ddim(x_tm1, alpha_t, alpha_tm1, eps_xt):
21
+ """
22
+ let a = alpha_t, b = alpha_{t - 1}
23
+ We have a > b,
24
+ x_{t} - x_{t - 1} = sqrt(a) ((sqrt(1/b) - sqrt(1/a)) * x_{t-1} + (sqrt(1/a - 1) - sqrt(1/b - 1)) * eps_{t-1})
25
+ From https://arxiv.org/pdf/2105.05233.pdf, section F.
26
+ """
27
+
28
+ a, b = alpha_t, alpha_tm1
29
+ sa = a ** 0.5
30
+ sb = b ** 0.5
31
+
32
+ return sa * ((1 / sb) * x_tm1 + ((1 / a - 1) ** 0.5 - (1 / b - 1) ** 0.5) * eps_xt)
33
+
34
+
35
+ class SDXLDDIMPipeline(StableDiffusionXLImg2ImgPipeline):
36
+ # @torch.no_grad()
37
+ def __call__(
38
+ self,
39
+ prompt: Union[str, List[str]] = None,
40
+ prompt_2: Optional[Union[str, List[str]]] = None,
41
+ image: PipelineImageInput = None,
42
+ strength: float = 0.3,
43
+ num_inversion_steps: int = 50,
44
+ timesteps: List[int] = None,
45
+ denoising_start: Optional[float] = None,
46
+ denoising_end: Optional[float] = None,
47
+ guidance_scale: float = 1.0,
48
+ negative_prompt: Optional[Union[str, List[str]]] = None,
49
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
50
+ num_images_per_prompt: Optional[int] = 1,
51
+ eta: float = 0.0,
52
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
53
+ latents: Optional[torch.FloatTensor] = None,
54
+ prompt_embeds: Optional[torch.FloatTensor] = None,
55
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
56
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
57
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
58
+ ip_adapter_image: Optional[PipelineImageInput] = None,
59
+ output_type: Optional[str] = "pil",
60
+ return_dict: bool = True,
61
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
62
+ guidance_rescale: float = 0.0,
63
+ original_size: Tuple[int, int] = None,
64
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
65
+ target_size: Tuple[int, int] = None,
66
+ negative_original_size: Optional[Tuple[int, int]] = None,
67
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
68
+ negative_target_size: Optional[Tuple[int, int]] = None,
69
+ aesthetic_score: float = 6.0,
70
+ negative_aesthetic_score: float = 2.5,
71
+ clip_skip: Optional[int] = None,
72
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
73
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
74
+ num_inference_steps: int = 50,
75
+ inv_hp=None,
76
+ **kwargs,
77
+ ):
78
+ callback = kwargs.pop("callback", None)
79
+ callback_steps = kwargs.pop("callback_steps", None)
80
+
81
+ if callback is not None:
82
+ deprecate(
83
+ "callback",
84
+ "1.0.0",
85
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
86
+ )
87
+ if callback_steps is not None:
88
+ deprecate(
89
+ "callback_steps",
90
+ "1.0.0",
91
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
92
+ )
93
+
94
+ # 1. Check inputs. Raise error if not correct
95
+ self.check_inputs(
96
+ prompt,
97
+ prompt_2,
98
+ strength,
99
+ num_inversion_steps,
100
+ callback_steps,
101
+ negative_prompt,
102
+ negative_prompt_2,
103
+ prompt_embeds,
104
+ negative_prompt_embeds,
105
+ callback_on_step_end_tensor_inputs,
106
+ )
107
+
108
+ denoising_start_fr = 1.0 - denoising_start
109
+ denoising_start = denoising_start
110
+
111
+ self._guidance_scale = guidance_scale
112
+ self._guidance_rescale = guidance_rescale
113
+ self._clip_skip = clip_skip
114
+ self._cross_attention_kwargs = cross_attention_kwargs
115
+ self._denoising_end = denoising_end
116
+ self._denoising_start = denoising_start
117
+
118
+ # 2. Define call parameters
119
+ if prompt is not None and isinstance(prompt, str):
120
+ batch_size = 1
121
+ elif prompt is not None and isinstance(prompt, list):
122
+ batch_size = len(prompt)
123
+ else:
124
+ batch_size = prompt_embeds.shape[0]
125
+
126
+ device = self._execution_device
127
+
128
+ # 3. Encode input prompt
129
+ text_encoder_lora_scale = (
130
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
131
+ )
132
+ (
133
+ prompt_embeds,
134
+ negative_prompt_embeds,
135
+ pooled_prompt_embeds,
136
+ negative_pooled_prompt_embeds,
137
+ ) = self.encode_prompt(
138
+ prompt=prompt,
139
+ prompt_2=prompt_2,
140
+ device=device,
141
+ num_images_per_prompt=num_images_per_prompt,
142
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
143
+ negative_prompt=negative_prompt,
144
+ negative_prompt_2=negative_prompt_2,
145
+ prompt_embeds=prompt_embeds,
146
+ negative_prompt_embeds=negative_prompt_embeds,
147
+ pooled_prompt_embeds=pooled_prompt_embeds,
148
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
149
+ lora_scale=text_encoder_lora_scale,
150
+ clip_skip=self.clip_skip,
151
+ )
152
+
153
+ # 4. Preprocess image
154
+ image = self.image_processor.preprocess(image)
155
+
156
+ # 5. Prepare timesteps
157
+ def denoising_value_valid(dnv):
158
+ return isinstance(self.denoising_end, float) and 0 < dnv < 1
159
+
160
+ timesteps, num_inversion_steps = retrieve_timesteps(self.scheduler, num_inversion_steps, device, timesteps)
161
+ timesteps_num_inference_steps, num_inference_steps = retrieve_timesteps(self.scheduler_inference,
162
+ num_inference_steps, device, None)
163
+
164
+ timesteps, num_inversion_steps = self.get_timesteps(
165
+ num_inversion_steps,
166
+ strength,
167
+ device,
168
+ denoising_start=self.denoising_start if denoising_value_valid else None,
169
+ )
170
+ # latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
171
+
172
+ # add_noise = True if self.denoising_start is None else False
173
+ # 6. Prepare latent variables
174
+ with torch.no_grad():
175
+ latents = self.prepare_latents(
176
+ image,
177
+ None,
178
+ batch_size,
179
+ num_images_per_prompt,
180
+ prompt_embeds.dtype,
181
+ device,
182
+ generator,
183
+ False,
184
+ )
185
+ # 7. Prepare extra step kwargs.
186
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
187
+
188
+ height, width = latents.shape[-2:]
189
+ height = height * self.vae_scale_factor
190
+ width = width * self.vae_scale_factor
191
+
192
+ original_size = original_size or (height, width)
193
+ target_size = target_size or (height, width)
194
+
195
+ # 8. Prepare added time ids & embeddings
196
+ if negative_original_size is None:
197
+ negative_original_size = original_size
198
+ if negative_target_size is None:
199
+ negative_target_size = target_size
200
+
201
+ add_text_embeds = pooled_prompt_embeds
202
+ if self.text_encoder_2 is None:
203
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
204
+ else:
205
+ text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
206
+
207
+ add_time_ids, add_neg_time_ids = self._get_add_time_ids(
208
+ original_size,
209
+ crops_coords_top_left,
210
+ target_size,
211
+ aesthetic_score,
212
+ negative_aesthetic_score,
213
+ negative_original_size,
214
+ negative_crops_coords_top_left,
215
+ negative_target_size,
216
+ dtype=prompt_embeds.dtype,
217
+ text_encoder_projection_dim=text_encoder_projection_dim,
218
+ )
219
+ add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
220
+
221
+ if self.do_classifier_free_guidance:
222
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
223
+ add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
224
+ add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
225
+ add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
226
+
227
+ prompt_embeds = prompt_embeds.to(device)
228
+ add_text_embeds = add_text_embeds.to(device)
229
+ add_time_ids = add_time_ids.to(device)
230
+
231
+ if ip_adapter_image is not None:
232
+ image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt)
233
+ if self.do_classifier_free_guidance:
234
+ image_embeds = torch.cat([negative_image_embeds, image_embeds])
235
+ image_embeds = image_embeds.to(device)
236
+
237
+ # 9. Denoising loop
238
+ num_warmup_steps = max(len(timesteps) - num_inversion_steps * self.scheduler.order, 0)
239
+ prev_timestep = None
240
+
241
+ self._num_timesteps = len(timesteps)
242
+ self.prev_z = torch.clone(latents)
243
+ self.prev_z4 = torch.clone(latents)
244
+ self.z_0 = torch.clone(latents)
245
+ g_cpu = torch.Generator().manual_seed(7865)
246
+ self.noise = randn_tensor(self.z_0.shape, generator=g_cpu, device=self.z_0.device, dtype=self.z_0.dtype)
247
+
248
+ # Friendly inversion params
249
+ timesteps_for = reversed(timesteps)
250
+ noise = randn_tensor(latents.shape, generator=g_cpu, device=latents.device, dtype=latents.dtype)
251
+ #latents = latents
252
+ z_T = latents.clone()
253
+
254
+ all_latents = [latents.clone()]
255
+ with self.progress_bar(total=num_inversion_steps) as progress_bar:
256
+ for i, t in enumerate(timesteps_for):
257
+
258
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
259
+ if ip_adapter_image is not None:
260
+ added_cond_kwargs["image_embeds"] = image_embeds
261
+
262
+ z_tp1 = self.inversion_step(latents,
263
+ t,
264
+ prompt_embeds,
265
+ added_cond_kwargs,
266
+ prev_timestep=prev_timestep,
267
+ inv_hp=inv_hp,
268
+ z_0=self.z_0)
269
+
270
+ prev_timestep = t
271
+ latents = z_tp1
272
+
273
+ all_latents.append(latents.clone())
274
+
275
+ if callback_on_step_end is not None:
276
+ callback_kwargs = {}
277
+ for k in callback_on_step_end_tensor_inputs:
278
+ callback_kwargs[k] = locals()[k]
279
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
280
+
281
+ latents = callback_outputs.pop("latents", latents)
282
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
283
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
284
+ add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
285
+ negative_pooled_prompt_embeds = callback_outputs.pop(
286
+ "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
287
+ )
288
+ add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
289
+ add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)
290
+
291
+ # call the callback, if provided
292
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
293
+ progress_bar.update()
294
+ if callback is not None and i % callback_steps == 0:
295
+ step_idx = i // getattr(self.scheduler, "order", 1)
296
+ callback(step_idx, t, latents)
297
+
298
+ image = latents
299
+
300
+ # Offload all models
301
+ self.maybe_free_model_hooks()
302
+
303
+ return StableDiffusionXLPipelineOutput(images=image), all_latents
304
+
305
+ def get_timestamp_dist(self, z_0, timesteps):
306
+ timesteps = timesteps.to(z_0.device)
307
+ sigma = self.scheduler.sigmas.cuda()[:-1][self.scheduler.timesteps == timesteps]
308
+ z_0 = z_0.reshape(-1, 1)
309
+
310
+ def gaussian_pdf(x):
311
+ shape = x.shape
312
+ x = x.reshape(-1, 1)
313
+ all_probs = - 0.5 * torch.pow(((x - z_0) / sigma), 2)
314
+ return all_probs.reshape(shape)
315
+
316
+ return gaussian_pdf
317
+
318
+ # @torch.no_grad()
319
+ def inversion_step(
320
+ self,
321
+ z_t: torch.tensor,
322
+ t: torch.tensor,
323
+ prompt_embeds,
324
+ added_cond_kwargs,
325
+ prev_timestep: Optional[torch.tensor] = None,
326
+ inv_hp=None,
327
+ z_0=None,
328
+ ) -> torch.tensor:
329
+
330
+ n_iters, alpha, lr = inv_hp
331
+ latent = z_t
332
+ best_latent = None
333
+ best_score = torch.inf
334
+ curr_dist = self.get_timestamp_dist(z_0, t)
335
+ for i in range(n_iters):
336
+ latent.requires_grad = True
337
+ noise_pred = self.unet_pass(latent, t, prompt_embeds, added_cond_kwargs)
338
+
339
+ next_latent = self.backward_step(noise_pred, t, z_t, prev_timestep)
340
+ f_x = (next_latent - latent).abs() - alpha * curr_dist(next_latent)
341
+ score = f_x.mean()
342
+
343
+ if score < best_score:
344
+ best_score = score
345
+ best_latent = next_latent.detach()
346
+
347
+ f_x.sum().backward()
348
+ latent = latent - lr * (f_x / latent.grad)
349
+ latent.grad = None
350
+ latent._grad_fn = None
351
+
352
+ # if self.cfg.update_epsilon_type != Epsilon_Update_Type.NONE:
353
+ # noise_pred = self.unet_pass(best_latent, t, prompt_embeds, added_cond_kwargs)
354
+ # self.scheduler.step_and_update_noise(noise_pred, t, best_latent, z_t, return_dict=False,
355
+ # update_epsilon_type=self.cfg.update_epsilon_type)
356
+ return best_latent
357
+
358
+ @torch.no_grad()
359
+ def unet_pass(self, z_t, t, prompt_embeds, added_cond_kwargs):
360
+ latent_model_input = torch.cat([z_t] * 2) if self.do_classifier_free_guidance else z_t
361
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
362
+ return self.unet(
363
+ latent_model_input,
364
+ t,
365
+ encoder_hidden_states=prompt_embeds,
366
+ timestep_cond=None,
367
+ cross_attention_kwargs=self.cross_attention_kwargs,
368
+ added_cond_kwargs=added_cond_kwargs,
369
+ return_dict=False,
370
+ )[0]
371
+
372
+ @torch.no_grad()
373
+ def backward_step(self, nosie_pred, t, z_t, prev_timestep):
374
+ extra_step_kwargs = {}
375
+ return self.scheduler.inv_step(nosie_pred, t, z_t, **extra_step_kwargs, return_dict=False)[0].detach()
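Finally, a condensed sketch of how this inversion pipeline is chained with the standard img2img pipeline (it mirrors ImageEditorDemo.invert/edit above; source_image and the prompts are placeholders):

    # Assumes pipe_inversion (SDXLDDIMPipeline) and pipe_inference (AutoPipelineForImage2Image)
    # share the same MyEulerAncestralDiscreteScheduler noise list, as set up in app.py / editor.py.
    inv_latent = pipe_inversion(prompt="a photo of a cat",
                                image=source_image,              # 512x512 PIL image
                                num_inversion_steps=4,
                                num_inference_steps=4,
                                guidance_scale=1.2,
                                strength=0.6,
                                denoising_start=0.4,
                                inv_hp=[2, 0.1, 0.2])[0][0]      # [n_iters, alpha, lr]
    edited = pipe_inference(prompt="a photo of a dog",
                            image=inv_latent,
                            num_inference_steps=4,
                            strength=0.6,
                            denoising_start=0.4,
                            guidance_scale=1.2).images[0]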