Update model.py
model.py CHANGED
@@ -1672,153 +1672,4 @@ class UNet(nn.Module):
             out
         )  # (batch_size, self.conv_out_channels, h, w) -> (batch_size, image_channels, h, w)
 
-        return out  # (batch_size, image_channels, h, w)
-
-
-def sample_ddpm_inference(
-    unet,
-    vae,
-    text_prompt,
-    mask_image_pil=None,
-    guidance_scale=1.0,
-    device=torch.device("cpu"),
-):
-    """
-    Given a text prompt and (optionally) an image condition (as a PIL image),
-    sample from the diffusion model and return a generated image (PIL image).
-    """
-    # Create noise scheduler
-    scheduler = LinearNoiseScheduler(
-        num_timesteps=diffusion_params["num_timesteps"],
-        beta_start=diffusion_params["beta_start"],
-        beta_end=diffusion_params["beta_end"],
-    )
-    # Get conditioning config from ldm_params
-    condition_config = ldm_params.get("condition_config", None)
-    condition_types = (
-        condition_config.get("condition_types", [])
-        if condition_config is not None
-        else []
-    )
-
-    # Load text tokenizer/model for conditioning
-    text_model_type = condition_config["text_condition_config"]["text_embed_model"]
-    text_tokenizer, text_model = get_tokenizer_and_model(text_model_type, device=device)
-
-    # Get empty text representation for classifier-free guidance
-    empty_text_embed = get_text_representation([""], text_tokenizer, text_model, device)
-
-    # Get text representation of the input prompt
-    text_prompt_embed = get_text_representation(
-        [text_prompt], text_tokenizer, text_model, device
-    )
-
-    # Prepare image conditioning:
-    # If the user uploaded a mask image (should be a PIL image), convert it; otherwise, use zeros.
-    if "image" in condition_types:
-        if mask_image_pil is not None:
-            mask_transform = transforms.Compose(
-                [
-                    transforms.Resize(
-                        (
-                            ldm_params["condition_config"]["image_condition_config"][
-                                "image_condition_h"
-                            ],
-                            ldm_params["condition_config"]["image_condition_config"][
-                                "image_condition_w"
-                            ],
-                        )
-                    ),
-                    transforms.ToTensor(),
-                ]
-            )
-            mask_tensor = (
-                mask_transform(mask_image_pil).unsqueeze(0).to(device)
-            )  # (1, channels, H, W)
-        else:
-            # Create a zero mask with the required number of channels (e.g. 18)
-            ic = ldm_params["condition_config"]["image_condition_config"][
-                "image_condition_input_channels"
-            ]
-            H = ldm_params["condition_config"]["image_condition_config"][
-                "image_condition_h"
-            ]
-            W = ldm_params["condition_config"]["image_condition_config"][
-                "image_condition_w"
-            ]
-            mask_tensor = torch.zeros((1, ic, H, W), device=device)
-    else:
-        mask_tensor = None
-
-    # Build conditioning dictionaries for classifier-free guidance:
-    # For unconditional, we use empty text and zero mask.
-    uncond_input = {}
-    cond_input = {}
-    if "text" in condition_types:
-        uncond_input["text"] = empty_text_embed
-        cond_input["text"] = text_prompt_embed
-    if "image" in condition_types:
-        # Use zeros for unconditioning, and the provided mask for conditioning.
-        uncond_input["image"] = torch.zeros_like(mask_tensor)
-        cond_input["image"] = mask_tensor
-
-    # Load the diffusion UNet (and assume it has been pretrained and saved)
-    # unet = UNet(
-    #     image_channels=autoencoder_params["z_channels"], model_config=ldm_params
-    # ).to(device)
-    # ldm_checkpoint_path = os.path.join(
-    #     train_params["task_name"], train_params["ldm_ckpt_name"]
-    # )
-    # if os.path.exists(ldm_checkpoint_path):
-    #     checkpoint = torch.load(ldm_checkpoint_path, map_location=device)
-    #     unet.load_state_dict(checkpoint["model_state_dict"])
-    # unet.eval()
-
-    # Load VQVAE (assume pretrained and saved)
-    # vae = VQVAE(
-    #     image_channels=dataset_params["image_channels"], model_config=autoencoder_params
-    # ).to(device)
-    # vae_checkpoint_path = os.path.join(
-    #     train_params["task_name"], train_params["vqvae_autoencoder_ckpt_name"]
-    # )
-    # if os.path.exists(vae_checkpoint_path):
-    #     checkpoint = torch.load(vae_checkpoint_path, map_location=device)
-    #     vae.load_state_dict(checkpoint["model_state_dict"])
-    # vae.eval()
-
-    # Determine latent shape from VQVAE: (batch, z_channels, H_lat, W_lat)
-    # For example, if image_size is 256 and there are 3 downsamplings, H_lat = 256 // 8 = 32.
-    latent_size = dataset_params["image_size"] // (
-        2 ** sum(autoencoder_params["down_sample"])
-    )
-    batch = train_params["num_samples"]
-    z_channels = autoencoder_params["z_channels"]
-
-    # Sample initial latent noise
-    xt = torch.randn((batch, z_channels, latent_size, latent_size), device=device)
-
-    # Sampling loop (reverse diffusion)
-    T = diffusion_params["num_timesteps"]
-    for i in reversed(range(T)):
-        t = torch.full((batch,), i, dtype=torch.long, device=device)
-        # Get conditional noise prediction
-        noise_pred_cond = unet(xt, t, cond_input)
-        if guidance_scale > 1:
-            noise_pred_uncond = unet(xt, t, uncond_input)
-            noise_pred = noise_pred_uncond + guidance_scale * (
-                noise_pred_cond - noise_pred_uncond
-            )
-        else:
-            noise_pred = noise_pred_cond
-        xt, _ = scheduler.sample_prev_timestep(xt, noise_pred, t)
-
-        with torch.no_grad():
-            generated = vae.decode(xt)
-
-        generated = torch.clamp(generated, -1, 1)
-        generated = (generated + 1) / 2  # scale to [0,1]
-        grid = make_grid(generated, nrow=1)
-        pil_img = transforms.ToPILImage()(grid.cpu())
-
-        if i % 10 == 0:
-            yield pil_img
+        return out  # (batch_size, image_channels, h, w)
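
For context, the removed sample_ddpm_inference is a generator: it runs the reverse-diffusion loop with classifier-free guidance and yields an intermediate PIL preview every 10 timesteps, with the final yield (at timestep 0) being the fully denoised sample. Below is a minimal sketch of how such a generator might be driven; the pretrained unet and vae objects, the prompt, the guidance scale, and the output path are illustrative assumptions (the function also relies on the module-level diffusion_params/ldm_params/dataset_params/autoencoder_params/train_params dictionaries being defined), not part of this diff.

# Hypothetical driver for the removed generator. Assumes `unet`, `vae`,
# and the module-level config dictionaries are already loaded elsewhere.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

last_preview = None
for preview in sample_ddpm_inference(
    unet,
    vae,
    text_prompt="A portrait photo of a person with black hair",  # illustrative prompt
    mask_image_pil=None,   # no spatial mask -> zero-mask image conditioning
    guidance_scale=7.5,    # >1 enables classifier-free guidance
    device=device,
):
    last_preview = preview  # PIL.Image yielded every 10 timesteps

if last_preview is not None:
    last_preview.save("sample.png")  # hypothetical output path

Because the generator streams previews, a UI layer (for example a Gradio callback) could display each yielded image as denoising progresses instead of waiting for the final sample.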