"""Gradio demo: depth-guided Differential Diffusion on SDXL.

The app estimates a depth map for the uploaded picture with DepthAnything and
feeds it (or a user-supplied change map) to the SDXL base + refiner
Differential Diffusion pipelines.
"""
import os
import tempfile

import cv2
import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F
from diffusers import DPMSolverMultistepScheduler
from gradio_imageslider import ImageSlider
from PIL import Image
from torchvision import transforms
from torchvision.transforms import Compose

from SDXL.diff_pipe import StableDiffusionXLDiffImg2ImgPipeline

from .depth_anything.depth_anything.dpt import DepthAnything
from .depth_anything.depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet

NUM_INFERENCE_STEPS = 50

# Half precision is only selected on CUDA; MPS and CPU run in float32
# (float16 kernels are incomplete on CPU in many PyTorch builds).
if torch.cuda.is_available():
    device = "cuda"
    dtype = torch.float16
elif torch.backends.mps.is_available():
    device = "mps"
    dtype = torch.float32
else:
    device = "cpu"
    dtype = torch.float32

encoder = 'vitl'  # can also be 'vits' or 'vitb'
model = DepthAnything.from_pretrained(f"LiheYoung/depth_anything_{encoder}14").to(device).eval()

# Pre-processing expected by DepthAnything (mirrors the reference pipeline of
# the Depth Anything project): resize so both sides are multiples of 14,
# ImageNet normalisation, then HWC -> CHW float32.
transform = Compose([
    Resize(
        width=518,
        height=518,
        resize_target=False,
        keep_aspect_ratio=True,
        ensure_multiple_of=14,
        resize_method='lower_bound',
        image_interpolation_method=cv2.INTER_CUBIC,
    ),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

base = StableDiffusionXLDiffImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=dtype,
    variant="fp16",
    use_safetensors=True,
)

# The refiner shares the second text encoder and the VAE with the base model.
refiner = StableDiffusionXLDiffImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=base.text_encoder_2,
    vae=base.vae,
    torch_dtype=dtype,
    use_safetensors=True,
    variant="fp16",
)

base.scheduler = DPMSolverMultistepScheduler.from_config(base.scheduler.config)
refiner.scheduler = DPMSolverMultistepScheduler.from_config(base.scheduler.config)


# DepthAnything
@torch.no_grad()
def predict_depth(model, image):
    """Run ``model`` on ``image`` with gradient tracking disabled."""
    return model(image)


def depthify(image):
    """Estimate a depth map for ``image``.

    Args:
        image: Either a PIL image (what the Gradio ``type="pil"`` component
            delivers) or a numpy array, which is treated as BGR exactly as the
            original implementation did.

    Returns:
        A list ``[(original_image, colored_depth), path, raw_depth]`` where
        ``original_image`` is a copy of the input, ``colored_depth`` is the
        INFERNO-colormapped depth as an RGB array, ``path`` is the name of a
        PNG temp file holding the grayscale depth, and ``raw_depth`` is that
        grayscale depth as an 8-bit PIL image, min-max scaled to 0..255.
    """
    original_image = image.copy()
    if isinstance(image, Image.Image):
        rgb = np.asarray(image.convert("RGB"))
    else:
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    h, w = rgb.shape[:2]

    net_input = transform({'image': rgb / 255.0})['image']
    net_input = torch.from_numpy(net_input).unsqueeze(0).to(device)

    depth = predict_depth(model, net_input)
    # Bring the prediction back to the resolution of the input picture.
    depth = F.interpolate(depth[None], (h, w), mode='bilinear', align_corners=False)[0, 0]

    # Scale to 0..255 *before* the uint8 cast: casting the unbounded float
    # prediction directly would wrap or clip. A constant map has zero span,
    # so guard the division.
    lowest = depth.min()
    span = depth.max() - lowest
    if span > 0:
        depth = (depth - lowest) / span * 255.0
    else:
        depth = torch.zeros_like(depth)
    depth = depth.cpu().numpy().astype(np.uint8)

    raw_depth = Image.fromarray(depth)
    # delete=False keeps the file for the caller; the handle itself is closed
    # when the ``with`` block exits, so no descriptor is leaked.
    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
        raw_depth.save(tmp, format='PNG')

    colored_depth = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)[:, :, ::-1]
    return [(original_image, colored_depth), tmp.name, raw_depth]


# DifferentialDiffusion
def preprocess_image(image):
    """Turn a PIL image into a ``(1, 3, H, W)`` tensor in ``[-1, 1]`` on ``device``.

    Height and width are center-cropped down to multiples of 64.
    """
    image = image.convert("RGB")
    image = transforms.CenterCrop((image.size[1] // 64 * 64, image.size[0] // 64 * 64))(image)
    image = transforms.ToTensor()(image)
    image = image * 2 - 1
    image = image.unsqueeze(0).to(device)
    return image


def preprocess_map(map):
    """Turn a PIL change map into a single-channel tensor in ``[0, 1]`` on ``device``.

    Height and width are center-cropped down to multiples of 64.
    """
    map = map.convert("L")
    map = transforms.CenterCrop((map.size[1] // 64 * 64, map.size[0] // 64 * 64))(map)
    # convert to tensor
    map = transforms.ToTensor()(map)
    map = map.to(device)
    return map


def inference(image, map, gs, prompt, negative_prompt):
    """Run base + refiner Differential Diffusion and return the edited image.

    Args:
        image: PIL input picture.
        map: PIL change map.
        gs: Guidance scale used for the base pass (the refiner pass uses 7.5).
        prompt: Text prompt.
        negative_prompt: Negative text prompt.

    Raises:
        gr.Error: If ``image`` or ``map`` is missing.
    """
    validate_inputs(image, map)
    image = preprocess_image(image)
    map = preprocess_map(map)

    # The base pass handles the first 80% of the denoising schedule and hands
    # its latents to the refiner, which finishes the remaining 20%.
    latents = base.to(device)(
        prompt=prompt,
        original_image=image,
        image=image,
        strength=1,
        guidance_scale=gs,
        num_images_per_prompt=1,
        negative_prompt=negative_prompt,
        map=map,
        num_inference_steps=NUM_INFERENCE_STEPS,
        denoising_end=0.8,
        output_type="latent",
    ).images

    edited_image = refiner.to(device)(
        prompt=prompt,
        original_image=image,
        image=latents,
        strength=1,
        guidance_scale=7.5,
        num_images_per_prompt=1,
        negative_prompt=negative_prompt,
        map=map,
        num_inference_steps=NUM_INFERENCE_STEPS,
        denoising_start=0.8,
    ).images[0]
    return edited_image


def validate_inputs(image, map):
    """Raise ``gr.Error`` when the image or the change map is missing."""
    if image is None:
        raise gr.Error("Missing image")
    if map is None:
        raise gr.Error("Missing map")


def run(image, gs, prompt, neg_prompt, change_map=None):
    """Gradio click handler: edit ``image`` guided by a change map.

    Args:
        image: PIL input picture.
        gs: Guidance scale for the base pass.
        prompt: Text prompt.
        neg_prompt: Negative text prompt.
        change_map: Optional PIL change map. When omitted, the depth map
            estimated by :func:`depthify` is used instead. A supplied map is
            forwarded as-is (it is not resized to match ``image``).

    Raises:
        gr.Error: If ``image`` is missing.
    """
    # Check up front: depthify() would otherwise fail with an AttributeError
    # before inference() gets the chance to report the problem properly.
    if image is None:
        raise gr.Error("Missing image")

    if change_map is None:
        _, depth_path, change_map = depthify(image)
        # The PNG copy written by depthify() is not needed here; remove it so
        # temp files do not pile up across runs. Best effort only.
        try:
            os.remove(depth_path)
        except OSError:
            pass

    return inference(image, change_map, gs, prompt, neg_prompt)


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            with gr.Row():
                input_image = gr.Image(label="Input Image", type="pil")
                # Optional: when left empty, the depth map of the input image
                # serves as the change map.
                change_map = gr.Image(label="Change Map", type="pil")
            gs = gr.Slider(0, 28, value=7.5, label="Guidance Scale")
            prompt = gr.Textbox(label="Prompt")
            neg_prompt = gr.Textbox(label="Negative Prompt")
            with gr.Row():
                clr_btn = gr.ClearButton(components=[input_image, change_map, gs, prompt, neg_prompt])
                run_btn = gr.Button("Run", variant="primary")

        output = gr.Image(label="Output Image")
    # Inputs are passed positionally, so their order must match run()'s
    # parameters: (image, gs, prompt, neg_prompt, change_map).
    run_btn.click(
        run,
        inputs=[input_image, gs, prompt, neg_prompt, change_map],
        outputs=output,
    )
    clr_btn.add(output)

if __name__ == "__main__":
    demo.launch()