import tempfile

import cv2
import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from torchvision import transforms
from torchvision.transforms import Compose

from diffusers import DPMSolverMultistepScheduler
from SDXL.diff_pipe import StableDiffusionXLDiffImg2ImgPipeline

# DepthAnything
from depth_anything.depth_anything.dpt import DepthAnything
from depth_anything.depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet

NUM_INFERENCE_STEPS = 50

# Pick the best available device. MPS and CPU lack reliable float16 support for
# these pipelines, so fall back to float32 anywhere off CUDA.
dtype = torch.float16
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
    dtype = torch.float32
else:
    DEVICE = "cpu"
    dtype = torch.float32

encoder = 'vitl'  # can also be 'vits' or 'vitb'
model = DepthAnything.from_pretrained(f"LiheYoung/depth_anything_{encoder}14").to(DEVICE).eval()

base = StableDiffusionXLDiffImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=dtype,
    variant="fp16",
    use_safetensors=True,
)
# The refiner shares the base pipeline's VAE and second text encoder to save memory.
refiner = StableDiffusionXLDiffImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=base.text_encoder_2,
    vae=base.vae,
    torch_dtype=dtype,
    use_safetensors=True,
    variant="fp16",
)
base.scheduler = DPMSolverMultistepScheduler.from_config(base.scheduler.config)
refiner.scheduler = DPMSolverMultistepScheduler.from_config(refiner.scheduler.config)

# DepthAnything preprocessing: resize so both sides are multiples of 14 (the ViT
# patch size), then apply ImageNet normalization.
transform = Compose([
    Resize(
        width=518,
        height=518,
        resize_target=False,
        keep_aspect_ratio=True,
        ensure_multiple_of=14,
        resize_method='lower_bound',
        image_interpolation_method=cv2.INTER_CUBIC,
    ),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])


@torch.no_grad()
def predict_depth(model, image):
    return model(image)


def depthify(image):
    """Estimate depth for `image`; return a display pair, a saved PNG path, and the raw map."""
    original_image = image.copy()
    h, w = image.shape[:2]

    # Gradio already delivers RGB, so only rescale to [0, 1] here.
    image = image.astype(np.float32) / 255.0
    image = transform({'image': image})['image']
    image = torch.from_numpy(image).unsqueeze(0).to(DEVICE)

    depth = predict_depth(model, image)
    depth = F.interpolate(depth[None], (h, w), mode='bilinear', align_corners=False)[0, 0]

    # Normalize to [0, 255] before quantizing; casting the raw float depth straight
    # to uint8 would clip and wrap its values.
    depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
    depth = depth.cpu().numpy().astype(np.uint8)

    raw_depth = Image.fromarray(depth)
    tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
    raw_depth.save(tmp.name)

    colored_depth = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)[:, :, ::-1]
    return [(original_image, colored_depth), tmp.name, raw_depth]


# DifferentialDiffusion
def preprocess_image(image_array):
    image = Image.fromarray(image_array).convert("RGB")
    # Crop to dimensions divisible by 64, as the SDXL VAE requires.
    image = transforms.CenterCrop((image.size[1] // 64 * 64, image.size[0] // 64 * 64))(image)
    image = transforms.ToTensor()(image)
    image = image * 2 - 1  # rescale from [0, 1] to [-1, 1]
    image = image.unsqueeze(0).to(DEVICE)
    return image


def preprocess_map(change_map):
    change_map = change_map.convert("L")
    change_map = transforms.CenterCrop((change_map.size[1] // 64 * 64, change_map.size[0] // 64 * 64))(change_map)
    change_map = transforms.ToTensor()(change_map)
    return change_map.to(DEVICE)
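# The change map is what makes this Differential Diffusion rather than plain
# img2img: after preprocess_map(), each pixel's gray value sets how strongly that
# region is repainted. This app uses a depth map, but any grayscale image of the
# input's size works. A minimal sketch (illustrative only; make_gradient_map is
# not part of the app) that builds a left-to-right ramp from "keep" to "repaint":
def make_gradient_map(width, height):
    # Values ramp from 0 (region preserved) to 255 (region fully repainted).
    ramp = np.tile(np.linspace(0, 255, width, dtype=np.uint8), (height, 1))
    return Image.fromarray(ramp, mode="L")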
def inference(image, change_map, guidance_scale, prompt, negative_prompt,
              steps, denoising_start, denoising_end):
    validate_inputs(image, change_map)
    image = preprocess_image(image)
    change_map = preprocess_map(change_map)

    # Run the base model up to denoising_end and hand its latents to the refiner.
    base.to(DEVICE)
    edited_images = base(
        prompt=prompt,
        original_image=image,
        image=image,
        strength=1,
        guidance_scale=guidance_scale,
        num_images_per_prompt=1,
        negative_prompt=negative_prompt,
        map=change_map,
        num_inference_steps=steps,
        denoising_end=denoising_end,
        output_type="latent",
    ).images

    # Move the base model off the accelerator before running the refiner; merely
    # dropping a Python reference (the old `base_device = None`) frees nothing.
    base.to("cpu")
    if DEVICE == "cuda":
        torch.cuda.empty_cache()

    refiner.to(DEVICE)
    edited_image = refiner(
        prompt=prompt,
        original_image=image,
        image=edited_images,
        strength=1,
        guidance_scale=guidance_scale,
        num_images_per_prompt=1,
        negative_prompt=negative_prompt,
        map=change_map,
        num_inference_steps=steps,
        denoising_start=denoising_start,
    ).images[0]
    refiner.to("cpu")
    if DEVICE == "cuda":
        torch.cuda.empty_cache()

    return edited_image


def validate_inputs(image, change_map):
    if image is None:
        raise gr.Error("Missing image")
    if change_map is None:
        raise gr.Error("Missing change map")


def run(image, gs, prompt, neg_prompt, steps, denoising_start, denoising_end):
    # Estimate depth first, then use the raw depth map as the change map.
    (original_image, colored_depth), depth_path, raw_depth = depthify(image)
    print(f"Depth map saved to {depth_path}")
    edited = inference(original_image, raw_depth, gs, prompt, neg_prompt,
                       steps, denoising_start, denoising_end)
    return raw_depth, edited


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            with gr.Row():
                input_image = gr.Image(label="Input Image")
            gs = gr.Slider(0, 28, value=7.5, label="Guidance Scale")
            # precision=0 makes gr.Number return an int, as num_inference_steps expects.
            steps = gr.Number(value=NUM_INFERENCE_STEPS, label="Steps", precision=0)
            denoising_start = gr.Slider(0, 1, value=0.8, label="Denoising Start")
            denoising_end = gr.Slider(0, 1, value=0.8, label="Denoising End")
            prompt = gr.Textbox(label="Prompt")
            neg_prompt = gr.Textbox(label="Negative Prompt")
            with gr.Row():
                clr_btn = gr.ClearButton(components=[input_image, gs, prompt, neg_prompt,
                                                     steps, denoising_start, denoising_end])
                run_btn = gr.Button("Run", variant="primary")
        with gr.Column():
            output = gr.Image(label="Output Image")
            change_map = gr.Image(label="Change Map")

    run_btn.click(
        run,
        inputs=[input_image, gs, prompt, neg_prompt, steps, denoising_start, denoising_end],
        outputs=[change_map, output],
    )
    clr_btn.add(output)

if __name__ == "__main__":
    demo.launch()
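# A minimal headless sketch (assumed usage, not part of the app): skip the UI and
# edit a local file directly. "input.png" and the prompt are placeholders.
#
#     img = np.array(Image.open("input.png").convert("RGB"))
#     depth_map, edited = run(img, 7.5, "a snowy mountain landscape", "", 50, 0.8, 0.8)
#     edited.save("edited.png")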