StableZip

Runtime error

File size: 13,772 Bytes

13ff04b
 
 
 
bf2495a
 
13ff04b
 
 
 
bf2495a
87b4a1a
 
bf2495a
 
13ff04b
bf2495a
b47e7c8
 
bf2495a
312ce40
bf2495a
 
 
13ff04b
bf2495a
 
13ff04b
bf2495a
 
13ff04b
bf2495a
 
312ce40
13ff04b
 
 
 
 
312ce40
 
 
 
 
 
 
 
 
 
 
bf2495a
 
 
 
 
 
 
 
312ce40
 
 
 
 
 
 
 
 
 
 
 
bf2495a
 
312ce40
bf2495a
93b410f
13ff04b
 
 
 
 
312ce40
 
 
 
 
 
 
 
 
 
 
 
 
bf2495a
 
 
 
13ff04b
 
bf2495a
 
 
 
 
 
 
 
312ce40
bf2495a
312ce40
 
 
 
bf2495a
 
312ce40
bf2495a
93b410f
13ff04b
 
 
 
 
02742e5
bf2495a
 
 
 
 
 
 
 
 
 
 
 
 
13ff04b
 
bf2495a
b1a60cd
bf2495a
 
312ce40
bf2495a
312ce40
 
 
 
bf2495a
 
312ce40
 
bf2495a
 
312ce40
bf2495a
312ce40
bf2495a
312ce40
bf2495a
 
312ce40
bf2495a
312ce40
bf2495a
 
312ce40
882873e
 
 
312ce40
 
bf2495a
312ce40
 
bf2495a
312ce40
 
 
bf2495a
 
2aa88fe
 
13ff04b
60efe0d
312ce40
 
 
 
bf2495a
882873e
bf2495a
 
 
 
 
882873e
bf2495a
13ff04b
 
 
bf2495a
 
 
13ff04b
 
 
 
 
bf2495a
 
 
 
 
 
 
 
 
 
f493b13
bf2495a
312ce40
882873e
bf2495a
 
13ff04b
 
 
 
 
 
bf2495a
882873e
13ff04b
 
 
 
 
 
 
 
 
 
 
 
 
882873e
 
13ff04b
 
 
bf2495a
 
 
312ce40
882873e
bf2495a
 
 
 
9460541
bf2495a
13ff04b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9460541
 
 
 
312ce40
 
9460541
13ff04b
 
9460541
c68ee9c
 
 
9460541
 
 
1d9fd5c
 
 
 
 
 
 
 
9460541
1d9fd5c
9460541
1d9fd5c
 
c68ee9c
9460541
312ce40
c68ee9c
 
 
 
 
 
312ce40
c68ee9c
 
 
 
 
312ce40
 
c68ee9c
 
 
312ce40
 
c68ee9c
312ce40
c68ee9c
 
 
 
312ce40
c68ee9c
 
 
 
9460541
c68ee9c
 
 
312ce40
c68ee9c
 
 
 
 
 
 
312ce40
c68ee9c
312ce40

import spaces
from typing import Tuple, Union, List
import os

import numpy as np
from PIL import Image

import torch
from diffusers.pipelines.controlnet import StableDiffusionControlNetInpaintPipeline
from diffusers import ControlNetModel, UniPCMultistepScheduler, AutoPipelineForText2Image
from transformers import AutoImageProcessor, UperNetForSemanticSegmentation, AutoModelForDepthEstimation
from colors import ade_palette
from utils import map_colors_rgb
from diffusers import StableDiffusionXLPipeline
import gradio as gr
import gc

# Device every model in this file is moved to (see the `.to(device)` calls
# further down) and on which the random generator is created.
device = "cuda"
# Precision passed as `torch_dtype` when loading the ControlNets, the
# diffusion pipelines and the depth model.
dtype = torch.float16


# CSS rules keyed by element id; 'img-display-input' and 'img-display-output'
# match the elem_id values used in create_demo().
# NOTE(review): this string is not passed to gr.Blocks() anywhere in this
# file, so it currently has no visible effect — confirm whether that is intended.
css = """
#img-display-container {
    max-height: 50vh;
    }
#img-display-input {
    max-height: 40vh;
    }
#img-display-output {
    max-height: 40vh;
    }
"""


def filter_items(
    colors_list: Union[List, np.ndarray],
    items_list: Union[List, np.ndarray],
    items_to_remove: Union[List, np.ndarray]
) -> Tuple[Union[List, np.ndarray], Union[List, np.ndarray]]:
    """
    Drop unwanted items, together with their paired colors.
    Colors and items are matched positionally; a pair is kept only when its
    item does not appear in ``items_to_remove``. Relative order is preserved.
    Args:
        colors_list: Colors, one per entry of ``items_list``.
        items_list: Items paired positionally with ``colors_list``.
        items_to_remove: Items whose pairs must be excluded.
    Returns:
        A ``(colors, items)`` tuple of two new lists holding the surviving
        pairs.
    """
    # Walk the inputs exactly once so one-shot iterables behave the same way
    # as lists.
    kept_pairs = [
        (color, item)
        for color, item in zip(colors_list, items_list)
        if item not in items_to_remove
    ]
    kept_colors = [color for color, _ in kept_pairs]
    kept_items = [item for _, item in kept_pairs]
    return kept_colors, kept_items

def get_segmentation_pipeline(
) -> Tuple[AutoImageProcessor, UperNetForSemanticSegmentation]:
    """Load the semantic-segmentation processor and model.
    Both objects come from the same "openmmlab/upernet-convnext-small"
    checkpoint.
    Returns:
        Tuple[AutoImageProcessor, UperNetForSemanticSegmentation]: the image
        processor followed by the segmentation model.
    """
    checkpoint = "openmmlab/upernet-convnext-small"
    processor = AutoImageProcessor.from_pretrained(checkpoint)
    segmentor = UperNetForSemanticSegmentation.from_pretrained(checkpoint)
    return processor, segmentor


@torch.inference_mode()
@spaces.GPU
def segment_image(
        image: Image,
        image_processor: AutoImageProcessor,
        image_segmentor: UperNetForSemanticSegmentation
) -> Image:
    """
    Run semantic segmentation and render the result as a color image.
    Args:
        image (Image): The picture to segment.
        image_processor (AutoImageProcessor): Prepares the model input and
            post-processes the raw model output into a label map.
        image_segmentor (UperNetForSemanticSegmentation): The segmentation
            model.
    Returns:
        Image: An RGB image of the same size as ``image`` in which every
            pixel carries the ``ade_palette()`` color of its predicted label.
    """
    model_inputs = image_processor(image, return_tensors="pt")
    with torch.no_grad():
        outputs = image_segmentor(model_inputs.pixel_values)

    # image.size is (width, height); reverse it to get (height, width).
    height_width = image.size[::-1]
    label_map = image_processor.post_process_semantic_segmentation(
        outputs, target_sizes=[height_width])[0]

    # Paint each label with its palette color; labels beyond the palette
    # length keep the initial black.
    colored = np.zeros((label_map.shape[0], label_map.shape[1], 3), dtype=np.uint8)
    for class_id, rgb in enumerate(np.array(ade_palette())):
        colored[label_map == class_id, :] = rgb
    colored = colored.astype(np.uint8)
    return Image.fromarray(colored).convert('RGB')


def get_depth_pipeline():
    """Load the depth-estimation processor and model.

    Both are created from the "LiheYoung/depth-anything-large-hf" checkpoint
    with ``torch_dtype`` set to the module-level ``dtype``.

    Returns:
        A ``(feature_extractor, depth_estimator)`` tuple.
    """
    checkpoint = "LiheYoung/depth-anything-large-hf"
    load_kwargs = {"torch_dtype": dtype}
    extractor = AutoImageProcessor.from_pretrained(checkpoint, **load_kwargs)
    estimator = AutoModelForDepthEstimation.from_pretrained(checkpoint, **load_kwargs)
    return extractor, estimator


@torch.inference_mode()
@spaces.GPU
def get_depth_image(
        image: Image,
        feature_extractor: AutoImageProcessor,
        depth_estimator: AutoModelForDepthEstimation
) -> Image:
    """
    Estimate a depth map for an image and return it as a grayscale RGB image.
    Args:
        image (Image): The picture to estimate depth for.
        feature_extractor (AutoImageProcessor): Builds the model inputs.
        depth_estimator (AutoModelForDepthEstimation): The depth model; its
            output must expose ``predicted_depth``.
    Returns:
        Image: A picture of the same size as ``image`` whose three channels
            all hold the depth, min-max normalised to the 0-255 range.
    """
    # NOTE(review): the inputs are moved to `device` without a dtype cast; if
    # the estimator was loaded in half precision, confirm it accepts them.
    model_inputs = feature_extractor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        depth_map = depth_estimator(**model_inputs).predicted_depth

    # Upsample the prediction back to the input resolution (in float32).
    width, height = image.size
    depth_map = torch.nn.functional.interpolate(
        depth_map.unsqueeze(1).float(),
        size=(height, width),
        mode="bicubic",
        align_corners=False,
    )

    # Min-max normalise. A perfectly flat depth map has zero range, which
    # would turn the division into 0/0 = NaN; clamp the range so such a map
    # becomes all zeros instead.
    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
    depth_range = (depth_max - depth_min).clamp_min(
        torch.finfo(depth_map.dtype).eps)
    depth_map = (depth_map - depth_min) / depth_range

    # Replicate the single channel three times and switch to channel-last
    # layout for PIL.
    depth_rgb = torch.cat([depth_map] * 3, dim=1)
    depth_rgb = depth_rgb.permute(0, 2, 3, 1).cpu().numpy()[0]
    return Image.fromarray((depth_rgb * 255.0).clip(0, 255).astype(np.uint8))


def resize_dimensions(dimensions, target_size):
    """
    Compute new (width, height) so the longer side equals ``target_size``.

    The aspect ratio is preserved (the shorter side is rounded down). When
    both sides are strictly smaller than ``target_size`` the input is
    returned unchanged, so small images are never upscaled.

    Args:
        dimensions: ``(width, height)`` of the original image.
        target_size: Desired length of the longer side.
    Returns:
        The original ``dimensions`` object, or a new ``(width, height)``
        tuple whose shorter side is at least 1.
    """
    width, height = dimensions

    # Never upscale: images already below the target keep their size.
    if width < target_size and height < target_size:
        return dimensions

    # Multiply before dividing so exact results are not lost to the rounding
    # error of an intermediate ratio, and clamp to 1 so extreme aspect ratios
    # never produce a zero-sized side.
    if width > height:
        scaled_height = int(target_size * height / width)
        return (target_size, max(1, scaled_height))

    scaled_width = int(target_size * width / height)
    return (max(1, scaled_width), target_size)


def flush():
    """Run the garbage collector, then empty PyTorch's CUDA cache."""
    # Collect first so memory held only by dead Python objects can be
    # released before the cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()
    
    
class ControlNetDepthDesignModelMulti:
    """Generate a redesigned room image with depth + segmentation ControlNets.

    The instance only stores the seed and prompt settings. The models it
    uses (``pipe``, ``guide_pipe``, ``seg_image_processor``,
    ``image_segmentor``, ``depth_feature_extractor``, ``depth_estimator``)
    are module-level globals that must exist before
    :meth:`generate_design` is called.
    """

    def __init__(self):
        """Set the default seed and prompt settings; no model is loaded here."""
        # Seed used to create the random generator on every call.
        self.seed = 323*111
        # Negative prompt passed to both diffusion pipelines.
        self.neg_prompt = "window, door, low resolution, banner, logo, watermark, text, deformed, blurry, out of focus, surreal, ugly, beginner"
        # Segment names excluded from the inpainting mask.
        self.control_items = ["ugly", "ugly"]
        # Text appended to every user prompt.
        self.additional_quality_suffix = "4K, high resolution, photorealistic"

    @spaces.GPU
    def generate_design(self, empty_room_image: Image, prompt: str, guidance_scale: float = 10, num_steps: int = 50, strength: float = 0.9, img_size: int = 640) -> Image:
        """
        Produce a design image for a room photo and a text prompt.

        Steps: resize the photo, segment it, build an inpainting mask from
        the segments, estimate depth, generate a reference image from the
        prompt alone, then run the ControlNet inpainting pipeline with that
        reference as the IP-adapter image.

        Args:
            empty_room_image (Image): Photo of the room to redesign.
            prompt (str): User prompt; the quality suffix is appended to it.
            guidance_scale (float): Guidance scale for the inpainting pipeline.
            num_steps (int): Inference steps, used for both pipelines.
            strength (float): Inpainting strength.
            img_size (int): Target length of the longer image side; smaller
                images are not upscaled.
        Returns:
            Image: The generated image, resized back to the input's size.
        Raises:
            ValueError: If ``empty_room_image`` is ``None``.
        """
        if empty_room_image is None:
            raise ValueError("An input image is required to generate a design.")

        print(prompt)
        flush()
        # The seed attribute may be overwritten from outside (e.g. by a UI
        # widget); coerce it to int before seeding the generator.
        self.generator = torch.Generator(device=device).manual_seed(int(self.seed))

        pos_prompt = prompt + f', {self.additional_quality_suffix}'

        orig_w, orig_h = empty_room_image.size
        new_width, new_height = resize_dimensions(empty_room_image.size, img_size)
        # Convert to RGB once, so the segmentation, depth and diffusion steps
        # all see the same three-channel picture.
        image = empty_room_image.resize((new_width, new_height)).convert("RGB")

        real_seg = np.array(segment_image(image,
                                          seg_image_processor,
                                          image_segmentor))
        # Every distinct color in the segmentation map is one segment.
        unique_colors = [
            tuple(color)
            for color in np.unique(real_seg.reshape(-1, real_seg.shape[2]), axis=0)
        ]
        # presumably map_colors_rgb returns the class name for a color —
        # verify against utils.map_colors_rgb
        segment_items = [map_colors_rgb(color) for color in unique_colors]
        chosen_colors, _ = filter_items(
            colors_list=unique_colors,
            items_list=segment_items,
            items_to_remove=self.control_items
        )

        # Mark the pixels of every kept segment; they become white (255) in
        # the mask image handed to the inpainting pipeline.
        mask = np.zeros_like(real_seg)
        for color in chosen_colors:
            color_matches = (real_seg == color).all(axis=2)
            mask[color_matches] = 1

        mask_image = Image.fromarray((mask * 255).astype(np.uint8)).convert("RGB")
        segmentation_cond_image = Image.fromarray(real_seg).convert("RGB")

        image_depth = get_depth_image(image, depth_feature_extractor, depth_estimator)

        # generate image that would be used as IP-adapter
        flush()
        # Round the size down to a multiple of 8 for the guide pipeline.
        new_width_ip = int(new_width / 8) * 8
        new_height_ip = int(new_height / 8) * 8
        ip_image = guide_pipe(pos_prompt,
                              num_inference_steps=num_steps,
                              negative_prompt=self.neg_prompt,
                              height=new_height_ip,
                              width=new_width_ip,
                              generator=[self.generator]).images[0]

        flush()
        generated_image = pipe(
            prompt=pos_prompt,
            negative_prompt=self.neg_prompt,
            num_inference_steps=num_steps,
            strength=strength,
            guidance_scale=guidance_scale,
            generator=[self.generator],
            image=image,
            mask_image=mask_image,
            ip_adapter_image=ip_image,
            control_image=[image_depth, segmentation_cond_image],
            controlnet_conditioning_scale=[0.5, 0.5]
        ).images[0]

        flush()
        # Return the result at the resolution of the original photo.
        design_image = generated_image.resize(
            (orig_w, orig_h), Image.Resampling.LANCZOS
        )

        return design_image


def create_demo(model):
    """Build the demo widgets and wire the Submit button to ``model``.

    Components are created at call time, so this is expected to run inside
    an open ``gr.Blocks`` context (as ``main`` does).

    Args:
        model: Object exposing ``seed``, ``neg_prompt``,
            ``additional_quality_suffix`` and ``generate_design``.
    """

    def on_submit(image, text, num_steps, guidance_scale, seed, strength, a_prompt, n_prompt, img_size):
        # Push the UI settings onto the model before generating.
        model.seed = seed
        model.neg_prompt = n_prompt
        model.additional_quality_suffix = a_prompt

        with torch.no_grad():
            return model.generate_design(
                image,
                text,
                guidance_scale=guidance_scale,
                num_steps=num_steps,
                strength=strength,
                img_size=img_size,
            )

    gr.Markdown("### Stable Design demo")
    with gr.Row():
        # Left column: inputs and settings.
        with gr.Column():
            image_box = gr.Image(label="Input Image", type='pil', elem_id='img-display-input')
            prompt_box = gr.Textbox(label='Prompt', value="high resolution, clay render style , grayscale", lines=2)
            with gr.Accordion('Advanced options', open=False):
                steps_slider = gr.Slider(
                    label='Steps', minimum=1, maximum=50, value=50, step=1)
                size_slider = gr.Slider(
                    label='Image size', minimum=256, maximum=768, value=768, step=64)
                guidance_slider = gr.Slider(
                    label='Guidance Scale', minimum=0.1, maximum=30.0, value=10.0, step=0.1)
                seed_slider = gr.Slider(
                    label='Seed', minimum=-1, maximum=2147483647, value=323*111, step=1,
                    randomize=True)
                strength_slider = gr.Slider(
                    label='Strength', minimum=0.1, maximum=1.0, value=0.9, step=0.1)
                suffix_box = gr.Textbox(
                    label='Added Prompt',
                    value="8K, high resolution, photorealistic")
                negative_box = gr.Textbox(
                    label='Negative Prompt',
                    value=" low resolution, banner, logo, watermark, deformed, blurry, out of focus, surreal, ugly, beginner")
            submit_button = gr.Button("Submit")

        # Right column: the generated image.
        with gr.Column():
            result_box = gr.Image(label="Output Mask", elem_id='img-display-output')

    # The order here must match the positional parameters of on_submit.
    handler_inputs = [
        image_box,
        prompt_box,
        steps_slider,
        guidance_slider,
        seed_slider,
        strength_slider,
        suffix_box,
        negative_box,
        size_slider,
    ]
    submit_button.click(on_submit, inputs=handler_inputs, outputs=result_box)
    gr.Examples(examples=[["imgs/bedroom_1.jpg"]],
                inputs=[image_box, prompt_box], cache_examples=False)


# --- Model loading (runs once, at import time) -----------------------------
# Two ControlNets: one for the depth map and one for the segmentation map.
# NOTE(review): "controlnet_depth" and "own_controlnet" are presumably local
# model directories next to this file — confirm.
controlnet_depth= ControlNetModel.from_pretrained(
    "controlnet_depth", torch_dtype=dtype, use_safetensors=True)
controlnet_seg = ControlNetModel.from_pretrained(
    "own_controlnet", torch_dtype=dtype, use_safetensors=True)

# Inpainting pipeline driven by both ControlNets. The list order
# (depth, segmentation) must match the order of `control_image` in
# ControlNetDepthDesignModelMulti.generate_design.
pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V5.1_noVAE",
    #"models/runwayml--stable-diffusion-inpainting",
    controlnet=[controlnet_depth, controlnet_seg],
    safety_checker=None,
    torch_dtype=dtype
)

# Attach the IP-Adapter weights (adapter scale 0.4), switch the scheduler to
# UniPC, then move the pipeline to the target device.
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models",
                     weight_name="ip-adapter_sd15.bin")
pipe.set_ip_adapter_scale(0.4)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(device)
# Text-to-image pipeline that generates the reference picture fed to `pipe`
# as `ip_adapter_image`.
guide_pipe = StableDiffusionXLPipeline.from_pretrained("segmind/SSD-1B",
                                                       torch_dtype=dtype, use_safetensors=True, variant="fp16")
guide_pipe = guide_pipe.to(device)

# Segmentation and depth models. Only the depth estimator is moved to
# `device` here; the segmentation model stays where from_pretrained put it.
seg_image_processor, image_segmentor = get_segmentation_pipeline()
depth_feature_extractor, depth_estimator = get_depth_pipeline()
depth_estimator = depth_estimator.to(device)


def main():
    """Create the design model, assemble the Gradio page and launch it."""
    design_model = ControlNetDepthDesignModelMulti()
    print('Models uploaded successfully')

    heading = "# StableDesign"
    intro = """
    WELCOME
    """

    with gr.Blocks() as app:
        gr.Markdown(heading)
        gr.Markdown(intro)

        # Widgets are added to the currently open Blocks context.
        create_demo(design_model)

    # Enable the request queue, then start the server without a share link.
    app.queue().launch(share=False)


# Launch the demo only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()