Spaces:

jbilcke-hf
/

image-upscaling-api

Paused

File size: 8,983 Bytes

# import spaces
import os
import datetime
import einops
import gradio as gr
import numpy as np
import torch
import random
from PIL import Image
from pathlib import Path
from torchvision import transforms
import torch.nn.functional as F
from torchvision.models import resnet50, ResNet50_Weights

from pytorch_lightning import seed_everything
from transformers import CLIPTextModel, CLIPTokenizer, CLIPImageProcessor
from diffusers import AutoencoderKL, DDIMScheduler, PNDMScheduler, DPMSolverMultistepScheduler, UniPCMultistepScheduler

from pipelines.pipeline_pasd import StableDiffusionControlNetPipeline
from myutils.misc import load_dreambooth_lora, rand_name
from myutils.wavelet_color_fix import wavelet_color_fix
from annotator.retinaface import RetinaFaceDetection

from io import BytesIO
import base64
import re

SECRET_TOKEN = os.getenv('SECRET_TOKEN', 'default_secret')

# Regex pattern to match data URI scheme
data_uri_pattern = re.compile(r'data:image/(png|jpeg|jpg|webp);base64,')

def readb64(b64):
    # Remove any data URI scheme prefix with regex
    b64 = data_uri_pattern.sub("", b64)
    # Decode and open the image with PIL
    img = Image.open(BytesIO(base64.b64decode(b64)))
    return img
    
# convert from PIL to base64
def writeb64(image):
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    b64image = base64.b64encode(buffered.getvalue())
    b64image_str = b64image.decode("utf-8")
    return b64image_str

use_pasd_light = False
face_detector = RetinaFaceDetection()

if use_pasd_light:
    from models.pasd_light.unet_2d_condition import UNet2DConditionModel
    from models.pasd_light.controlnet import ControlNetModel
else:
    from models.pasd.unet_2d_condition import UNet2DConditionModel
    from models.pasd.controlnet import ControlNetModel

pretrained_model_path = "checkpoints/stable-diffusion-v1-5"
ckpt_path = "runs/pasd/checkpoint-100000"
#dreambooth_lora_path = "checkpoints/personalized_models/toonyou_beta3.safetensors"
dreambooth_lora_path = "checkpoints/personalized_models/majicmixRealistic_v6.safetensors"
#dreambooth_lora_path = "checkpoints/personalized_models/Realistic_Vision_V5.1.safetensors"
weight_dtype = torch.float16
device = "cuda"

scheduler = UniPCMultistepScheduler.from_pretrained(pretrained_model_path, subfolder="scheduler")
text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder")
tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae")
feature_extractor = CLIPImageProcessor.from_pretrained(f"{pretrained_model_path}/feature_extractor")
unet = UNet2DConditionModel.from_pretrained(ckpt_path, subfolder="unet")
controlnet = ControlNetModel.from_pretrained(ckpt_path, subfolder="controlnet")
vae.requires_grad_(False)
text_encoder.requires_grad_(False)
unet.requires_grad_(False)
controlnet.requires_grad_(False)

unet, vae, text_encoder = load_dreambooth_lora(unet, vae, text_encoder, dreambooth_lora_path)

text_encoder.to(device, dtype=weight_dtype)
vae.to(device, dtype=weight_dtype)
unet.to(device, dtype=weight_dtype)
controlnet.to(device, dtype=weight_dtype)

validation_pipeline = StableDiffusionControlNetPipeline(
        vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, feature_extractor=feature_extractor, 
        unet=unet, controlnet=controlnet, scheduler=scheduler, safety_checker=None, requires_safety_checker=False,
    )
#validation_pipeline.enable_vae_tiling()
validation_pipeline._init_tiled_vae(decoder_tile_size=224)

weights = ResNet50_Weights.DEFAULT
preprocess = weights.transforms()
resnet = resnet50(weights=weights)
resnet.eval()

def resize_image(img, target_height):
    # Calculate the ratio to resize the image to the target height
    ratio = target_height / float(img.size[1])
    # Calculate the new width based on the aspect ratio
    new_width = int(float(img.size[0]) * ratio)
    # Resize the image
    resized_img = img.resize((new_width, target_height), Image.LANCZOS)
    # Save the resized image
    #resized_img.save(output_path)
    return resized_img

# @spaces.GPU(enable_queue=True)
def inference(secret_token, input_image_b64, prompt, a_prompt, n_prompt, denoise_steps, upscale, alpha, cfg, seed):
    if secret_token != SECRET_TOKEN:
        raise gr.Error(
            f'Invalid secret token. Please fork the original space if you want to use it for yourself.')

    input_image = readb64(input_image_b64)
    
    input_image = resize_image(input_image, 512)
    
    process_size = 768
    
    resize_preproc = transforms.Compose([
        transforms.Resize(process_size, interpolation=transforms.InterpolationMode.BILINEAR),
    ])
    
    # Get the current timestamp
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

    with torch.no_grad():
        seed_everything(seed)
        generator = torch.Generator(device=device)

        input_image = input_image.convert('RGB')
        batch = preprocess(input_image).unsqueeze(0)
        prediction = resnet(batch).squeeze(0).softmax(0)
        class_id = prediction.argmax().item()
        score = prediction[class_id].item()
        category_name = weights.meta["categories"][class_id]
        if score >= 0.1:
            prompt += f"{category_name}" if prompt=='' else f", {category_name}"

        prompt = a_prompt if prompt=='' else f"{prompt}, {a_prompt}"

        ori_width, ori_height = input_image.size
        resize_flag = False

        rscale = upscale
        input_image = input_image.resize((input_image.size[0]*rscale, input_image.size[1]*rscale))
        
        #if min(validation_image.size) < process_size:
        #    validation_image = resize_preproc(validation_image)

        input_image = input_image.resize((input_image.size[0]//8*8, input_image.size[1]//8*8))
        width, height = input_image.size
        resize_flag = True #

        try:
            image = validation_pipeline(
                    None, prompt, input_image, num_inference_steps=denoise_steps, generator=generator, height=height, width=width, guidance_scale=cfg, 
                    negative_prompt=n_prompt, conditioning_scale=alpha, eta=0.0,
                ).images[0]
            
            if True: #alpha<1.0:
                image = wavelet_color_fix(image, input_image)
        
            if resize_flag: 
                image = image.resize((ori_width*rscale, ori_height*rscale))
        except Exception as e:
            print(e)
            image = Image.new(mode="RGB", size=(512, 512))
    
    return writeb64(image)


with gr.Blocks() as demo:
    with gr.Column():
        with gr.Row():
            with gr.Column():
                secret_token = gr.Textbox()
                input_image_b64 = gr.Textbox()
                prompt_in = gr.Textbox(label="Prompt", value="Frog")
                with gr.Accordion(label="Advanced settings", open=False):
                    added_prompt = gr.Textbox(label="Added Prompt", value='clean, high-resolution, 8k, best quality, masterpiece')
                    neg_prompt = gr.Textbox(label="Negative Prompt",value='dotted, noise, blur, lowres, oversmooth, longbody, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality')
                    denoise_steps = gr.Slider(label="Denoise Steps", minimum=10, maximum=50, value=20, step=1)
                    upsample_scale = gr.Slider(label="Upsample Scale", minimum=1, maximum=4, value=2, step=1)
                    condition_scale = gr.Slider(label="Conditioning Scale", minimum=0.5, maximum=1.5, value=1.1, step=0.1)
                    classifier_free_guidance = gr.Slider(label="Classier-free Guidance", minimum=0.1, maximum=10.0, value=7.5, step=0.1)
                    seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
                submit_btn = gr.Button("Submit")
            with gr.Column():
                output_image_b64 = gr.Textbox()
    gr.HTML("""
        <div style="z-index: 100; position: fixed; top: 0px; right: 0px; left: 0px; bottom: 0px; width: 100%; height: 100%; background: white; display: flex; align-items: center; justify-content: center; color: black;">
        <div style="text-align: center; color: black;">
        <p style="color: black;">This space is a REST API to programmatically upscale an image.</p>
        <p style="color: black;">Interested in using it? Please use the <a href="https://huggingface.co/spaces/fffiloni/PASD" target="_blank">original space</a>, thank you!</p>
        </div>
        </div>""")
    
    submit_btn.click(
        fn = inference,
        inputs = [
            secret_token,
            input_image_b64, prompt_in,
            added_prompt, neg_prompt,
            denoise_steps,
            upsample_scale, condition_scale,
            classifier_free_guidance, seed
        ],
        outputs = output_image_b64
    )
demo.queue().launch()