# Hugging Face Space: wusize/Harmon (running on ZeroGPU).
# File size: 9,363 bytes.

import gradio as gr
import torch
from transformers import AutoConfig
from transformers import AutoTokenizer, AutoModel
from PIL import Image

import numpy as np
import spaces  # Import spaces for ZeroGPU compatibility
from einops import rearrange


# Chat markup pieces built from <|im_start|> / <|im_end|> tags. In the code
# visible here only the 'INSTRUCTION' entry is read; it wraps a user message
# and opens the assistant turn.
PROMPT_TEMPLATE = {
    'SYSTEM': '<|im_start|>system\n{system}<|im_end|>\n',
    'INSTRUCTION': '<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n',
    'SUFFIX': '<|im_end|>',
    'SUFFIX_AS_EOS': True,
    'SEP': '\n',
    'STOP_WORDS': ['<|im_end|>', '<|endoftext|>'],
}

# User-message wording for text-to-image requests; {text} is the raw prompt.
GENERATION_TEMPLATE = "Generate an image: {text}"


# Hub repository the config, tokenizer and weights are loaded from.
model_path = "wusize/Harmon-1_5B"

# Load the remote-code config and request the 'eager' attention implementation
# for the language-model sub-config before the model is instantiated.
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
llm_config = config.llm
llm_config['_attn_implementation'] = 'eager'

harmon_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
harmon_model = AutoModel.from_pretrained(
    model_path,
    llm=llm_config,
    trust_remote_code=True,
)
harmon_model = harmon_model.eval()

# Register "<image>" as a special token; exactly one new token is expected.
special_tokens_dict = {'additional_special_tokens': ["<image>"]}
num_added_toks = harmon_tokenizer.add_special_tokens(special_tokens_dict)
assert num_added_toks == 1

# Id of the "<image>" placeholder, used later to locate image positions.
image_token_idx = harmon_tokenizer.encode("<image>", add_special_tokens=False)[-1]
print(f"Image token: {harmon_tokenizer.decode(image_token_idx)}", flush=True)

# bfloat16 on GPU when CUDA is available, float32 on CPU otherwise.
harmon_model = (
    harmon_model.to(torch.bfloat16).cuda()
    if torch.cuda.is_available()
    else harmon_model.to(torch.float32)
)


def expand2square(pil_img, background_color):
    """Pad an image to a square canvas, keeping it centred.

    Args:
        pil_img: Image object exposing ``size``, ``mode`` and usable with
            ``Image.new(...).paste``.
        background_color: Fill colour for the padded area.

    Returns:
        ``pil_img`` itself when it is already square; otherwise a new image
        whose side equals the longer edge, with the original pasted in the
        middle along the shorter axis.
    """
    width, height = pil_img.size
    if width == height:
        return pil_img

    side = max(width, height)
    # One of the two offsets is always zero: only the shorter axis is padded.
    offset = ((side - width) // 2, (side - height) // 2)
    canvas = Image.new(pil_img.mode, (side, side), background_color)
    canvas.paste(pil_img, offset)
    return canvas


@torch.inference_mode()
@spaces.GPU(duration=120)
def multimodal_understanding(image, question, seed, top_p, temperature, progress=gr.Progress(track_tqdm=True)):
    """Answer a question about an image with the Harmon model.

    Args:
        image: Pixel array accepted by ``Image.fromarray`` (presumably what the
            ``gr.Image`` input supplies), or ``None`` when nothing was uploaded.
        question: User question; placed after the image placeholder in the
            prompt. ``None`` is treated as an empty string.
        seed: Accepted for UI compatibility; currently unused.
        top_p: Accepted for UI compatibility; currently unused.
        temperature: Accepted for UI compatibility; currently unused.
        progress: Gradio progress tracker (not referenced in the body).

    Returns:
        The decoded model output with special tokens stripped.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Show a readable message in the UI instead of the opaque failure
        # Image.fromarray would produce on None.
        raise gr.Error("Please upload an image before asking a question.")
    question = question or ""

    # Clear CUDA cache before generating
    torch.cuda.empty_cache()
    print(torch.cuda.is_available())

    # NOTE: decoding is greedy (do_sample=False), so seed, top_p and
    # temperature have no effect here.
    max_new_tokens = 512
    image_size = 512

    # Same explicit no-grad scope that generate_image uses.
    # NOTE(review): kept as a safeguard in case the inference_mode decorator's
    # scope does not extend to where spaces.GPU executes the body - confirm.
    with torch.no_grad():
        # Square-pad with mid-grey, resize, and map pixel values to [-1, 1].
        pil_image = Image.fromarray(image).convert('RGB')
        pil_image = expand2square(pil_image, (127, 127, 127))
        pil_image = pil_image.resize(size=(image_size, image_size))
        pixels = torch.from_numpy(np.array(pil_image)).to(
            dtype=harmon_model.dtype, device=harmon_model.device)
        pixels = rearrange(pixels, 'h w c -> c h w')[None]
        pixels = 2 * (pixels / 255) - 1

        # Repeat the "<image>" placeholder once per visual embedding:
        # (image_size // 16) ** 2 positions plus the MAR buffer size.
        prompt = PROMPT_TEMPLATE['INSTRUCTION'].format(input="<image>\n" + question)
        image_length = (image_size // 16) ** 2 + harmon_model.mar.buffer_size
        prompt = prompt.replace('<image>', '<image>' * image_length)
        input_ids = harmon_tokenizer.encode(
            prompt, add_special_tokens=True, return_tensors='pt').to(harmon_model.device)

        _, z_enc = harmon_model.extract_visual_feature(harmon_model.encode(pixels))

        # Build the input embeddings: visual features at placeholder
        # positions, regular token embeddings everywhere else.
        is_image = input_ids == image_token_idx
        inputs_embeds = z_enc.new_zeros(*input_ids.shape, harmon_model.llm.config.hidden_size)
        inputs_embeds[is_image] = z_enc.flatten(0, 1)
        inputs_embeds[~is_image] = harmon_model.llm.get_input_embeddings()(input_ids[~is_image])

        # Fall back to EOS for padding when the tokenizer defines no pad token.
        pad_token_id = harmon_tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = harmon_tokenizer.eos_token_id

        output = harmon_model.llm.generate(inputs_embeds=inputs_embeds,
                                           eos_token_id=harmon_tokenizer.eos_token_id,
                                           pad_token_id=pad_token_id,
                                           max_new_tokens=max_new_tokens,
                                           do_sample=False,
                                           use_cache=True)

    return harmon_tokenizer.decode(output[0], skip_special_tokens=True)


@torch.inference_mode()
@spaces.GPU(duration=120)  # Specify a duration to avoid timeout
def generate_image(prompt,
                   seed=42,
                   guidance=3,
                   t2i_temperature=1.0,
                   progress=gr.Progress(track_tqdm=True)):
    """Generate four 512x512 images for a text prompt.

    Args:
        prompt: Text description; wrapped with ``GENERATION_TEMPLATE``.
        seed: Integer seed for the torch and NumPy RNGs, or ``None`` to leave
            the RNGs unseeded (non-reproducible output).
        guidance: Classifier-free guidance weight passed to the sampler as
            ``cfg``. Any value other than 1.0 adds a batch of negative prompts.
        t2i_temperature: Sampling temperature passed to the sampler.
        progress: Gradio progress tracker (not referenced in the body).

    Returns:
        A list of ``PIL.Image`` objects, one per generated sample.
    """
    # Clear CUDA cache and avoid tracking gradients
    torch.cuda.empty_cache()

    # The UI labels the seed as optional, and a cleared number field may
    # arrive as None; only seed when a value is actually given.
    if seed is not None:
        seed = int(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        # NumPy only accepts seeds in [0, 2**32); wrap so that negative or
        # very large values do not raise.
        np.random.seed(seed % (2 ** 32))
    print(torch.cuda.is_available())

    negative_prompt = 'Generate an image.'
    prompt = GENERATION_TEMPLATE.format(text=prompt)
    repeat = 4
    num_steps = 64
    image_size = 512

    # Grid size handed to the sampler as image_shape: image_size / 16 per side.
    m = n = image_size // 16

    prompts = [PROMPT_TEMPLATE['INSTRUCTION'].format(input=prompt)] * repeat

    if guidance != 1.0:
        # Append one negative prompt per positive prompt for guidance.
        prompts += [PROMPT_TEMPLATE['INSTRUCTION'].format(input=negative_prompt)] * len(prompts)

    inputs = harmon_tokenizer(
        prompts, add_special_tokens=True, return_tensors='pt', padding=True).to(harmon_model.device)

    with torch.no_grad():
        images = harmon_model.sample(**inputs, num_iter=num_steps, cfg=guidance, cfg_schedule="constant",
                                     temperature=t2i_temperature, progress=True, image_shape=(m, n))

        images = rearrange(images, 'b c h w -> b h w c')

        # Map sampler output (presumably in [-1, 1]) to uint8 pixel values.
        images = torch.clamp(
            127.5 * images + 128.0, 0, 255).to("cpu", dtype=torch.uint8).numpy()

        return [Image.fromarray(image) for image in images]



# Gradio interface
# Two tabs (text-to-image and image question answering) whose buttons are
# wired to generate_image and multimodal_understanding below.
css = '''
.gradio-container {max-width: 960px !important}
'''
with gr.Blocks(css=css) as demo:
    gr.Markdown("# Harmon 1.5B")

    # --- Tab 1: text-to-image -------------------------------------------
    with gr.Tab("Text-to-Image Generation"):
        gr.Markdown(value="## Text-to-Image Generation")

        prompt_input = gr.Textbox(label="Prompt.")
    
        generation_button = gr.Button("Generate Images")
    
        # One row of four columns, matching the four images generate_image returns.
        image_output = gr.Gallery(label="Generated Images", columns=4, rows=1)

        with gr.Accordion("Advanced options", open=False):
            with gr.Row():
                # NOTE: the slider default (5) differs from generate_image's
                # own default guidance (3); the UI value is what gets passed.
                cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=5, step=0.5, label="CFG Weight")
                t2i_temperature = gr.Slider(minimum=0, maximum=1, value=1.0, step=0.05, label="temperature")
            seed_input = gr.Number(label="Seed (Optional)", precision=0, value=1234)
    
        # Clicking an example fills the prompt textbox only.
        examples_t2i = gr.Examples(
            label="Text to image generation examples.",
            examples=[
                "a dog on the left and a cat on the right.",
                "a photo of a pink stop sign.",
                "Paper artwork, layered paper, colorful Chinese dragon surrounded by clouds.",
                "a golden retriever lying peacefully on a wooden porch, with autumn leaves scattered around.",
            ],
            inputs=prompt_input,
        )

    # --- Tab 2: image question answering --------------------------------
    with gr.Tab("Multimodal Understanding"):
        gr.Markdown(value="## Multimodal Understanding")
        image_input = gr.Image()
        with gr.Column():
            question_input = gr.Textbox(label="Question")

        understanding_button = gr.Button("Chat")
        understanding_output = gr.Textbox(label="Response")

        with gr.Accordion("Advanced options", open=False):
            # These three values are forwarded to multimodal_understanding,
            # which currently decodes with do_sample=False.
            und_seed_input = gr.Number(label="Seed", precision=0, value=42)
            top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
            temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")

        # Each example row is [question, image path], in the order of `inputs`.
        # "view.jpg" is presumably a file shipped next to this script - verify.
        examples_inpainting = gr.Examples(
            label="Multimodal Understanding examples",
            examples=[
                [
                    "Is the picture taken in winter?",
                    "view.jpg",
                ],
                [
                    "Briefly describe the image.",
                    "view.jpg",
                ],
            ],
            inputs=[question_input, image_input],
        )

    # Input order must match generate_image(prompt, seed, guidance, t2i_temperature).
    generation_button.click(
        fn=generate_image,
        inputs=[prompt_input, seed_input, cfg_weight_input, t2i_temperature],
        outputs=image_output
    )

    # Input order must match
    # multimodal_understanding(image, question, seed, top_p, temperature).
    understanding_button.click(
        multimodal_understanding,
        inputs=[image_input, question_input, und_seed_input, top_p, temperature],
        outputs=understanding_output
    )

# Start the app at import time, requesting a public share link.
demo.launch(share=True)

# if __name__ == "__main__":
#     image = Image.open('view.jpg')
#     image = np.array(image)
#     print(image.shape)
#     # text = multimodal_understanding(image, question='Is the picture taken in winter?', seed=42, top_p=None, temperature=None)

#     # print(text)

#     res = generate_image('Paper artwork, layered paper, colorful Chinese dragon surrounded by clouds.')

#     for idx, img in enumerate(res):
#         img.save(f"{idx}.jpg")