import gradio as gr
import torch
from transformers import AutoConfig
from transformers import AutoTokenizer, AutoModel
from PIL import Image
import numpy as np
import spaces # Import spaces for ZeroGPU compatibility
from einops import rearrange
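# ChatML-style prompt template (the <|im_start|>/<|im_end|> chat format).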
PROMPT_TEMPLATE = dict(
SYSTEM='<|im_start|>system\n{system}<|im_end|>\n',
INSTRUCTION='<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n',
SUFFIX='<|im_end|>',
SUFFIX_AS_EOS=True,
SEP='\n',
STOP_WORDS=['<|im_end|>', '<|endoftext|>'])
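# Wrapper that turns a plain text prompt into an image-generation instruction.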
GENERATION_TEMPLATE = "Generate an image: {text}"
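# Load Harmon's config, tokenizer, and weights; trust_remote_code is needed
# because the repository ships custom modeling code.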
model_path = "wusize/Harmon-1_5B"
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
llm_config = config.llm
llm_config['_attn_implementation'] = 'eager'
harmon_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
harmon_model = AutoModel.from_pretrained(model_path, llm=llm_config,
trust_remote_code=True).eval()
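# Register <image> as a special token; its positions in the prompt mark where
# visual features are spliced into the embedding sequence.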
special_tokens_dict = {'additional_special_tokens': ["<image>", ]}
num_added_toks = harmon_tokenizer.add_special_tokens(special_tokens_dict)
assert num_added_toks == 1
image_token_idx = harmon_tokenizer.encode("<image>", add_special_tokens=False)[-1]
print(f"Image token: {harmon_tokenizer.decode(image_token_idx)}", flush=True)
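# Prefer bfloat16 on GPU for speed and memory; fall back to float32 on CPU.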
if torch.cuda.is_available():
harmon_model = harmon_model.to(torch.bfloat16).cuda()
else:
harmon_model = harmon_model.to(torch.float32)
def expand2square(pil_img, background_color):
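    """Pad a PIL image with `background_color` to a square of its longer side."""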
width, height = pil_img.size
if width == height:
return pil_img
elif width > height:
result = Image.new(pil_img.mode, (width, width), background_color)
result.paste(pil_img, (0, (width - height) // 2))
return result
else:
result = Image.new(pil_img.mode, (height, height), background_color)
result.paste(pil_img, ((height - width) // 2, 0))
return result
# Multimodal understanding: answer a text question about an input image.
@torch.inference_mode()
@spaces.GPU(duration=120)
def multimodal_understanding(image, question, seed, top_p, temperature, progress=gr.Progress(track_tqdm=True)):
# Clear CUDA cache before generating
torch.cuda.empty_cache()
    # Seeding is skipped here: decoding below is greedy (do_sample=False), so
    # the seed, top_p, and temperature controls currently have no effect.
    print(f"CUDA available: {torch.cuda.is_available()}", flush=True)
    max_new_tokens = 512
    image_size = 512
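    # Preprocess: pad to a square, resize to 512x512, move channels first, and
    # rescale pixel values from [0, 255] to [-1, 1].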
image = Image.fromarray(image).convert('RGB')
    image = expand2square(image, (127, 127, 127))
    image = image.resize(size=(image_size, image_size))
image = torch.from_numpy(np.array(image)).to(dtype=harmon_model.dtype, device=harmon_model.device)
image = rearrange(image, 'h w c -> c h w')[None]
image = 2 * (image / 255) - 1
prompt = PROMPT_TEMPLATE['INSTRUCTION'].format(input="<image>\n" + question)
assert '<image>' in prompt
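    # Each image occupies (image_size // 16)^2 = 1024 patch tokens plus the MAR
    # buffer tokens, so expand the single <image> placeholder to that length.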
image_length = (image_size // 16) ** 2 + harmon_model.mar.buffer_size
prompt = prompt.replace('<image>', '<image>' * image_length)
input_ids = harmon_tokenizer.encode(
prompt, add_special_tokens=True, return_tensors='pt').to(harmon_model.device)
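    # Encode the image and splice the visual features into the embedding
    # sequence at the <image> token positions; all other positions receive
    # ordinary text embeddings.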
_, z_enc = harmon_model.extract_visual_feature(harmon_model.encode(image))
inputs_embeds = z_enc.new_zeros(*input_ids.shape, harmon_model.llm.config.hidden_size)
inputs_embeds[input_ids == image_token_idx] = z_enc.flatten(0, 1)
inputs_embeds[input_ids != image_token_idx] = harmon_model.llm.get_input_embeddings()(
input_ids[input_ids != image_token_idx]
)
    output = harmon_model.llm.generate(
        inputs_embeds=inputs_embeds,
        eos_token_id=harmon_tokenizer.eos_token_id,
        pad_token_id=(harmon_tokenizer.pad_token_id
                      if harmon_tokenizer.pad_token_id is not None
                      else harmon_tokenizer.eos_token_id),
        max_new_tokens=max_new_tokens,
        do_sample=False,  # greedy decoding; sampling parameters are unused
        use_cache=True,
    )
return harmon_tokenizer.decode(output[0], skip_special_tokens=True)
@torch.inference_mode()
@spaces.GPU(duration=120) # Specify a duration to avoid timeout
def generate_image(prompt,
seed=42,
guidance=3,
t2i_temperature=1.0,
progress=gr.Progress(track_tqdm=True)):
# Clear CUDA cache and avoid tracking gradients
torch.cuda.empty_cache()
    # Seed all RNGs so sampling is reproducible for a given seed value
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    print(f"CUDA available: {torch.cuda.is_available()}", flush=True)
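    # Unconditional prompt used as the negative branch for classifier-free guidance.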
negative_prompt = 'Generate an image.'
prompt = GENERATION_TEMPLATE.format(text=prompt)
repeat = 4
num_steps = 64
    image_size = 512
    m = n = image_size // 16  # 32x32 latent token grid (16x downsampling)
prompts = [PROMPT_TEMPLATE['INSTRUCTION'].format(input=prompt)] * repeat
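    # Classifier-free guidance: append one unconditional prompt per conditional
    # prompt so both branches run in a single batch.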
if guidance != 1.0:
prompts += [PROMPT_TEMPLATE['INSTRUCTION'].format(input=negative_prompt)] * len(prompts)
inputs = harmon_tokenizer(
prompts, add_special_tokens=True, return_tensors='pt', padding=True).to(harmon_model.device)
    # @torch.inference_mode() above already disables gradient tracking.
    images = harmon_model.sample(**inputs, num_iter=num_steps, cfg=guidance,
                                 cfg_schedule="constant", temperature=t2i_temperature,
                                 progress=True, image_shape=(m, n))
images = rearrange(images, 'b c h w -> b h w c')
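    # Map sampler output from roughly [-1, 1] to [0, 255] and cast to uint8.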
images = torch.clamp(
127.5 * images + 128.0, 0, 255).to("cpu", dtype=torch.uint8).numpy()
    ret_images = [Image.fromarray(image) for image in images]
return ret_images
# Gradio interface
css = '''
.gradio-container {max-width: 960px !important}
'''
with gr.Blocks(css=css) as demo:
gr.Markdown("# Harmon 1.5B")
with gr.Tab("Text-to-Image Generation"):
gr.Markdown(value="## Text-to-Image Generation")
        prompt_input = gr.Textbox(label="Prompt")
generation_button = gr.Button("Generate Images")
image_output = gr.Gallery(label="Generated Images", columns=4, rows=1)
with gr.Accordion("Advanced options", open=False):
with gr.Row():
cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=5, step=0.5, label="CFG Weight")
t2i_temperature = gr.Slider(minimum=0, maximum=1, value=1.0, step=0.05, label="temperature")
seed_input = gr.Number(label="Seed (Optional)", precision=0, value=1234)
examples_t2i = gr.Examples(
label="Text to image generation examples.",
examples=[
"a dog on the left and a cat on the right.",
"a photo of a pink stop sign.",
"Paper artwork, layered paper, colorful Chinese dragon surrounded by clouds.",
"a golden retriever lying peacefully on a wooden porch, with autumn leaves scattered around.",
],
inputs=prompt_input,
)
with gr.Tab("Multimodal Understanding"):
gr.Markdown(value="## Multimodal Understanding")
image_input = gr.Image()
with gr.Column():
question_input = gr.Textbox(label="Question")
understanding_button = gr.Button("Chat")
understanding_output = gr.Textbox(label="Response")
with gr.Accordion("Advanced options", open=False):
und_seed_input = gr.Number(label="Seed", precision=0, value=42)
top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")
        examples_understanding = gr.Examples(
label="Multimodal Understanding examples",
examples=[
[
"Is the picture taken in winter?",
"view.jpg",
],
[
"Briefly describe the image.",
"view.jpg",
],
],
inputs=[question_input, image_input],
)
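    # Wire the buttons to the inference functions.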
generation_button.click(
fn=generate_image,
inputs=[prompt_input, seed_input, cfg_weight_input, t2i_temperature],
outputs=image_output
)
understanding_button.click(
multimodal_understanding,
inputs=[image_input, question_input, und_seed_input, top_p, temperature],
outputs=understanding_output
)
demo.launch(share=True)