import gradio as gr
import torch
from transformers import AutoConfig
from transformers import AutoTokenizer, AutoModel
from PIL import Image
import numpy as np
import spaces  # Import spaces for ZeroGPU compatibility
from einops import rearrange

PROMPT_TEMPLATE = dict(
    SYSTEM='<|im_start|>system\n{system}<|im_end|>\n',
    INSTRUCTION='<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n',
    SUFFIX='<|im_end|>',
    SUFFIX_AS_EOS=True,
    SEP='\n',
    STOP_WORDS=['<|im_end|>', '<|endoftext|>'])

GENERATION_TEMPLATE = "Generate an image: {text}"

model_path = "wusize/Harmon-1_5B"
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
llm_config = config.llm
llm_config['_attn_implementation'] = 'eager'
harmon_tokenizer = AutoTokenizer.from_pretrained(model_path,
                                                 trust_remote_code=True)
harmon_model = AutoModel.from_pretrained(model_path,
                                         llm=llm_config,
                                         trust_remote_code=True).eval()

# Register the <image> placeholder as a special token
special_tokens_dict = {'additional_special_tokens': ["<image>", ]}
num_added_toks = harmon_tokenizer.add_special_tokens(special_tokens_dict)
assert num_added_toks == 1
image_token_idx = harmon_tokenizer.encode("<image>", add_special_tokens=False)[-1]
print(f"Image token: {harmon_tokenizer.decode(image_token_idx)}", flush=True)

if torch.cuda.is_available():
    harmon_model = harmon_model.to(torch.bfloat16).cuda()
else:
    harmon_model = harmon_model.to(torch.float32)


def expand2square(pil_img, background_color):
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width - height) // 2))
        return result
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, ((height - width) // 2, 0))
        return result


@torch.inference_mode()
@spaces.GPU(duration=120)
# Multimodal Understanding function
def multimodal_understanding(image, question, seed, top_p, temperature,
                             progress=gr.Progress(track_tqdm=True)):
    # Clear CUDA cache before generating
    torch.cuda.empty_cache()
    # set seed
    # torch.manual_seed(seed)
    # np.random.seed(seed)
    # torch.cuda.manual_seed(seed)
    print(torch.cuda.is_available())

    max_new_tokens = 512
    image_size = 512
    assert image_size == 512

    # Pad to square with a neutral gray background, resize to the model's
    # input size, and normalize pixel values from [0, 255] to [-1, 1]
    image = Image.fromarray(image).convert('RGB')
    image = expand2square(image, (127, 127, 127))
    image = image.resize(size=(image_size, image_size))
    image = torch.from_numpy(np.array(image)).to(dtype=harmon_model.dtype,
                                                 device=harmon_model.device)
    image = rearrange(image, 'h w c -> c h w')[None]
    image = 2 * (image / 255) - 1

    # Expand the single <image> placeholder to one token per visual embedding slot
    prompt = PROMPT_TEMPLATE['INSTRUCTION'].format(input="<image>\n" + question)
    assert '<image>' in prompt
    image_length = (image_size // 16) ** 2 + harmon_model.mar.buffer_size
    prompt = prompt.replace('<image>', '<image>' * image_length)
    input_ids = harmon_tokenizer.encode(
        prompt, add_special_tokens=True,
        return_tensors='pt').to(harmon_model.device)

    # Splice visual features into the text embedding sequence at <image> positions
    _, z_enc = harmon_model.extract_visual_feature(harmon_model.encode(image))
    inputs_embeds = z_enc.new_zeros(*input_ids.shape, harmon_model.llm.config.hidden_size)
    inputs_embeds[input_ids == image_token_idx] = z_enc.flatten(0, 1)
    inputs_embeds[input_ids != image_token_idx] = harmon_model.llm.get_input_embeddings()(
        input_ids[input_ids != image_token_idx]
    )

    output = harmon_model.llm.generate(inputs_embeds=inputs_embeds,
                                       eos_token_id=harmon_tokenizer.eos_token_id,
                                       pad_token_id=harmon_tokenizer.pad_token_id
                                       if harmon_tokenizer.pad_token_id is not None
                                       else harmon_tokenizer.eos_token_id,
                                       max_new_tokens=max_new_tokens,
                                       do_sample=False,  # if temperature == 0 else True,
                                       use_cache=True,
                                       # temperature=temperature,
                                       # top_p=top_p
                                       )

    return harmon_tokenizer.decode(output[0], skip_special_tokens=True)


@torch.inference_mode()
@spaces.GPU(duration=120)  # Specify a duration to avoid timeout
def generate_image(prompt,
                   seed=42,
                   guidance=3,
                   t2i_temperature=1.0,
                   progress=gr.Progress(track_tqdm=True)):
    # Clear CUDA cache and avoid tracking gradients
    torch.cuda.empty_cache()
    # Set the seed for reproducible results
    # if seed is not None:
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    print(torch.cuda.is_available())

    negative_prompt = 'Generate an image.'
    prompt = GENERATION_TEMPLATE.format(text=prompt)

    repeat = 4
    num_steps = 64
    image_size = 512
    assert image_size == 512
    m = n = image_size // 16

    prompts = [PROMPT_TEMPLATE['INSTRUCTION'].format(input=prompt)] * repeat
    if guidance != 1.0:
        # Append unconditional prompts for classifier-free guidance
        prompts += [PROMPT_TEMPLATE['INSTRUCTION'].format(input=negative_prompt)] * len(prompts)
    inputs = harmon_tokenizer(
        prompts, add_special_tokens=True,
        return_tensors='pt', padding=True).to(harmon_model.device)

    # import pdb; pdb.set_trace()
    with torch.no_grad():
        images = harmon_model.sample(**inputs,
                                     num_iter=num_steps,
                                     cfg=guidance,
                                     cfg_schedule="constant",
                                     temperature=t2i_temperature,
                                     progress=True,
                                     image_shape=(m, n))
    images = rearrange(images, 'b c h w -> b h w c')
    # Map sampled images from [-1, 1] back to uint8 [0, 255]
    images = torch.clamp(
        127.5 * images + 128.0, 0, 255).to("cpu", dtype=torch.uint8).numpy()

    # ret_images = [image_upsample(Image.fromarray(image)) for image in images]
    ret_images = [Image.fromarray(image) for image in images]
    return ret_images


# Gradio interface
css = '''
.gradio-container {max-width: 960px !important}
'''
with gr.Blocks(css=css) as demo:
    gr.Markdown("# Harmon 1.5B")

    with gr.Tab("Text-to-Image Generation"):
        gr.Markdown(value="## Text-to-Image Generation")

        prompt_input = gr.Textbox(label="Prompt.")
        generation_button = gr.Button("Generate Images")
        image_output = gr.Gallery(label="Generated Images", columns=4, rows=1)

        with gr.Accordion("Advanced options", open=False):
            with gr.Row():
                cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=5, step=0.5, label="CFG Weight")
                t2i_temperature = gr.Slider(minimum=0, maximum=1, value=1.0, step=0.05, label="temperature")
            seed_input = gr.Number(label="Seed (Optional)", precision=0, value=1234)

        examples_t2i = gr.Examples(
            label="Text to image generation examples.",
            examples=[
                "a dog on the left and a cat on the right.",
                "a photo of a pink stop sign.",
                "Paper artwork, layered paper, colorful Chinese dragon surrounded by clouds.",
                "a golden retriever lying peacefully on a wooden porch, with autumn leaves scattered around.",
            ],
            inputs=prompt_input,
        )

    with gr.Tab("Multimodal Understanding"):
        gr.Markdown(value="## Multimodal Understanding")

        image_input = gr.Image()
        with gr.Column():
            question_input = gr.Textbox(label="Question")
            understanding_button = gr.Button("Chat")
            understanding_output = gr.Textbox(label="Response")

        with gr.Accordion("Advanced options", open=False):
            und_seed_input = gr.Number(label="Seed", precision=0, value=42)
            top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
            temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")

        examples_inpainting = gr.Examples(
            label="Multimodal Understanding examples",
            examples=[
                [
                    "Is the picture taken in winter?",
                    "view.jpg",
                ],
                [
                    "Briefly describe the image.",
                    "view.jpg",
                ],
            ],
            inputs=[question_input, image_input],
        )

    generation_button.click(
        fn=generate_image,
        inputs=[prompt_input, seed_input, cfg_weight_input, t2i_temperature],
        outputs=image_output
    )
    understanding_button.click(
        multimodal_understanding,
        inputs=[image_input, question_input, und_seed_input, top_p, temperature],
        outputs=understanding_output
    )

demo.launch(share=True)

# if __name__ == "__main__":
#     image = Image.open('view.jpg')
#     image = np.array(image)
#     print(image.shape)
#     # text = multimodal_understanding(image, question='Is the picture taken in winter?', seed=42, top_p=None, temperature=None)
#     # print(text)
#     res = generate_image('Paper artwork, layered paper, colorful Chinese dragon surrounded by clouds.')
#     for idx, img in enumerate(res):
#         img.save(f"{idx}.jpg")