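"""Gradio demo for wusize/Harmon-1_5B: text-to-image generation and
multimodal (image + text) understanding, served as a two-tab Gradio app."""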
import gradio as gr
import torch
import numpy as np
import spaces
from einops import rearrange
from PIL import Image
from transformers import AutoConfig, AutoTokenizer, AutoModel

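# Qwen/ChatML-style chat template used by the Harmon LLM backbone.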
PROMPT_TEMPLATE = dict(
    SYSTEM='<|im_start|>system\n{system}<|im_end|>\n',
    INSTRUCTION='<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n',
    SUFFIX='<|im_end|>',
    SUFFIX_AS_EOS=True,
    SEP='\n',
    STOP_WORDS=['<|im_end|>', '<|endoftext|>'])

GENERATION_TEMPLATE = "Generate an image: {text}"

model_path = "wusize/Harmon-1_5B" |
|
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) |
|
llm_config = config.llm |
|
llm_config['_attn_implementation'] = 'eager' |
|
harmon_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) |
|
harmon_model = AutoModel.from_pretrained(model_path, llm=llm_config, |
|
trust_remote_code=True).eval() |
|
|
|
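# Register "<image>" as a special token; its positions in the prompt are later
# filled with visual features instead of text embeddings.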
special_tokens_dict = {'additional_special_tokens': ["<image>"]}
num_added_toks = harmon_tokenizer.add_special_tokens(special_tokens_dict)
assert num_added_toks == 1

image_token_idx = harmon_tokenizer.encode("<image>", add_special_tokens=False)[-1]
print(f"Image token: {harmon_tokenizer.decode(image_token_idx)}", flush=True)

# Run in bfloat16 on GPU; fall back to float32 on CPU.
if torch.cuda.is_available():
    harmon_model = harmon_model.to(torch.bfloat16).cuda()
else:
    harmon_model = harmon_model.to(torch.float32)


def expand2square(pil_img, background_color):
    """Pad a PIL image to a square canvas, centering it on `background_color`."""
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width - height) // 2))
        return result
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, ((height - width) // 2, 0))
        return result


@torch.inference_mode()
@spaces.GPU(duration=120)
def multimodal_understanding(image, question, seed, top_p, temperature,
                             progress=gr.Progress(track_tqdm=True)):
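    # Note: seed, top_p and temperature come from the UI but are unused here,
    # since decoding below is greedy (do_sample=False).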
    torch.cuda.empty_cache()
    print(f"CUDA available: {torch.cuda.is_available()}", flush=True)

    max_new_tokens = 512
    image_size = 512

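    # Preprocess: pad to a mid-gray square, resize to 512x512, scale to [-1, 1].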
    image = Image.fromarray(image).convert('RGB')
    image = expand2square(image, (127, 127, 127))
    image = image.resize(size=(image_size, image_size))
    image = torch.from_numpy(np.array(image)).to(dtype=harmon_model.dtype,
                                                 device=harmon_model.device)
    image = rearrange(image, 'h w c -> c h w')[None]
    image = 2 * (image / 255) - 1

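    # Expand the single <image> placeholder to one token per visual embedding:
    # one per 16x16 patch of the 512x512 input, plus the MAR buffer tokens.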
    prompt = PROMPT_TEMPLATE['INSTRUCTION'].format(input="<image>\n" + question)
    assert '<image>' in prompt
    image_length = (image_size // 16) ** 2 + harmon_model.mar.buffer_size
    prompt = prompt.replace('<image>', '<image>' * image_length)
    input_ids = harmon_tokenizer.encode(
        prompt, add_special_tokens=True, return_tensors='pt').to(harmon_model.device)

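    # Build input embeddings: visual features at <image> positions, ordinary
    # token embeddings everywhere else.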
    _, z_enc = harmon_model.extract_visual_feature(harmon_model.encode(image))
    inputs_embeds = z_enc.new_zeros(*input_ids.shape, harmon_model.llm.config.hidden_size)
    inputs_embeds[input_ids == image_token_idx] = z_enc.flatten(0, 1)
    inputs_embeds[input_ids != image_token_idx] = harmon_model.llm.get_input_embeddings()(
        input_ids[input_ids != image_token_idx])

    pad_token_id = (harmon_tokenizer.pad_token_id
                    if harmon_tokenizer.pad_token_id is not None
                    else harmon_tokenizer.eos_token_id)
    output = harmon_model.llm.generate(inputs_embeds=inputs_embeds,
                                       eos_token_id=harmon_tokenizer.eos_token_id,
                                       pad_token_id=pad_token_id,
                                       max_new_tokens=max_new_tokens,
                                       do_sample=False,
                                       use_cache=True)

    return harmon_tokenizer.decode(output[0], skip_special_tokens=True)


@torch.inference_mode()
@spaces.GPU(duration=120)
def generate_image(prompt,
                   seed=42,
                   guidance=3,
                   t2i_temperature=1.0,
                   progress=gr.Progress(track_tqdm=True)):
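    # Seed all RNGs so sampling is reproducible for a given seed.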
    torch.cuda.empty_cache()
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    print(f"CUDA available: {torch.cuda.is_available()}", flush=True)

    negative_prompt = 'Generate an image.'
    prompt = GENERATION_TEMPLATE.format(text=prompt)
    repeat = 4       # images sampled per prompt
    num_steps = 64   # sampling iterations
    image_size = 512
    m = n = image_size // 16   # latent grid size: one token per 16x16 patch

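    # Classifier-free guidance: batch the unconditional ("negative") prompts
    # together with the conditional ones.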
    prompts = [PROMPT_TEMPLATE['INSTRUCTION'].format(input=prompt)] * repeat
    if guidance != 1.0:
        prompts += [PROMPT_TEMPLATE['INSTRUCTION'].format(input=negative_prompt)] * len(prompts)

    inputs = harmon_tokenizer(
        prompts, add_special_tokens=True, return_tensors='pt', padding=True).to(harmon_model.device)

    # Autograd is already disabled by @torch.inference_mode() on this function,
    # so no extra torch.no_grad() context is needed around sampling.
    images = harmon_model.sample(**inputs, num_iter=num_steps, cfg=guidance, cfg_schedule="constant",
                                 temperature=t2i_temperature, progress=True, image_shape=(m, n))

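    # Map sampled images from [-1, 1] back to uint8 HWC pixels.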
    images = rearrange(images, 'b c h w -> b h w c')
    images = torch.clamp(
        127.5 * images + 128.0, 0, 255).to("cpu", dtype=torch.uint8).numpy()

    return [Image.fromarray(image) for image in images]


css = '''
.gradio-container {max-width: 960px !important}
'''

with gr.Blocks(css=css) as demo:
    gr.Markdown("# Harmon 1.5B")

    with gr.Tab("Text-to-Image Generation"):
        gr.Markdown(value="## Text-to-Image Generation")

        prompt_input = gr.Textbox(label="Prompt")
        generation_button = gr.Button("Generate Images")
        image_output = gr.Gallery(label="Generated Images", columns=4, rows=1)

        with gr.Accordion("Advanced options", open=False):
            with gr.Row():
                cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=5, step=0.5, label="CFG Weight")
                t2i_temperature = gr.Slider(minimum=0, maximum=1, value=1.0, step=0.05, label="temperature")
            seed_input = gr.Number(label="Seed (Optional)", precision=0, value=1234)

        examples_t2i = gr.Examples(
            label="Text-to-image generation examples",
            examples=[
                "a dog on the left and a cat on the right.",
                "a photo of a pink stop sign.",
                "Paper artwork, layered paper, colorful Chinese dragon surrounded by clouds.",
                "a golden retriever lying peacefully on a wooden porch, with autumn leaves scattered around.",
            ],
            inputs=prompt_input,
        )

    with gr.Tab("Multimodal Understanding"):
        gr.Markdown(value="## Multimodal Understanding")
        image_input = gr.Image()
        with gr.Column():
            question_input = gr.Textbox(label="Question")

        understanding_button = gr.Button("Chat")
        understanding_output = gr.Textbox(label="Response")

        with gr.Accordion("Advanced options", open=False):
            und_seed_input = gr.Number(label="Seed", precision=0, value=42)
            top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
            temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")

        examples_understanding = gr.Examples(
            label="Multimodal Understanding examples",
            examples=[
                [
                    "Is the picture taken in winter?",
                    "view.jpg",
                ],
                [
                    "Briefly describe the image.",
                    "view.jpg",
                ],
            ],
            inputs=[question_input, image_input],
        )

    generation_button.click(
        fn=generate_image,
        inputs=[prompt_input, seed_input, cfg_weight_input, t2i_temperature],
        outputs=image_output,
    )

    understanding_button.click(
        fn=multimodal_understanding,
        inputs=[image_input, question_input, und_seed_input, top_p, temperature],
        outputs=understanding_output,
    )

demo.launch(share=True)