wusize committed
Commit cb33705 · verified · 1 Parent(s): 68cb43d

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +88 -31
app.py CHANGED
@@ -1,6 +1,7 @@
 import gradio as gr
 import torch
 from transformers import AutoConfig, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModel
 from janus.models import MultiModalityCausalLM, VLChatProcessor
 from janus.utils.io import load_pil_images
 from PIL import Image
@@ -10,6 +11,7 @@ import os
 import time
 from Upsample import RealESRGAN
 import spaces # Import spaces for ZeroGPU compatibility
+from einops import rearrange
 
 
 # Load model and processor
@@ -33,6 +35,55 @@ cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
 sr_model = RealESRGAN(torch.device('cuda' if torch.cuda.is_available() else 'cpu'), scale=2)
 sr_model.load_weights(f'weights/RealESRGAN_x2.pth', download=False)
 
+
+
+PROMPT_TEMPLATE = dict(
+    SYSTEM='<|im_start|>system\n{system}<|im_end|>\n',
+    INSTRUCTION='<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n',
+    SUFFIX='<|im_end|>',
+    SUFFIX_AS_EOS=True,
+    SEP='\n',
+    STOP_WORDS=['<|im_end|>', '<|endoftext|>'])
+
+GENERATION_TEMPLATE = "Generate an image: {text}"
+
+
+model_path = "wusize/Harmon-1_5B"
+config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+llm_config = config.llm
+llm_config['_attn_implementation'] = 'eager'
+harmon_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+model = AutoModel.from_pretrained(model_path, llm=llm_config,
+                                  trust_remote_code=True).eval()
+
+special_tokens_dict = {'additional_special_tokens': ["<image>", ]}
+num_added_toks = harmon_tokenizer.add_special_tokens(special_tokens_dict)
+assert num_added_toks == 1
+
+image_token_idx = harmon_tokenizer.encode("<image>", add_special_tokens=False)[-1]
+print(f"Image token: {harmon_tokenizer.decode(image_token_idx)}", flush=True)
+
+if torch.cuda.is_available():
+    model = model.to(torch.bfloat16).cuda()
+else:
+    model = model.to(torch.float16)
+
+
+
+def expand2square(pil_img, background_color):
+    width, height = pil_img.size
+    if width == height:
+        return pil_img
+    elif width > height:
+        result = Image.new(pil_img.mode, (width, width), background_color)
+        result.paste(pil_img, (0, (width - height) // 2))
+        return result
+    else:
+        result = Image.new(pil_img.mode, (height, height), background_color)
+        result.paste(pil_img, ((height - width) // 2, 0))
+        return result
+
+
 @torch.inference_mode()
 @spaces.GPU(duration=120)
 # Multimodal Understanding function
@@ -44,39 +95,45 @@ def multimodal_understanding(image, question, seed, top_p, temperature, progress
     torch.manual_seed(seed)
     np.random.seed(seed)
     torch.cuda.manual_seed(seed)
-
-    conversation = [
-        {
-            "role": "<|User|>",
-            "content": f"<image_placeholder>\n{question}",
-            "images": [image],
-        },
-        {"role": "<|Assistant|>", "content": ""},
-    ]
-
-    pil_images = [Image.fromarray(image)]
-    prepare_inputs = vl_chat_processor(
-        conversations=conversation, images=pil_images, force_batchify=True
-    ).to(cuda_device, dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16)
-
-
-    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
-
-    outputs = vl_gpt.language_model.generate(
-        inputs_embeds=inputs_embeds,
-        attention_mask=prepare_inputs.attention_mask,
-        pad_token_id=tokenizer.eos_token_id,
-        bos_token_id=tokenizer.bos_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        max_new_tokens=512,
-        do_sample=False if temperature == 0 else True,
-        use_cache=True,
-        temperature=temperature,
-        top_p=top_p,
+
+    max_new_tokens = 512
+    image_size = 512
+
+    assert image_size == 512
+    image = Image.fromarray(image).convert('RGB')
+    image = expand2square(
+        image, (127, 127, 127))
+    image = image.resize(size=(image_size, image_size))
+    image = torch.from_numpy(np.array(image)).to(dtype=model.dtype, device=cuda_device)
+    image = rearrange(image, 'h w c -> c h w')[None]
+    image = 2 * (image / 255) - 1
+
+    prompt = PROMPT_TEMPLATE['INSTRUCTION'].format(input="<image>\n" + question)
+    assert '<image>' in prompt
+    image_length = (image_size // 16) ** 2 + model.mar.buffer_size
+    prompt = prompt.replace('<image>', '<image>' * image_length)
+    input_ids = harmon_tokenizer.encode(
+        prompt, add_special_tokens=True, return_tensors='pt').to(cuda_device)
+    _, z_enc = model.extract_visual_feature(model.encode(image))
+    inputs_embeds = z_enc.new_zeros(*input_ids.shape, model.llm.config.hidden_size)
+    inputs_embeds[input_ids == image_token_idx] = z_enc.flatten(0, 1)
+    inputs_embeds[input_ids != image_token_idx] = model.llm.get_input_embeddings()(
+        input_ids[input_ids != image_token_idx]
     )
+    output = model.llm.generate(inputs_embeds=inputs_embeds,
+                                eos_token_id=harmon_tokenizer.eos_token_id,
+                                pad_token_id=harmon_tokenizer.pad_token_id
+                                if harmon_tokenizer.pad_token_id is not None else
+                                harmon_tokenizer.eos_token_id,
+                                max_new_tokens=max_new_tokens,
+                                do_sample=False if temperature == 0 else True,
+                                use_cache=True,
+                                temperature=temperature,
+                                top_p=top_p,
+                                )
+    return harmon_tokenizer.decode(output[0], skip_special_tokens=True)
+
 
-    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
-    return answer
 
 
 def generate(input_ids,
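
For reviewers, a minimal self-contained sketch of the image preprocessing that the new understanding path performs before visual encoding (square-padding with grey, resizing to 512x512, channel-first layout, scaling to [-1, 1]). The dummy test image, the float32 dtype and the CPU placement are illustrative assumptions, not what the Space does at runtime, where the tensor is cast to model.dtype and moved to cuda_device:

import numpy as np
import torch
from PIL import Image
from einops import rearrange

def expand2square(pil_img, background_color):
    # Pad the shorter side so the image becomes square, as in the diff above.
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width - height) // 2))
        return result
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, ((height - width) // 2, 0))
        return result

image_size = 512
img = Image.new('RGB', (640, 360), (255, 0, 0))  # hypothetical stand-in for the Gradio upload
img = expand2square(img, (127, 127, 127))        # grey padding to a square canvas
img = img.resize(size=(image_size, image_size))
x = torch.from_numpy(np.array(img)).float()      # the app casts to model.dtype instead
x = rearrange(x, 'h w c -> c h w')[None]         # HWC -> CHW, add batch dimension
x = 2 * (x / 255) - 1                            # scale pixel values to [-1, 1]
print(x.shape, x.min().item(), x.max().item())   # torch.Size([1, 3, 512, 512]) -1.0 1.0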