wusize committed
Commit cb33705 · verified · 1 Parent(s): 68cb43d

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +88 -31
app.py CHANGED
@@ -1,6 +1,7 @@
 import gradio as gr
 import torch
 from transformers import AutoConfig, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModel
 from janus.models import MultiModalityCausalLM, VLChatProcessor
 from janus.utils.io import load_pil_images
 from PIL import Image
@@ -10,6 +11,7 @@ import os
 import time
 from Upsample import RealESRGAN
 import spaces # Import spaces for ZeroGPU compatibility
+from einops import rearrange
 
 
 # Load model and processor
@@ -33,6 +35,55 @@ cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
 sr_model = RealESRGAN(torch.device('cuda' if torch.cuda.is_available() else 'cpu'), scale=2)
 sr_model.load_weights(f'weights/RealESRGAN_x2.pth', download=False)
 
+
+
+PROMPT_TEMPLATE = dict(
+    SYSTEM='<|im_start|>system\n{system}<|im_end|>\n',
+    INSTRUCTION='<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n',
+    SUFFIX='<|im_end|>',
+    SUFFIX_AS_EOS=True,
+    SEP='\n',
+    STOP_WORDS=['<|im_end|>', '<|endoftext|>'])
+
+GENERATION_TEMPLATE = "Generate an image: {text}"
+
+
+model_path = "wusize/Harmon-1_5B"
+config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+llm_config = config.llm
+llm_config['_attn_implementation'] = 'eager'
+harmon_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+model = AutoModel.from_pretrained(model_path, llm=llm_config,
+                                  trust_remote_code=True).eval()
+
+special_tokens_dict = {'additional_special_tokens': ["<image>", ]}
+num_added_toks = harmon_tokenizer.add_special_tokens(special_tokens_dict)
+assert num_added_toks == 1
+
+image_token_idx = harmon_tokenizer.encode("<image>", add_special_tokens=False)[-1]
+print(f"Image token: {harmon_tokenizer.decode(image_token_idx)}", flush=True)
+
+if torch.cuda.is_available():
+    model = model.to(torch.bfloat16).cuda()
+else:
+    model = model.to(torch.float16)
+
+
+
+def expand2square(pil_img, background_color):
+    width, height = pil_img.size
+    if width == height:
+        return pil_img
+    elif width > height:
+        result = Image.new(pil_img.mode, (width, width), background_color)
+        result.paste(pil_img, (0, (width - height) // 2))
+        return result
+    else:
+        result = Image.new(pil_img.mode, (height, height), background_color)
+        result.paste(pil_img, ((height - width) // 2, 0))
+        return result
+
+
 @torch.inference_mode()
 @spaces.GPU(duration=120)
 # Multimodal Understanding function
@@ -44,39 +95,45 @@ def multimodal_understanding(image, question, seed, top_p, temperature, progress
     torch.manual_seed(seed)
     np.random.seed(seed)
     torch.cuda.manual_seed(seed)
-
-    conversation = [
-        {
-            "role": "<|User|>",
-            "content": f"<image_placeholder>\n{question}",
-            "images": [image],
-        },
-        {"role": "<|Assistant|>", "content": ""},
-    ]
-
-    pil_images = [Image.fromarray(image)]
-    prepare_inputs = vl_chat_processor(
-        conversations=conversation, images=pil_images, force_batchify=True
-    ).to(cuda_device, dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16)
-
-
-    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
-
-    outputs = vl_gpt.language_model.generate(
-        inputs_embeds=inputs_embeds,
-        attention_mask=prepare_inputs.attention_mask,
-        pad_token_id=tokenizer.eos_token_id,
-        bos_token_id=tokenizer.bos_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        max_new_tokens=512,
-        do_sample=False if temperature == 0 else True,
-        use_cache=True,
-        temperature=temperature,
-        top_p=top_p,
+
+    max_new_tokens = 512
+    image_size = 512
+
+    assert image_size == 512
+    image = Image.fromarray(image).convert('RGB')
+    image = expand2square(
+        image, (127, 127, 127))
+    image = image.resize(size=(image_size, image_size))
+    image = torch.from_numpy(np.array(image)).to(dtype=model.dtype, device=cuda_device)
+    image = rearrange(image, 'h w c -> c h w')[None]
+    image = 2 * (image / 255) - 1
+
+    prompt = PROMPT_TEMPLATE['INSTRUCTION'].format(input="<image>\n" + question)
+    assert '<image>' in prompt
+    image_length = (image_size // 16) ** 2 + model.mar.buffer_size
+    prompt = prompt.replace('<image>', '<image>' * image_length)
+    input_ids = harmon_tokenizer.encode(
+        prompt, add_special_tokens=True, return_tensors='pt').to(cuda_device)
+    _, z_enc = model.extract_visual_feature(model.encode(image))
+    inputs_embeds = z_enc.new_zeros(*input_ids.shape, model.llm.config.hidden_size)
+    inputs_embeds[input_ids == image_token_idx] = z_enc.flatten(0, 1)
+    inputs_embeds[input_ids != image_token_idx] = model.llm.get_input_embeddings()(
+        input_ids[input_ids != image_token_idx]
     )
+    output = model.llm.generate(inputs_embeds=inputs_embeds,
+                                eos_token_id=harmon_tokenizer.eos_token_id,
+                                pad_token_id=harmon_tokenizer.pad_token_id
+                                if harmon_tokenizer.pad_token_id is not None else
+                                harmon_tokenizer.eos_token_id,
+                                max_new_tokens=max_new_tokens,
+                                do_sample=False if temperature == 0 else True,
+                                use_cache=True,
+                                temperature=temperature,
+                                top_p=top_p,
+                                )
+    return harmon_tokenizer.decode(output[0], skip_special_tokens=True)
+
 
-    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
-    return answer
 
 
 def generate(input_ids,
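
For reviewers, a minimal self-contained sketch of the image preprocessing that the new understanding path performs before visual encoding (square-padding with grey, resizing to 512x512, channel-first layout, scaling to [-1, 1]). The dummy test image, the float32 dtype and the CPU placement are illustrative assumptions, not what the Space does at runtime, where the tensor is cast to model.dtype and moved to cuda_device:

import numpy as np
import torch
from PIL import Image
from einops import rearrange

def expand2square(pil_img, background_color):
    # Pad the shorter side so the image becomes square, as in the diff above.
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width - height) // 2))
        return result
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, ((height - width) // 2, 0))
        return result

image_size = 512
img = Image.new('RGB', (640, 360), (255, 0, 0))  # hypothetical stand-in for the Gradio upload
img = expand2square(img, (127, 127, 127))        # grey padding to a square canvas
img = img.resize(size=(image_size, image_size))
x = torch.from_numpy(np.array(img)).float()      # the app casts to model.dtype instead
x = rearrange(x, 'h w c -> c h w')[None]         # HWC -> CHW, add batch dimension
x = 2 * (x / 255) - 1                            # scale pixel values to [-1, 1]
print(x.shape, x.min().item(), x.max().item())   # torch.Size([1, 3, 512, 512]) -1.0 1.0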