Upload app.py with huggingface_hub
app.py CHANGED
@@ -1,6 +1,7 @@
 import gradio as gr
 import torch
 from transformers import AutoConfig, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModel
 from janus.models import MultiModalityCausalLM, VLChatProcessor
 from janus.utils.io import load_pil_images
 from PIL import Image
@@ -10,6 +11,7 @@ import os
 import time
 from Upsample import RealESRGAN
 import spaces # Import spaces for ZeroGPU compatibility
+from einops import rearrange
 
 
 # Load model and processor
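
The new einops dependency is used later in this commit to move the image array from HWC to CHW layout before encoding. A minimal, self-contained sketch of that exact call, on a dummy tensor rather than the app's data:

    import torch
    from einops import rearrange

    # Dummy 512x512 RGB image in HWC layout, as np.array(pil_image) would produce.
    hwc = torch.zeros(512, 512, 3)
    # Same pattern as the new code: channels first, then a leading batch axis via [None].
    chw = rearrange(hwc, 'h w c -> c h w')[None]
    print(chw.shape)  # torch.Size([1, 3, 512, 512])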
@@ -33,6 +35,55 @@ cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
 sr_model = RealESRGAN(torch.device('cuda' if torch.cuda.is_available() else 'cpu'), scale=2)
 sr_model.load_weights(f'weights/RealESRGAN_x2.pth', download=False)
 
+
+
+PROMPT_TEMPLATE = dict(
+    SYSTEM='<|im_start|>system\n{system}<|im_end|>\n',
+    INSTRUCTION='<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n',
+    SUFFIX='<|im_end|>',
+    SUFFIX_AS_EOS=True,
+    SEP='\n',
+    STOP_WORDS=['<|im_end|>', '<|endoftext|>'])
+
+GENERATION_TEMPLATE = "Generate an image: {text}"
+
+
+model_path = "wusize/Harmon-1_5B"
+config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+llm_config = config.llm
+llm_config['_attn_implementation'] = 'eager'
+harmon_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+model = AutoModel.from_pretrained(model_path, llm=llm_config,
+                                  trust_remote_code=True).eval()
+
+special_tokens_dict = {'additional_special_tokens': ["<image>", ]}
+num_added_toks = harmon_tokenizer.add_special_tokens(special_tokens_dict)
+assert num_added_toks == 1
+
+image_token_idx = harmon_tokenizer.encode("<image>", add_special_tokens=False)[-1]
+print(f"Image token: {harmon_tokenizer.decode(image_token_idx)}", flush=True)
+
+if torch.cuda.is_available():
+    model = model.to(torch.bfloat16).cuda()
+else:
+    model = model.to(torch.float16)
+
+
+
+def expand2square(pil_img, background_color):
+    width, height = pil_img.size
+    if width == height:
+        return pil_img
+    elif width > height:
+        result = Image.new(pil_img.mode, (width, width), background_color)
+        result.paste(pil_img, (0, (width - height) // 2))
+        return result
+    else:
+        result = Image.new(pil_img.mode, (height, height), background_color)
+        result.paste(pil_img, ((height - width) // 2, 0))
+        return result
+
+
 @torch.inference_mode()
 @spaces.GPU(duration=120)
 # Multimodal Understanding function
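
The expand2square helper introduced above pads a rectangular image to a square with a constant fill color, so the later resize to 512x512 preserves the aspect ratio instead of stretching the image. A quick standalone check of its behavior, assuming the function as defined in this commit and a hypothetical 640x480 input:

    from PIL import Image

    img = Image.new('RGB', (640, 480), (255, 0, 0))
    square = expand2square(img, (127, 127, 127))
    print(square.size)  # (640, 640); the original is pasted centered, gray bars above and below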
@@ -44,39 +95,45 @@ def multimodal_understanding(image, question, seed, top_p, temperature, progress
     torch.manual_seed(seed)
     np.random.seed(seed)
     torch.cuda.manual_seed(seed)
-    … [content of the 24 removed lines (old 47-70) was not preserved in this view]
-        max_new_tokens=512,
-        do_sample=False if temperature == 0 else True,
-        use_cache=True,
-        temperature=temperature,
-        top_p=top_p,
+
+    max_new_tokens = 512
+    image_size = 512
+
+    assert image_size == 512
+    image = Image.fromarray(image).convert('RGB')
+    image = expand2square(
+        image, (127, 127, 127))
+    image = image.resize(size=(image_size, image_size))
+    image = torch.from_numpy(np.array(image)).to(dtype=model.dtype, device=cuda_device)
+    image = rearrange(image, 'h w c -> c h w')[None]
+    image = 2 * (image / 255) - 1
+
+    prompt = PROMPT_TEMPLATE['INSTRUCTION'].format(input="<image>\n" + question)
+    assert '<image>' in prompt
+    image_length = (image_size // 16) ** 2 + model.mar.buffer_size
+    prompt = prompt.replace('<image>', '<image>' * image_length)
+    input_ids = harmon_tokenizer.encode(
+        prompt, add_special_tokens=True, return_tensors='pt').to(cuda_device)
+    _, z_enc = model.extract_visual_feature(model.encode(image))
+    inputs_embeds = z_enc.new_zeros(*input_ids.shape, model.llm.config.hidden_size)
+    inputs_embeds[input_ids == image_token_idx] = z_enc.flatten(0, 1)
+    inputs_embeds[input_ids != image_token_idx] = model.llm.get_input_embeddings()(
+        input_ids[input_ids != image_token_idx]
     )
+    output = model.llm.generate(inputs_embeds=inputs_embeds,
+                                eos_token_id=harmon_tokenizer.eos_token_id,
+                                pad_token_id=harmon_tokenizer.pad_token_id
+                                if harmon_tokenizer.pad_token_id is not None else
+                                harmon_tokenizer.eos_token_id,
+                                max_new_tokens=max_new_tokens,
+                                do_sample=False if temperature == 0 else True,
+                                use_cache=True,
+                                temperature=temperature,
+                                top_p=top_p,
+                                )
+    return harmon_tokenizer.decode(output[0], skip_special_tokens=True)
+
 
-    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
-    return answer
 
 
 def generate(input_ids,
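
The heart of the new understanding path is the embedding-injection step above: the prompt's single <image> placeholder is repeated image_length times, each of those positions in input_ids is overwritten with one visual feature vector from z_enc, and every other position receives an ordinary token embedding. A minimal sketch of the same masking pattern with dummy tensors (all sizes hypothetical; 9 stands in for image_token_idx and 8 for model.llm.config.hidden_size):

    import torch

    hidden = 8
    input_ids = torch.tensor([[5, 9, 9, 9, 7]])  # three <image> slots
    embed = torch.nn.Embedding(16, hidden)       # stand-in for the LLM's input embeddings
    z_enc = torch.randn(1, 3, hidden)            # one visual feature per <image> slot

    inputs_embeds = z_enc.new_zeros(*input_ids.shape, hidden)
    inputs_embeds[input_ids == 9] = z_enc.flatten(0, 1)               # inject visual features
    inputs_embeds[input_ids != 9] = embed(input_ids[input_ids != 9])  # normal token embeddings

The number of placeholder tokens must match the number of visual feature vectors, which is why the commit computes image_length as (image_size // 16) ** 2 + model.mar.buffer_size before expanding the prompt.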
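
For reference, a sketch of how the new handler could be invoked once the Space is running; the argument names follow the hunk header above, and the Gradio wiring is outside this diff, so treat this as a hypothetical call:

    import numpy as np
    from PIL import Image

    # Gradio passes the uploaded image as a NumPy array.
    image = np.array(Image.open('example.jpg'))
    answer = multimodal_understanding(image, "Describe this image.",
                                      seed=42, top_p=0.95, temperature=0.1)
    print(answer)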