AustingDong committed

Commit · a594e78
1 Parent(s): 8bfad75

extend

Files changed:
- app.py: +17 -12
- demo/cam.py: +5 -4
- demo/model_utils.py: +24 -14
app.py CHANGED

@@ -110,12 +110,12 @@ def multimodal_understanding(model_type,
 
     input_ids = prepare_inputs.input_ids[0].cpu().tolist()
     input_ids_decoded = [tokenizer.decode([input_ids[i]]) for i in range(len(input_ids))]
-    if model_name.split('-')[0] == "Janus":
-        start = 620
-    elif model_name.split('-')[0] == "ChartGemma":
-        start = 1024
-    elif model_name.split('-')[0] == "LLaVA":
-        start = 581
+    # if model_name.split('-')[0] == "Janus":
+    #     start = 620
+    # elif model_name.split('-')[0] == "ChartGemma":
+    #     start = 1024
+    # elif model_name.split('-')[0] == "LLaVA":
+    #     start = 581
 
     if activation_map_method == "GradCAM":
         # target_layers = vl_gpt.vision_model.vision_tower.blocks

@@ -136,7 +136,11 @@ def multimodal_understanding(model_type,
     elif model_name.split('-')[0] == "ChartGemma":
         gradcam = AttentionGuidedCAMChartGemma(vl_gpt, target_layers)
 
+    start = 0
+    if focus == "Visual Encoder":
+        cam_tensors, grid_size = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_pooling_method, focus)
+    else:
+        cam_tensors, grid_size, start = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_pooling_method, focus)
     gradcam.remove_hooks()
 
 

@@ -207,14 +211,15 @@ def model_slider_change(model_type):
         clean()
         set_seed()
         model_utils = LLaVA_Utils()
+        version = model_type.split('-')[1]
+        vl_gpt, tokenizer = model_utils.init_LLaVA(version=version)
+        language_model_max_layer = 32 if version == "1.5" else 28
         language_model_best_layer = 10
 
         res = (
            gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="answer + visualization", label="response_type"),
-           gr.Slider(minimum=1, maximum=
-           gr.Slider(minimum=1, maximum=
+           gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers min"),
+           gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers max"),
            gr.Dropdown(choices=["Language Model"], value="Language Model", label="focus"),
            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")
         )

@@ -286,7 +291,7 @@ with gr.Blocks() as demo:
             activation_map_output = gr.Gallery(label="activation Map", height=300, columns=1)
 
         with gr.Column():
-            model_selector = gr.Dropdown(choices=["Clip", "ChartGemma-3B", "Janus-1B", "Janus-7B", "LLaVA-1.5-7B"], value="Clip", label="model")
+            model_selector = gr.Dropdown(choices=["Clip", "ChartGemma-3B", "Janus-1B", "Janus-7B", "LLaVA-1.5-7B", "LLaVA-onevision-qwen2-7b-si"], value="Clip", label="model")
             response_type = gr.Dropdown(choices=["Visualization only"], value="Visualization only", label="response_type")
             focus = gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus")
             activation_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")
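
Taken together, the app.py hunks replace the hard-coded per-model token offsets (620 / 1024 / 581) with a start index that generate_cam itself returns whenever the focus is the language model, and they size the layer sliders from the selected LLaVA version. The fragment below is a hypothetical sketch (not part of this commit) of how such an offset could be consumed inside multimodal_understanding; it reuses the names cam_tensors, start, and input_ids_decoded from the diff above, and the pairing loop itself is purely illustrative.

# Hypothetical sketch: pair each activation map computed from position `start`
# onward with the decoded token at that position. Assumes the variables from
# multimodal_understanding shown in the diff above.
labeled_maps = []
for i, cam in enumerate(cam_tensors):
    token_text = input_ids_decoded[start + i]   # token at the position this map was computed for
    labeled_maps.append((token_text, cam))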
demo/cam.py CHANGED

@@ -274,7 +274,8 @@ class AttentionGuidedCAMJanus(AttentionGuidedCAM):
         # cam_sum shape: [1, seq_len, seq_len]
         cam_sum_lst = []
         cam_sum_raw = cam_sum
+        start = 620
+        for i in range(start, cam_sum_raw.shape[1]):
             cam_sum = cam_sum_raw[:, i, :] # shape: [1: seq_len]
             cam_sum = cam_sum[input_tensor.images_seq_mask].unsqueeze(0) # shape: [1, 576]
             print("cam_sum shape: ", cam_sum.shape)

@@ -290,7 +291,7 @@ class AttentionGuidedCAMJanus(AttentionGuidedCAM):
             cam_sum_lst.append(cam_sum)
 
 
-        return cam_sum_lst, grid_size
+        return cam_sum_lst, grid_size, start
 
         # Aggregate activations and gradients from ALL layers
 

@@ -407,7 +408,7 @@ class AttentionGuidedCAMLLaVA(AttentionGuidedCAM):
             cam_sum_lst.append(cam_sum)
 
 
-        return cam_sum_lst, grid_size
+        return cam_sum_lst, grid_size, start_idx
 
 
 

@@ -556,7 +557,7 @@ class AttentionGuidedCAMChartGemma(AttentionGuidedCAM):
             cam_sum_lst.append(cam_sum)
 
 
-        return cam_sum_lst, grid_size
+        return cam_sum_lst, grid_size, start_idx
 
 
 
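
Each generate_cam variant now returns the start index of the answer region alongside the per-token maps, so the caller no longer has to hard-code where the response begins. The snippet below is a self-contained toy of the row-slicing pattern used in the Janus hunk above; the shapes (a [1, seq_len, seq_len] map, 576 image tokens, a 24×24 grid) follow the comments in the diff, while the concrete tensor values and the image-token positions are made up for illustration.

# Toy illustration of the slicing pattern in the Janus hunk: for every position
# from `start` on, take that row of the [1, seq_len, seq_len] map, keep only the
# image-token columns, and reshape the result to the 24x24 patch grid.
import torch

seq_len, n_image_tokens, grid = 700, 576, 24
cam_sum_raw = torch.rand(1, seq_len, seq_len)          # stand-in for the fused attention/gradient map
images_seq_mask = torch.zeros(seq_len, dtype=torch.bool)
images_seq_mask[20:20 + n_image_tokens] = True         # pretend the 576 image tokens sit at positions 20..595

start = 620                                            # first response position, as in the Janus branch above
cam_per_token = []
for i in range(start, seq_len):
    row = cam_sum_raw[:, i, :]                         # [1, seq_len]: map for position i over the whole sequence
    row = row[:, images_seq_mask]                      # [1, 576]: keep only image-token columns
    cam_per_token.append(row.reshape(grid, grid))      # one 24x24 patch grid per generated token

print(len(cam_per_token), cam_per_token[0].shape)      # -> 80 torch.Size([24, 24])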
demo/model_utils.py CHANGED

@@ -2,7 +2,7 @@ import torch
 import numpy as np
 import spaces
 from PIL import Image, ImageDraw, ImageFont
-from transformers import AutoConfig, AutoModelForCausalLM, LlavaForConditionalGeneration, LlavaNextForConditionalGeneration, LlavaNextProcessor, AutoProcessor, PaliGemmaForConditionalGeneration
+from transformers import AutoConfig, AutoModelForCausalLM, LlavaForConditionalGeneration, LlavaOnevisionForConditionalGeneration, LlavaNextForConditionalGeneration, LlavaNextProcessor, AutoProcessor, PaliGemmaForConditionalGeneration
 from transformers import CLIPProcessor, CLIPModel
 from janus.models import MultiModalityCausalLM, VLChatProcessor
 

@@ -117,19 +117,29 @@ class LLaVA_Utils(Model_Utils):
     def __init__(self):
         super().__init__()
 
-    def init_LLaVA(self):
+    def init_LLaVA(self, version):
+        if version == "1.5":
+            model_path = "llava-hf/llava-1.5-7b-hf"
+            config = AutoConfig.from_pretrained(model_path)
+
+            self.vl_gpt = LlavaForConditionalGeneration.from_pretrained(model_path,
+                                                                        low_cpu_mem_usage=True,
+                                                                        attn_implementation = 'eager',
+                                                                        output_attentions=True
+                                                                        )
+            self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt)
+            self.processor = AutoProcessor.from_pretrained(model_path)
+            self.tokenizer = self.processor.tokenizer
+
+        else:
+            model_path = "llava-hf/llava-onevision-qwen2-7b-si-hf"
+
+            self.processor = AutoProcessor.from_pretrained(model_path)
+
+            self.vl_gpt = LlavaOnevisionForConditionalGeneration.from_pretrained(model_path,
+                                                                                 torch_dtype=torch.float16,
+                                                                                 low_cpu_mem_usage=True)
+            self.tokenizer = self.processor.tokenizer
 
         return self.vl_gpt, self.tokenizer
 
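
init_LLaVA now takes a version argument: "1.5" loads llava-hf/llava-1.5-7b-hf with attn_implementation='eager' and output_attentions=True (presumably so the attention-guided CAM classes can read attention weights, which the fused attention paths do not return), while any other value loads the OneVision checkpoint in float16. Below is a short usage sketch, assuming the repository's demo package is importable; the version strings mirror app.py's model_type.split('-')[1].

# Usage sketch (assumes this repository is on the Python path).
# "LLaVA-1.5-7B".split('-')[1] == "1.5" selects the LLaVA-1.5 branch;
# "LLaVA-onevision-qwen2-7b-si".split('-')[1] == "onevision", and any
# non-"1.5" value selects the OneVision branch.
from demo.model_utils import LLaVA_Utils

utils = LLaVA_Utils()
vl_gpt_15, tokenizer_15 = utils.init_LLaVA(version="1.5")        # llava-hf/llava-1.5-7b-hf, eager attention
vl_gpt_ov, tokenizer_ov = utils.init_LLaVA(version="onevision")  # llava-hf/llava-onevision-qwen2-7b-si-hf, fp16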