AustingDong committed · Commit ee8653b
Parent(s): f59a9b2

add accumulate methods: Sum or Mult

Files changed:
- app.py (+5 -5)
- demo/visualization.py (+10 -7)
app.py CHANGED
@@ -56,7 +56,7 @@ def multimodal_understanding(model_type,
                              activation_map_method,
                              visual_method,
                              image, question, seed, top_p, temperature, target_token_idx,
-                             visualization_layer_min, visualization_layer_max, focus, response_type, chart_type):
+                             visualization_layer_min, visualization_layer_max, focus, response_type, chart_type, accumulate_method):
     # Clear CUDA cache before generating
     gc.collect()
     if torch.cuda.is_available():
@@ -160,7 +160,7 @@ def multimodal_understanding(model_type,
             gradcam = VisualizationLLaVA(vl_gpt, target_layers)
         elif model_name.split('-')[0] == "ChartGemma":
             gradcam = VisualizationChartGemma(vl_gpt, target_layers)
-        cam_tensors, grid_size, start = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, i, visual_method, focus)
+        cam_tensors, grid_size, start = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, i, visual_method, focus, accumulate_method)
         cam_grid = cam_tensors.reshape(grid_size, grid_size)
         cam_i = generate_gradcam(cam_grid, image)
         cam_i = add_title_to_image(cam_i, input_ids_decoded[start + i])
@@ -168,7 +168,7 @@ def multimodal_understanding(model_type,
         gradcam.remove_hooks()
         i += 1
     else:
-        cam_tensors, grid_size, start = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_method, focus)
+        cam_tensors, grid_size, start = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_method, focus, accumulate_method)
         if target_token_idx != -1:
             input_text_decoded = input_ids_decoded[start + target_token_idx]
         for i, cam_tensor in enumerate(cam_tensors):
@@ -379,7 +379,7 @@ with gr.Blocks() as demo:
             response_type = gr.Dropdown(choices=["Visualization only"], value="Visualization only", label="response_type")
             focus = gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus")
             activation_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="visualization type")
-
+            accumulate_method = gr.Dropdown(choices=["sum", "mult"], value="sum", label="layers accumulate method")
             visual_method = gr.Dropdown(choices=["CLS", "max", "avg"], value="CLS", label="visual pooling method")


@@ -512,7 +512,7 @@ with gr.Blocks() as demo:
     understanding_button.click(
         multimodal_understanding,
         inputs=[model_selector, activation_map_method, visual_method, image_input, question_input, und_seed_input, top_p, temperature, target_token_idx,
-                visualization_layers_min, visualization_layers_max, focus, response_type, chart_type],
+                visualization_layers_min, visualization_layers_max, focus, response_type, chart_type, accumulate_method],
         outputs=[understanding_output, activation_map_output, understanding_target_token_decoded_output]
     )

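For context, the app.py hunks above all follow the standard Gradio wiring pattern: component values are delivered to the handler positionally, so the new accumulate_method dropdown must occupy the same slot in the function signature and in the inputs=[...] list. A minimal, self-contained sketch of that pattern (not the app's actual code; the handler body and component names here are illustrative only):

import gradio as gr

def multimodal_understanding(question, accumulate_method):
    # Placeholder handler: the real app runs GradCAM over the selected layers.
    return f"question={question!r}, accumulating layers with {accumulate_method!r}"

with gr.Blocks() as demo:
    question_input = gr.Textbox(label="question")
    accumulate_method = gr.Dropdown(choices=["sum", "mult"], value="sum",
                                    label="layers accumulate method")
    out = gr.Textbox(label="output")
    # Values arrive in the handler in the same order as inputs=[...].
    gr.Button("Run").click(multimodal_understanding,
                           inputs=[question_input, accumulate_method],
                           outputs=[out])

demo.launch()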
demo/visualization.py CHANGED
@@ -196,7 +196,7 @@ class Visualization:
             cam_sum_lst.append(cam_sum)
         return cam_sum_lst, grid_size

-    def …
+    def process_multiple_acc(self, cams, start_idx, images_seq_mask, normalize=False, accumulate_method="sum"):
         cam_sum_lst = []
         for i in range(start_idx, cams[0].shape[1]):
             cam_sum = None
@@ -217,7 +217,10 @@ class Visualization:
             if cam_sum == None:
                 cam_sum = cam_reshaped
             else:
-                …
+                if accumulate_method == "sum":
+                    cam_sum += cam_reshaped
+                elif accumulate_method == "mult":
+                    cam_sum *= cam_reshaped + 1

             cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
             cam_sum_lst.append(cam_sum)
@@ -316,7 +319,7 @@ class VisualizationJanus(Visualization):
         self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]

     @spaces.GPU(duration=120)
-    def generate_cam(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder"):
+    def generate_cam(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder", accumulate_method="sum"):

         self.setup_grads()

@@ -368,7 +371,7 @@ class VisualizationLLaVA(Visualization):
         self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]

     @spaces.GPU(duration=120)
-    def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder"):
+    def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder", accumulate_method="sum"):

         self.setup_grads()
         self.forward_backward(inputs)
@@ -388,7 +391,7 @@ class VisualizationLLaVA(Visualization):
         # Aggregate activations and gradients from ALL layers
         start_idx = last + 1
         cams = self.attn_guided_cam()
-        cam_sum_lst, grid_size = self.…
+        cam_sum_lst, grid_size = self.process_multiple_acc(cams, start_idx, images_seq_mask, accumulate_method=accumulate_method)

         return cam_sum_lst, grid_size, start_idx

@@ -424,7 +427,7 @@ class VisualizationChartGemma(Visualization):
         self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]

     @spaces.GPU(duration=120)
-    def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder"):
+    def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder", accumulate_method="sum"):

         # Forward pass
         self.setup_grads()
@@ -453,7 +456,7 @@ class VisualizationChartGemma(Visualization):
         elif focus == "Language Model":

             cams = self.attn_guided_cam()
-            cam_sum_lst, grid_size = self.…
+            cam_sum_lst, grid_size = self.process_multiple_acc(cams, start_idx, images_seq_mask, accumulate_method=accumulate_method)

             # cams shape: [layers, 1, seq_len, seq_len]

(The "…" marks removed lines whose content is truncated in the rendered page and cannot be recovered from it.)
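The core of the change is the accumulation switch in process_multiple_acc: "sum" adds evidence across layers, while "mult" multiplies by cam + 1, the + 1 offset keeping a near-zero cell in one layer from zeroing out the whole product. Below is a minimal standalone sketch of the two strategies as they read in the diff, assuming PyTorch and a plain list of per-layer CAM grids rather than the class's internal tensors:

import torch

def accumulate_cams(cams, accumulate_method="sum"):
    # cams: list of [grid, grid] tensors, one attention-guided CAM per layer
    cam_acc = None
    for cam in cams:
        if cam_acc is None:
            cam_acc = cam.clone()         # first layer seeds the accumulator
        elif accumulate_method == "sum":
            cam_acc += cam                # additive evidence across layers
        elif accumulate_method == "mult":
            cam_acc *= cam + 1            # +1 so zeros don't annihilate the product
    # Min-max normalize to [0, 1], matching the diff (which also has no epsilon guard).
    return (cam_acc - cam_acc.min()) / (cam_acc.max() - cam_acc.min())

# Example: three fake 24x24 per-layer maps
layers = [torch.rand(24, 24) for _ in range(3)]
print(accumulate_cams(layers, "sum").shape)   # torch.Size([24, 24])
print(accumulate_cams(layers, "mult").max())  # tensor(1.)

One practical consequence of the "mult" variant is that a region must stay salient across most layers to remain bright, so it tends to yield sharper, more conservative maps than "sum", which highlights anything salient in any layer.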