AustingDong committed on
Commit
ee8653b
·
1 Parent(s): f59a9b2

add accumulation methods: Sum or Mult

Browse files
Files changed (2) hide show
  1. app.py +5 -5
  2. demo/visualization.py +10 -7
app.py CHANGED
@@ -56,7 +56,7 @@ def multimodal_understanding(model_type,
56
  activation_map_method,
57
  visual_method,
58
  image, question, seed, top_p, temperature, target_token_idx,
59
- visualization_layer_min, visualization_layer_max, focus, response_type, chart_type):
60
  # Clear CUDA cache before generating
61
  gc.collect()
62
  if torch.cuda.is_available():
@@ -160,7 +160,7 @@ def multimodal_understanding(model_type,
160
  gradcam = VisualizationLLaVA(vl_gpt, target_layers)
161
  elif model_name.split('-')[0] == "ChartGemma":
162
  gradcam = VisualizationChartGemma(vl_gpt, target_layers)
163
- cam_tensors, grid_size, start = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, i, visual_method, focus)
164
  cam_grid = cam_tensors.reshape(grid_size, grid_size)
165
  cam_i = generate_gradcam(cam_grid, image)
166
  cam_i = add_title_to_image(cam_i, input_ids_decoded[start + i])
@@ -168,7 +168,7 @@ def multimodal_understanding(model_type,
168
  gradcam.remove_hooks()
169
  i += 1
170
  else:
171
- cam_tensors, grid_size, start = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_method, focus)
172
  if target_token_idx != -1:
173
  input_text_decoded = input_ids_decoded[start + target_token_idx]
174
  for i, cam_tensor in enumerate(cam_tensors):
@@ -379,7 +379,7 @@ with gr.Blocks() as demo:
379
  response_type = gr.Dropdown(choices=["Visualization only"], value="Visualization only", label="response_type")
380
  focus = gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus")
381
  activation_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="visualization type")
382
- # activation_function = gr.Dropdown(choices=["softmax", "sigmoid"], value="softmax", label="activation function")
383
  visual_method = gr.Dropdown(choices=["CLS", "max", "avg"], value="CLS", label="visual pooling method")
384
 
385
 
@@ -512,7 +512,7 @@ with gr.Blocks() as demo:
512
  understanding_button.click(
513
  multimodal_understanding,
514
  inputs=[model_selector, activation_map_method, visual_method, image_input, question_input, und_seed_input, top_p, temperature, target_token_idx,
515
- visualization_layers_min, visualization_layers_max, focus, response_type, chart_type],
516
  outputs=[understanding_output, activation_map_output, understanding_target_token_decoded_output]
517
  )
518
 
 
56
  activation_map_method,
57
  visual_method,
58
  image, question, seed, top_p, temperature, target_token_idx,
59
+ visualization_layer_min, visualization_layer_max, focus, response_type, chart_type, accumulate_method):
60
  # Clear CUDA cache before generating
61
  gc.collect()
62
  if torch.cuda.is_available():
 
160
  gradcam = VisualizationLLaVA(vl_gpt, target_layers)
161
  elif model_name.split('-')[0] == "ChartGemma":
162
  gradcam = VisualizationChartGemma(vl_gpt, target_layers)
163
+ cam_tensors, grid_size, start = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, i, visual_method, focus, accumulate_method)
164
  cam_grid = cam_tensors.reshape(grid_size, grid_size)
165
  cam_i = generate_gradcam(cam_grid, image)
166
  cam_i = add_title_to_image(cam_i, input_ids_decoded[start + i])
 
168
  gradcam.remove_hooks()
169
  i += 1
170
  else:
171
+ cam_tensors, grid_size, start = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_method, focus, accumulate_method)
172
  if target_token_idx != -1:
173
  input_text_decoded = input_ids_decoded[start + target_token_idx]
174
  for i, cam_tensor in enumerate(cam_tensors):
 
379
  response_type = gr.Dropdown(choices=["Visualization only"], value="Visualization only", label="response_type")
380
  focus = gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus")
381
  activation_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="visualization type")
382
+ accumulate_method = gr.Dropdown(choices=["sum", "mult"], value="sum", label="layers accumulate method")
383
  visual_method = gr.Dropdown(choices=["CLS", "max", "avg"], value="CLS", label="visual pooling method")
384
 
385
 
 
512
  understanding_button.click(
513
  multimodal_understanding,
514
  inputs=[model_selector, activation_map_method, visual_method, image_input, question_input, und_seed_input, top_p, temperature, target_token_idx,
515
+ visualization_layers_min, visualization_layers_max, focus, response_type, chart_type, accumulate_method],
516
  outputs=[understanding_output, activation_map_output, understanding_target_token_decoded_output]
517
  )
518
 
demo/visualization.py CHANGED
@@ -196,7 +196,7 @@ class Visualization:
196
  cam_sum_lst.append(cam_sum)
197
  return cam_sum_lst, grid_size
198
 
199
- def process_multiple_withsum(self, cams, start_idx, images_seq_mask, normalize=False):
200
  cam_sum_lst = []
201
  for i in range(start_idx, cams[0].shape[1]):
202
  cam_sum = None
@@ -217,7 +217,10 @@ class Visualization:
217
  if cam_sum == None:
218
  cam_sum = cam_reshaped
219
  else:
220
- cam_sum += cam_reshaped
 
 
 
221
 
222
  cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
223
  cam_sum_lst.append(cam_sum)
@@ -316,7 +319,7 @@ class VisualizationJanus(Visualization):
316
  self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
317
 
318
  @spaces.GPU(duration=120)
319
- def generate_cam(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder"):
320
 
321
  self.setup_grads()
322
 
@@ -368,7 +371,7 @@ class VisualizationLLaVA(Visualization):
368
  self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
369
 
370
  @spaces.GPU(duration=120)
371
- def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder"):
372
 
373
  self.setup_grads()
374
  self.forward_backward(inputs)
@@ -388,7 +391,7 @@ class VisualizationLLaVA(Visualization):
388
  # Aggregate activations and gradients from ALL layers
389
  start_idx = last + 1
390
  cams = self.attn_guided_cam()
391
- cam_sum_lst, grid_size = self.process_multiple_withsum(cams, start_idx, images_seq_mask)
392
 
393
  return cam_sum_lst, grid_size, start_idx
394
 
@@ -424,7 +427,7 @@ class VisualizationChartGemma(Visualization):
424
  self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
425
 
426
  @spaces.GPU(duration=120)
427
- def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder"):
428
 
429
  # Forward pass
430
  self.setup_grads()
@@ -453,7 +456,7 @@ class VisualizationChartGemma(Visualization):
453
  elif focus == "Language Model":
454
 
455
  cams = self.attn_guided_cam()
456
- cam_sum_lst, grid_size = self.process_multiple_withsum(cams, start_idx, images_seq_mask)
457
 
458
  # cams shape: [layers, 1, seq_len, seq_len]
459
 
 
196
  cam_sum_lst.append(cam_sum)
197
  return cam_sum_lst, grid_size
198
 
199
+ def process_multiple_acc(self, cams, start_idx, images_seq_mask, normalize=False, accumulate_method="sum"):
200
  cam_sum_lst = []
201
  for i in range(start_idx, cams[0].shape[1]):
202
  cam_sum = None
 
217
  if cam_sum == None:
218
  cam_sum = cam_reshaped
219
  else:
220
+ if accumulate_method == "sum":
221
+ cam_sum += cam_reshaped
222
+ elif accumulate_method == "mult":
223
+ cam_sum *= cam_reshaped + 1
224
 
225
  cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
226
  cam_sum_lst.append(cam_sum)
 
319
  self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
320
 
321
  @spaces.GPU(duration=120)
322
+ def generate_cam(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder", accumulate_method="sum"):
323
 
324
  self.setup_grads()
325
 
 
371
  self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
372
 
373
  @spaces.GPU(duration=120)
374
+ def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder", accumulate_method="sum"):
375
 
376
  self.setup_grads()
377
  self.forward_backward(inputs)
 
391
  # Aggregate activations and gradients from ALL layers
392
  start_idx = last + 1
393
  cams = self.attn_guided_cam()
394
+ cam_sum_lst, grid_size = self.process_multiple_acc(cams, start_idx, images_seq_mask, accumulate_method=accumulate_method)
395
 
396
  return cam_sum_lst, grid_size, start_idx
397
 
 
427
  self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
428
 
429
  @spaces.GPU(duration=120)
430
+ def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder", accumulate_method="sum"):
431
 
432
  # Forward pass
433
  self.setup_grads()
 
456
  elif focus == "Language Model":
457
 
458
  cams = self.attn_guided_cam()
459
+ cam_sum_lst, grid_size = self.process_multiple_acc(cams, start_idx, images_seq_mask, accumulate_method=accumulate_method)
460
 
461
  # cams shape: [layers, 1, seq_len, seq_len]
462