AustingDong committed on
Commit a594e78 · 1 Parent(s): 8bfad75
Files changed (3)
  1. app.py +17 -12
  2. demo/cam.py +5 -4
  3. demo/model_utils.py +24 -14
app.py CHANGED
@@ -110,12 +110,12 @@ def multimodal_understanding(model_type,
 
     input_ids = prepare_inputs.input_ids[0].cpu().tolist()
     input_ids_decoded = [tokenizer.decode([input_ids[i]]) for i in range(len(input_ids))]
-    if model_name.split('-')[0] == "Janus":
-        start = 620
-    elif model_name.split('-')[0] == "ChartGemma":
-        start = 1024
-    elif model_name.split('-')[0] == "LLaVA":
-        start = 581
+    # if model_name.split('-')[0] == "Janus":
+    #     start = 620
+    # elif model_name.split('-')[0] == "ChartGemma":
+    #     start = 1024
+    # elif model_name.split('-')[0] == "LLaVA":
+    #     start = 581
 
     if activation_map_method == "GradCAM":
         # target_layers = vl_gpt.vision_model.vision_tower.blocks
@@ -136,7 +136,11 @@
         elif model_name.split('-')[0] == "ChartGemma":
             gradcam = AttentionGuidedCAMChartGemma(vl_gpt, target_layers)
 
-        cam_tensors, grid_size = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_pooling_method, focus)
+        start = 0
+        if focus == "Visual Encoder":
+            cam_tensors, grid_size = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_pooling_method, focus)
+        else:
+            cam_tensors, grid_size, start = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_pooling_method, focus)
         gradcam.remove_hooks()
 
 
@@ -207,14 +211,15 @@ def model_slider_change(model_type):
         clean()
         set_seed()
         model_utils = LLaVA_Utils()
-        vl_gpt, tokenizer = model_utils.init_LLaVA()
-        language_model_max_layer = 32
+        version = model_type.split('-')[1]
+        vl_gpt, tokenizer = model_utils.init_LLaVA(version=version)
+        language_model_max_layer = 32 if version == "1.5" else 28
         language_model_best_layer = 10
 
         res = (
             gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="answer + visualization", label="response_type"),
-            gr.Slider(minimum=1, maximum=32, value=10, step=1, label="visualization layers min"),
-            gr.Slider(minimum=1, maximum=32, value=10, step=1, label="visualization layers max"),
+            gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=language_model_max_layer, value=language_model_best_layer, step=1, label="visualization layers max"),
             gr.Dropdown(choices=["Language Model"], value="Language Model", label="focus"),
             gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")
         )
@@ -286,7 +291,7 @@ with gr.Blocks() as demo:
             activation_map_output = gr.Gallery(label="activation Map", height=300, columns=1)
 
         with gr.Column():
-            model_selector = gr.Dropdown(choices=["Clip", "ChartGemma-3B", "Janus-1B", "Janus-7B", "LLaVA-1.5-7B"], value="Clip", label="model")
+            model_selector = gr.Dropdown(choices=["Clip", "ChartGemma-3B", "Janus-1B", "Janus-7B", "LLaVA-1.5-7B", "LLaVA-onevision-qwen2-7b-si"], value="Clip", label="model")
             response_type = gr.Dropdown(choices=["Visualization only"], value="Visualization only", label="response_type")
             focus = gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus")
             activation_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="activation map type")
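
Caller-side note: after this change, `generate_cam` returns `(cam_tensors, grid_size)` when the focus is the Visual Encoder and `(cam_tensors, grid_size, start)` when it is the Language Model, so app.py no longer hard-codes per-model start offsets. Below is a minimal sketch of how the returned `start` could be used to pair each activation map with its decoded token; the pairing is illustrative only and assumes `start + i` indexes into `input_ids_decoded`:

    # Illustrative sketch, not code from this commit: label each CAM with the
    # token at sequence position start + i (falling back for generated tokens).
    for i, cam in enumerate(cam_tensors):
        idx = start + i
        token = input_ids_decoded[idx] if idx < len(input_ids_decoded) else "<generated>"
        print(f"CAM {i}: token {token!r}")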
demo/cam.py CHANGED
@@ -274,7 +274,8 @@ class AttentionGuidedCAMJanus(AttentionGuidedCAM):
         # cam_sum shape: [1, seq_len, seq_len]
         cam_sum_lst = []
         cam_sum_raw = cam_sum
-        for i in range(620, cam_sum_raw.shape[1]):
+        start = 620
+        for i in range(start, cam_sum_raw.shape[1]):
             cam_sum = cam_sum_raw[:, i, :] # shape: [1: seq_len]
             cam_sum = cam_sum[input_tensor.images_seq_mask].unsqueeze(0) # shape: [1, 576]
             print("cam_sum shape: ", cam_sum.shape)
@@ -290,7 +291,7 @@ class AttentionGuidedCAMJanus(AttentionGuidedCAM):
             cam_sum_lst.append(cam_sum)
 
 
-        return cam_sum_lst, grid_size
+        return cam_sum_lst, grid_size, start
 
     # Aggregate activations and gradients from ALL layers
 
@@ -407,7 +408,7 @@ class AttentionGuidedCAMLLaVA(AttentionGuidedCAM):
             cam_sum_lst.append(cam_sum)
 
 
-        return cam_sum_lst, grid_size
+        return cam_sum_lst, grid_size, start_idx
 
 
 
@@ -556,7 +557,7 @@ class AttentionGuidedCAMChartGemma(AttentionGuidedCAM):
             cam_sum_lst.append(cam_sum)
 
 
-        return cam_sum_lst, grid_size
+        return cam_sum_lst, grid_size, start_idx
 
 
 
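
All three `generate_cam` variants now return the index of the first visualized token (`start` / `start_idx`) alongside the per-token CAM list, where each entry is masked down to the 576 visual tokens. As a side note, here is a minimal sketch of turning one such `[1, 576]` row into an image-sized heatmap, assuming a square 24x24 grid (576 = 24 * 24) and bilinear upsampling; this helper is illustrative and not part of demo/cam.py:

    import torch
    import torch.nn.functional as F

    def cam_row_to_heatmap(cam_row: torch.Tensor, grid_size: int = 24, out_size: int = 384) -> torch.Tensor:
        """Reshape a [1, grid_size*grid_size] CAM row to a grid, min-max normalize it,
        and upsample it to an out_size x out_size heatmap."""
        grid = cam_row.reshape(1, 1, grid_size, grid_size).float()
        grid = (grid - grid.min()) / (grid.max() - grid.min() + 1e-8)
        return F.interpolate(grid, size=(out_size, out_size), mode="bilinear", align_corners=False).squeeze()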
 
demo/model_utils.py CHANGED
@@ -2,7 +2,7 @@ import torch
 import numpy as np
 import spaces
 from PIL import Image, ImageDraw, ImageFont
-from transformers import AutoConfig, AutoModelForCausalLM, LlavaForConditionalGeneration, LlavaNextForConditionalGeneration, LlavaNextProcessor, AutoProcessor, PaliGemmaForConditionalGeneration
+from transformers import AutoConfig, AutoModelForCausalLM, LlavaForConditionalGeneration, LlavaOnevisionForConditionalGeneration, LlavaNextForConditionalGeneration, LlavaNextProcessor, AutoProcessor, PaliGemmaForConditionalGeneration
 from transformers import CLIPProcessor, CLIPModel
 from janus.models import MultiModalityCausalLM, VLChatProcessor
 
@@ -117,19 +117,29 @@ class LLaVA_Utils(Model_Utils):
     def __init__(self):
         super().__init__()
 
-    def init_LLaVA(self):
-
-        model_path = "llava-hf/llava-1.5-7b-hf"
-        config = AutoConfig.from_pretrained(model_path)
-
-        self.vl_gpt = LlavaForConditionalGeneration.from_pretrained(model_path,
-                                                                    low_cpu_mem_usage=True,
-                                                                    attn_implementation = 'eager',
-                                                                    output_attentions=True
-                                                                    )
-        self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt)
-        self.processor = AutoProcessor.from_pretrained(model_path)
-        self.tokenizer = self.processor.tokenizer
+    def init_LLaVA(self, version):
+        if version == "1.5":
+            model_path = "llava-hf/llava-1.5-7b-hf"
+            config = AutoConfig.from_pretrained(model_path)
+
+            self.vl_gpt = LlavaForConditionalGeneration.from_pretrained(model_path,
+                                                                        low_cpu_mem_usage=True,
+                                                                        attn_implementation = 'eager',
+                                                                        output_attentions=True
+                                                                        )
+            self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt)
+            self.processor = AutoProcessor.from_pretrained(model_path)
+            self.tokenizer = self.processor.tokenizer
+
+        else:
+            model_path = "llava-hf/llava-onevision-qwen2-7b-si-hf"
+
+            self.processor = AutoProcessor.from_pretrained(model_path)
+
+            self.vl_gpt = LlavaOnevisionForConditionalGeneration.from_pretrained(model_path,
+                                                                                 torch_dtype=torch.float16,
+                                                                                 low_cpu_mem_usage=True)
+            self.tokenizer = self.processor.tokenizer
 
         return self.vl_gpt, self.tokenizer
 
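
Usage note: app.py derives the version string from the model dropdown value, e.g. "LLaVA-1.5-7B".split('-')[1] gives "1.5" and "LLaVA-onevision-qwen2-7b-si".split('-')[1] gives "onevision", which falls into the OneVision branch. A minimal loading sketch follows; the import path assumes app.py's layout with model_utils under demo/:

    # Illustrative only: mirror how app.py selects and loads the LLaVA variant.
    from demo.model_utils import LLaVA_Utils

    model_type = "LLaVA-onevision-qwen2-7b-si"   # or "LLaVA-1.5-7B"
    version = model_type.split('-')[1]           # "onevision" or "1.5"

    model_utils = LLaVA_Utils()
    vl_gpt, tokenizer = model_utils.init_LLaVA(version=version)
    # Any version other than "1.5" loads llava-hf/llava-onevision-qwen2-7b-si-hf in float16.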