AustingDong committed on
Commit
63b5fc2
·
1 Parent(s): 9cae7ef
Files changed (2)
  1. app.py +3 -3
  2. demo/visualization.py +27 -57
app.py CHANGED
@@ -258,7 +258,7 @@ with gr.Blocks() as demo:
        activation_map_output = gr.Gallery(label="Visualization", height=500, columns=1, preview=True)

        with gr.Row():
-           chart_type = gr.Textbox(label="Chart Type")
+           question_input = gr.Textbox(label="Question")
            understanding_output = gr.Textbox(label="Answer")

        with gr.Row():
@@ -266,7 +266,7 @@
            with gr.Column():
                model_selector = gr.Dropdown(choices=["ChartGemma-3B", "Janus-Pro-1B", "Janus-Pro-7B", "LLaVA-1.5-7B"], value="ChartGemma-3B", label="model")
                test_selector = gr.Dropdown(choices=["mini-VLAT", "VLAT", "VLAT-old"], value="mini-VLAT", label="test")
-               question_input = gr.Textbox(label="Input Prompt")
+               chart_type = gr.Textbox(label="Chart Type", value="Any")
                und_seed_input = gr.Number(label="Seed", precision=0, value=42)
                top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
                temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")
@@ -275,7 +275,7 @@

            with gr.Column():
                response_type = gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="answer + visualization", label="response_type")
-               focus = gr.Dropdown(choices=["question", "question + answer"], value="question + answer", label="focus")
+               focus = gr.Dropdown(choices=["question", "question + answer"], value="question", label="focus")
                activation_map_method = gr.Dropdown(choices=["AG-CAM"], value="AG-CAM", label="visualization type")
                accumulate_method = gr.Dropdown(choices=["sum", "mult"], value="sum", label="layers accumulate method")
                visual_method = gr.Dropdown(choices=["softmax", "sigmoid"], value="softmax", label="activation function")
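Taken together, the app.py changes move the question box up next to the answer box, relocate the chart-type box into the settings column with a default of "Any", and flip the focus default from "question + answer" to "question". A minimal, self-contained sketch of that rearranged layout is below; the run stub and the button wiring are assumptions for illustration, not code from this commit.

import gradio as gr

# Hypothetical stub standing in for the app's real inference callback (not part of this commit).
def run(question, chart_type, focus):
    return f"[stub] focus={focus}, chart_type={chart_type}, question={question}"

with gr.Blocks() as demo:
    activation_map_output = gr.Gallery(label="Visualization", height=500, columns=1, preview=True)

    with gr.Row():
        # Question box now sits beside the answer box instead of in the settings column.
        question_input = gr.Textbox(label="Question")
        understanding_output = gr.Textbox(label="Answer")

    with gr.Row():
        with gr.Column():
            # Chart type moves down here and now defaults to "Any".
            chart_type = gr.Textbox(label="Chart Type", value="Any")
        with gr.Column():
            # Focus now defaults to "question" rather than "question + answer".
            focus = gr.Dropdown(choices=["question", "question + answer"], value="question", label="focus")

    run_btn = gr.Button("Run")
    run_btn.click(run, inputs=[question_input, chart_type, focus], outputs=understanding_output)

if __name__ == "__main__":
    demo.launch()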
demo/visualization.py CHANGED
@@ -296,7 +296,7 @@ class VisualizationJanus(Visualization):
        self._modify_layers()
        self._register_hooks_activations()

-   def forward_backward(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder"):
+   def forward_backward(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Language Model"):
        # Forward
        image_embeddings, inputs_embeddings, outputs = self.model(input_tensor, tokenizer, temperature, top_p)
        print(input_tensor.keys())
@@ -304,24 +304,18 @@
        start_idx = 620
        self.model.zero_grad()

-       if focus == "Visual Encoder":
-           loss = outputs.logits.max(dim=-1).values[0, start_idx + target_token_idx]
-           loss.backward()
-
-       elif focus == "Language Model":
-           if target_token_idx == -1:
-               loss = outputs.logits.max(dim=-1).values.sum()
-           else:
-               loss = outputs.logits.max(dim=-1).values[0, start_idx + target_token_idx]
-           loss.backward()
-
-       self.activations = self.activations = [layer.attn_sigmoid_weights for layer in self.target_layers] if visual_method == "sigmoid" else [layer.get_attn_map() for layer in self.target_layers]
-       self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
+       logits = outputs.logits
+       if target_token_idx == -1:
+           loss = logits.max(dim=-1).values.sum()
+       else:
+           loss = logits.max(dim=-1).values[0, start_idx + target_token_idx]
+       loss.backward()
+
+       self.activations = self.activations = [layer.attn_sigmoid_weights for layer in self.target_layers] if visual_method == "sigmoid" else [layer.get_attn_map() for layer in self.target_layers]
+       self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]

    @spaces.GPU(duration=120)
-   def generate_cam(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder", accumulate_method="sum"):
+   def generate_cam(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Language Model", accumulate_method="sum"):

        self.setup_grads()

@@ -329,25 +323,14 @@
        self.forward_backward(input_tensor, tokenizer, temperature, top_p, target_token_idx, visual_method, focus)

        start_idx = 620
-       if focus == "Visual Encoder":
-
-           cam_sum = self.grad_cam_vis()
-           cam_sum, grid_size = self.process(cam_sum)
-           return cam_sum, grid_size, start_idx
-
-       elif focus == "Language Model":
-
-           # cam_sum = self.grad_cam_llm(mean_inside=True)

-           images_seq_mask = input_tensor.images_seq_mask[0].detach().cpu().tolist()
-
-           # cam_sum_lst, grid_size = self.process_multiple(cam_sum, start_idx, images_seq_mask)
+       images_seq_mask = input_tensor.images_seq_mask[0].detach().cpu().tolist()

-           cams = self.attn_guided_cam()
-           cam_sum_lst, grid_size = self.process_multiple_acc(cams, start_idx, images_seq_mask, accumulate_method=accumulate_method)
+       cams = self.attn_guided_cam()
+       cam_sum_lst, grid_size = self.process_multiple_acc(cams, start_idx, images_seq_mask, accumulate_method=accumulate_method)

-           return cam_sum_lst, grid_size, start_idx
+       return cam_sum_lst, grid_size, start_idx

@@ -371,13 +354,14 @@ class VisualizationLLaVA(Visualization):
        self.model.zero_grad()
        print("outputs_raw", outputs_raw)

-       loss = outputs_raw.logits.max(dim=-1).values.sum()
+       logits = outputs_raw.logits
+       loss = logits.max(dim=-1).values.sum()
        loss.backward()
        self.activations = [layer.get_attn_map() for layer in self.target_layers]
        self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]

    @spaces.GPU(duration=120)
-   def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder", accumulate_method="sum"):
+   def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Language Model", accumulate_method="sum"):

        self.setup_grads()
        self.forward_backward(inputs)
@@ -416,29 +400,23 @@ class VisualizationChartGemma(Visualization):

    def forward_backward(self, inputs, focus, start_idx, target_token_idx, visual_method="softmax"):
        outputs_raw = self.model(**inputs, output_hidden_states=True)
-       if focus == "Visual Encoder":
-
-           self.model.zero_grad()
-
-           loss = outputs_raw.logits.max(dim=-1).values[0, start_idx + target_token_idx]
-           loss.backward()
-
-       elif focus == "Language Model":
+       if focus == "Language Model":
            self.model.zero_grad()
            print("logits shape:", outputs_raw.logits.shape)
            print("start_idx:", start_idx)
-           if target_token_idx == -1:
-               logits_prob = F.softmax(outputs_raw.logits, dim=-1)
-               loss = logits_prob.max(dim=-1).values.sum()

+           logits = outputs_raw.logits
+
+           if target_token_idx == -1:
+               loss = logits.max(dim=-1).values.sum()
            else:
-               loss = outputs_raw.logits.max(dim=-1).values[0, start_idx + target_token_idx]
+               loss = logits.max(dim=-1).values[0, start_idx + target_token_idx]
            loss.backward()
        self.activations = [layer.attn_sigmoid_weights for layer in self.target_layers] if visual_method == "sigmoid" else [layer.get_attn_map() for layer in self.target_layers]
        self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]

    @spaces.GPU(duration=120)
-   def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder", accumulate_method="sum"):
+   def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Language Model", accumulate_method="sum"):

        # Forward pass
        self.setup_grads()
@@ -457,19 +435,11 @@ class VisualizationChartGemma(Visualization):


        self.forward_backward(inputs, focus, start_idx, target_token_idx, visual_method)
-       if focus == "Visual Encoder":
-
-           cam_sum = self.grad_cam_vis()
-           cam_sum, grid_size = self.process(cam_sum, remove_cls=False)
-
-           return cam_sum, grid_size, start_idx

-       elif focus == "Language Model":
-
-           cams = self.attn_guided_cam()
-           cam_sum_lst, grid_size = self.process_multiple_acc(cams, start_idx, images_seq_mask, accumulate_method=accumulate_method)
+       cams = self.attn_guided_cam()
+       cam_sum_lst, grid_size = self.process_multiple_acc(cams, start_idx, images_seq_mask, accumulate_method=accumulate_method)

-           # cams shape: [layers, 1, seq_len, seq_len]
+       # cams shape: [layers, 1, seq_len, seq_len]
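Across VisualizationJanus, VisualizationLLaVA and VisualizationChartGemma, the commit standardizes the backward pass on the language-model logits: the loss is either the sum of per-position max logits (target_token_idx == -1) or the max logit at a single answer position offset by start_idx. The sketch below reproduces just that objective on a dummy logits tensor; the tensor shape, start_idx value and token index are illustrative assumptions, not values taken from the repo.

import torch

# Dummy language-model output: batch 1, 700 positions, vocabulary of 32 (illustrative shape).
logits = torch.randn(1, 700, 32, requires_grad=True)

start_idx = 620          # assumed offset of the first answer token
target_token_idx = 2     # set to -1 to aggregate over every position instead

if target_token_idx == -1:
    # Sum of per-position max logits: gradients flow back from the whole sequence.
    loss = logits.max(dim=-1).values.sum()
else:
    # Max logit at one answer position: gradients reflect that single token only.
    loss = logits.max(dim=-1).values[0, start_idx + target_token_idx]

loss.backward()

# Count how many sequence positions actually received gradient.
positions_with_grad = (logits.grad.abs().sum(dim=-1) > 0).sum().item()
print(positions_with_grad)   # 1 for a single target token, 700 when summing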
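Both Janus and ChartGemma now route every request through attn_guided_cam followed by process_multiple_acc with the UI-selected accumulate_method ("sum" or "mult"). That helper's body is not part of this diff; as a rough, generic illustration of what summing versus multiplying per-layer relevance maps does, here is a hedged sketch with assumed shapes and normalization.

import torch

def accumulate_layers(cams, accumulate_method="sum"):
    # cams: list of per-layer [H, W] relevance maps (assumed shape, not the repo's exact format).
    stacked = torch.stack(cams)                                   # [L, H, W]
    if accumulate_method == "sum":
        combined = stacked.sum(dim=0)                             # additive evidence across layers
    elif accumulate_method == "mult":
        combined = torch.prod(stacked.clamp(min=1e-6), dim=0)     # a region must be salient in every layer
    else:
        raise ValueError(f"unknown accumulate_method: {accumulate_method}")
    combined = combined - combined.min()                          # min-max normalize for display
    return combined / (combined.max() + 1e-8)

layer_maps = [torch.rand(24, 24) for _ in range(4)]               # four dummy layers
print(accumulate_layers(layer_maps, "sum").shape, accumulate_layers(layer_maps, "mult").shape)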