AustingDong committed · Commit 63b5fc2
1 Parent(s): 9cae7ef

align

Files changed:
- app.py (+3, -3)
- demo/visualization.py (+27, -57)
app.py
CHANGED
@@ -258,7 +258,7 @@ with gr.Blocks() as demo:
         activation_map_output = gr.Gallery(label="Visualization", height=500, columns=1, preview=True)

         with gr.Row():
-
+            question_input = gr.Textbox(label="Question")
             understanding_output = gr.Textbox(label="Answer")

         with gr.Row():
@@ -266,7 +266,7 @@ with gr.Blocks() as demo:
             with gr.Column():
                 model_selector = gr.Dropdown(choices=["ChartGemma-3B", "Janus-Pro-1B", "Janus-Pro-7B", "LLaVA-1.5-7B"], value="ChartGemma-3B", label="model")
                 test_selector = gr.Dropdown(choices=["mini-VLAT", "VLAT", "VLAT-old"], value="mini-VLAT", label="test")
-
+                chart_type = gr.Textbox(label="Chart Type", value="Any")
                 und_seed_input = gr.Number(label="Seed", precision=0, value=42)
                 top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
                 temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")
@@ -275,7 +275,7 @@ with gr.Blocks() as demo:

             with gr.Column():
                 response_type = gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="answer + visualization", label="response_type")
-                focus = gr.Dropdown(choices=["question", "question + answer"], value="question
+                focus = gr.Dropdown(choices=["question", "question + answer"], value="question", label="focus")
                 activation_map_method = gr.Dropdown(choices=["AG-CAM"], value="AG-CAM", label="visualization type")
                 accumulate_method = gr.Dropdown(choices=["sum", "mult"], value="sum", label="layers accumulate method")
                 visual_method = gr.Dropdown(choices=["softmax", "sigmoid"], value="softmax", label="activation function")
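The three added controls are standard Gradio widgets. For reference only, the minimal, self-contained sketch below shows the same components wired to a placeholder handler; the echo function, the answer box, and the Run button are illustrative assumptions and are not code from app.py, whose click wiring this diff does not show.

import gradio as gr

# Standalone illustration of the controls added in this commit.
# `echo`, `answer`, and `run` are hypothetical names, not taken from app.py.
def echo(question, chart_type, focus):
    return f"focus={focus} | chart type={chart_type} | question={question}"

with gr.Blocks() as demo:
    question_input = gr.Textbox(label="Question")
    chart_type = gr.Textbox(label="Chart Type", value="Any")
    focus = gr.Dropdown(choices=["question", "question + answer"], value="question", label="focus")
    answer = gr.Textbox(label="Answer")
    run = gr.Button("Run")
    run.click(echo, inputs=[question_input, chart_type, focus], outputs=answer)

if __name__ == "__main__":
    demo.launch()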
demo/visualization.py
CHANGED
@@ -296,7 +296,7 @@ class VisualizationJanus(Visualization):
         self._modify_layers()
         self._register_hooks_activations()

-    def forward_backward(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="
+    def forward_backward(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Language Model"):
         # Forward
         image_embeddings, inputs_embeddings, outputs = self.model(input_tensor, tokenizer, temperature, top_p)
         print(input_tensor.keys())
@@ -304,24 +304,18 @@ class VisualizationJanus(Visualization):
         start_idx = 620
         self.model.zero_grad()

+        logits = outputs.logits
+        if target_token_idx == -1:
+            loss = logits.max(dim=-1).values.sum()
+        else:
+            loss = logits.max(dim=-1).values[0, start_idx + target_token_idx]
+        loss.backward()

-
-
-            loss = outputs.logits.max(dim=-1).values[0, start_idx + target_token_idx]
-            loss.backward()
-
-        elif focus == "Language Model":
-            if target_token_idx == -1:
-                loss = outputs.logits.max(dim=-1).values.sum()
-            else:
-                loss = outputs.logits.max(dim=-1).values[0, start_idx + target_token_idx]
-            loss.backward()
-
-        self.activations = self.activations = [layer.attn_sigmoid_weights for layer in self.target_layers] if visual_method == "sigmoid" else [layer.get_attn_map() for layer in self.target_layers]
-        self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
+        self.activations = self.activations = [layer.attn_sigmoid_weights for layer in self.target_layers] if visual_method == "sigmoid" else [layer.get_attn_map() for layer in self.target_layers]
+        self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]

     @spaces.GPU(duration=120)
-    def generate_cam(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="
+    def generate_cam(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Language Model", accumulate_method="sum"):

         self.setup_grads()

@@ -329,25 +323,14 @@ class VisualizationJanus(Visualization):
         self.forward_backward(input_tensor, tokenizer, temperature, top_p, target_token_idx, visual_method, focus)

         start_idx = 620
-        if focus == "Visual Encoder":
-
-            cam_sum = self.grad_cam_vis()
-            cam_sum, grid_size = self.process(cam_sum)
-            return cam_sum, grid_size, start_idx
-
-        elif focus == "Language Model":
-
-            # cam_sum = self.grad_cam_llm(mean_inside=True)

-
-
-            # cam_sum_lst, grid_size = self.process_multiple(cam_sum, start_idx, images_seq_mask)
+        images_seq_mask = input_tensor.images_seq_mask[0].detach().cpu().tolist()

-
-
+        cams = self.attn_guided_cam()
+        cam_sum_lst, grid_size = self.process_multiple_acc(cams, start_idx, images_seq_mask, accumulate_method=accumulate_method)


-
+        return cam_sum_lst, grid_size, start_idx



@@ -371,13 +354,14 @@ class VisualizationLLaVA(Visualization):
         self.model.zero_grad()
         print("outputs_raw", outputs_raw)

-
+        logits = outputs_raw.logits
+        loss = logits.max(dim=-1).values.sum()
         loss.backward()
         self.activations = [layer.get_attn_map() for layer in self.target_layers]
         self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]

     @spaces.GPU(duration=120)
-    def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="
+    def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Language Model", accumulate_method="sum"):

         self.setup_grads()
         self.forward_backward(inputs)
@@ -416,29 +400,23 @@ class VisualizationChartGemma(Visualization):

     def forward_backward(self, inputs, focus, start_idx, target_token_idx, visual_method="softmax"):
         outputs_raw = self.model(**inputs, output_hidden_states=True)
-        if focus == "
-
-            self.model.zero_grad()
-
-            loss = outputs_raw.logits.max(dim=-1).values[0, start_idx + target_token_idx]
-            loss.backward()
-
-        elif focus == "Language Model":
+        if focus == "Language Model":
             self.model.zero_grad()
             print("logits shape:", outputs_raw.logits.shape)
             print("start_idx:", start_idx)
-            if target_token_idx == -1:
-                logits_prob = F.softmax(outputs_raw.logits, dim=-1)
-                loss = logits_prob.max(dim=-1).values.sum()

+            logits = outputs_raw.logits
+
+            if target_token_idx == -1:
+                loss = logits.max(dim=-1).values.sum()
             else:
-                loss =
+                loss = logits.max(dim=-1).values[0, start_idx + target_token_idx]
             loss.backward()
             self.activations = [layer.attn_sigmoid_weights for layer in self.target_layers] if visual_method == "sigmoid" else [layer.get_attn_map() for layer in self.target_layers]
             self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]

     @spaces.GPU(duration=120)
-    def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="
+    def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Language Model", accumulate_method="sum"):

         # Forward pass
         self.setup_grads()
@@ -457,19 +435,11 @@ class VisualizationChartGemma(Visualization):


         self.forward_backward(inputs, focus, start_idx, target_token_idx, visual_method)
-        if focus == "Visual Encoder":
-
-            cam_sum = self.grad_cam_vis()
-            cam_sum, grid_size = self.process(cam_sum, remove_cls=False)
-
-            return cam_sum, grid_size, start_idx

-
-
-            cams = self.attn_guided_cam()
-            cam_sum_lst, grid_size = self.process_multiple_acc(cams, start_idx, images_seq_mask, accumulate_method=accumulate_method)
+        cams = self.attn_guided_cam()
+        cam_sum_lst, grid_size = self.process_multiple_acc(cams, start_idx, images_seq_mask, accumulate_method=accumulate_method)

-
+        # cams shape: [layers, 1, seq_len, seq_len]


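Taken together, the demo/visualization.py changes drop the separate "Visual Encoder" branches and make every forward_backward follow the same language-model path: compute a max-logit loss (summed over all positions when target_token_idx == -1, otherwise taken at a single generated token offset by start_idx), call loss.backward(), and store per-layer attention maps and gradients for attn_guided_cam / process_multiple_acc. The sketch below restates that shared loss step outside the classes; the helper name and tensor shapes are illustrative assumptions, not code from the repository.

import torch

def max_logit_loss(logits: torch.Tensor, start_idx: int, target_token_idx: int) -> torch.Tensor:
    # Mirrors the loss used in the refactored forward_backward methods:
    # take the highest logit at each sequence position, then either sum over
    # all positions (target_token_idx == -1) or pick one generated token
    # located start_idx positions into the sequence.
    values = logits.max(dim=-1).values  # shape: [batch, seq_len]
    if target_token_idx == -1:
        return values.sum()
    return values[0, start_idx + target_token_idx]

# Dummy shapes for illustration only: batch=1, seq_len=700, vocab=32000.
logits = torch.randn(1, 700, 32000, requires_grad=True)
loss = max_logit_loss(logits, start_idx=620, target_token_idx=3)
loss.backward()  # here this only fills logits.grad; in the real models it populates the hooked attention gradients

The new accumulate_method argument ("sum" or "mult") is simply forwarded to process_multiple_acc; how it combines the per-layer CAMs is defined in that function, whose body is not part of this diff.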