Explainable-Vision-Language-Model

Running on Zero

App Files Files Community

khang119966 committed 21 days ago

Commit

c3d33a4

verified ·

1 Parent(s): 32c2424

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -4

app.py CHANGED Viewed

@@ -285,13 +285,13 @@ def visualize_attention_hiddenstate(attention_tensor, head=None, start_img_token
     averaged_layer = np.mean(last_8_layers,axis=0)  # Trung bình 8 layer cuối
     if head is None:
-        averaged_attention = averaged_layer.mean(axis=1).squeeze()  # Trung bình qua các head
     else:
-        averaged_attention = averaged_layer[:, head, :, :].squeeze()  # Chọn head cụ thể
     heat_maps = []
     top_5_tokens = []
     for i in range(len(averaged_attention)):  # Duyệt qua các beam
         h_target_aspect_ratio = target_aspect_ratio[1] if target_aspect_ratio[1] != 0 else 1
         w_target_aspect_ratio = target_aspect_ratio[0] if target_aspect_ratio[0] != 0 else 1
@@ -306,11 +306,13 @@ def visualize_attention_hiddenstate(attention_tensor, head=None, start_img_token
         # Reshape lại attention để vẽ heatmap
         img_atten_score = img_atten_score.reshape(h_target_aspect_ratio, w_target_aspect_ratio, 16, 16)
         img_atten_score = np.transpose(img_atten_score, (0, 2, 1, 3)).reshape(h_target_aspect_ratio * 16, w_target_aspect_ratio * 16)
         img_atten_score = np.power(img_atten_score, 0.9)
-        heat_maps.append(img_atten_score)
     return heat_maps, top_5_tokens
@@ -379,6 +381,21 @@ def generate_video(image, prompt, max_tokens):
     response, query = model.chat(tokenizer, pixel_values, '<image>\n'+prompt, generation_config, return_history=False, \
                             attention_visualize=True,last_visualize_layers=7,raw_image_path=image,target_aspect_ratio=target_aspect_ratio)
     generation_output = response
     raw_image_path = image

     averaged_layer = np.mean(last_8_layers,axis=0)  # Trung bình 8 layer cuối
     if head is None:
+        averaged_attention = averaged_layer.mean(axis=1)  # Trung bình qua các head
     else:
+        averaged_attention = averaged_layer[:, head, :, :]  # Chọn head cụ thể
     heat_maps = []
     top_5_tokens = []
     for i in range(len(averaged_attention)):  # Duyệt qua các beam
         h_target_aspect_ratio = target_aspect_ratio[1] if target_aspect_ratio[1] != 0 else 1
         w_target_aspect_ratio = target_aspect_ratio[0] if target_aspect_ratio[0] != 0 else 1
         # Reshape lại attention để vẽ heatmap
         img_atten_score = img_atten_score.reshape(h_target_aspect_ratio, w_target_aspect_ratio, 16, 16)
         img_atten_score = np.transpose(img_atten_score, (0, 2, 1, 3)).reshape(h_target_aspect_ratio * 16, w_target_aspect_ratio * 16)
         img_atten_score = np.power(img_atten_score, 0.9)
+    heat_maps.append(img_atten_score)
     return heat_maps, top_5_tokens
     response, query = model.chat(tokenizer, pixel_values, '<image>\n'+prompt, generation_config, return_history=False, \
                             attention_visualize=True,last_visualize_layers=7,raw_image_path=image,target_aspect_ratio=target_aspect_ratio)
+    ###### GET GOOD BEAM #####
+    response_attentions_list = []
+    response_hidden_states_list = []
+    for index in range(len(response.beam_indices[0])):
+        beam_indice = response.beam_indices[0][index]
+        layer_response_attentions_list = []
+        layer_response_hidden_states_list = []
+        for layer_index in range(len(response.attentions[index])):
+            layer_response_attentions_list.append(torch.unsqueeze(response.attentions[index][layer_index][beam_indice],0))
+            layer_response_hidden_states_list.append(torch.unsqueeze(response.hidden_states[index][layer_index][beam_indice],0))
+        response_attentions_list.append(layer_response_attentions_list)
+        response_hidden_states_list.append(layer_response_hidden_states_list)
+    response.attentions = response_attentions_list
+    response.hidden_states = response_hidden_states_list
     generation_output = response
     raw_image_path = image