khang119966 committed
Commit f727381 · verified · Parent: 42bde0f

Update app.py

Files changed (1):
  1. app.py (+63 −3)
app.py CHANGED
@@ -450,7 +450,8 @@ tokenizer = AutoTokenizer.from_pretrained("khang119966/Vintern-1B-v3_5-explainab
 @spaces.GPU
 def generate_video(image, prompt, max_tokens):
     print(image)
-    pixel_values, target_aspect_ratio = load_image(image, max_num=6).to(torch.bfloat16).cuda()
+    pixel_values, target_aspect_ratio = load_image(image, max_num=6)
+    pixel_values = pixel_values.to(torch.bfloat16).cuda()
     generation_config = dict(max_new_tokens= int(max_tokens), do_sample=False, num_beams = 3, repetition_penalty=2.5)
     response, query = model.chat(tokenizer, pixel_values, '<image>\n'+prompt, generation_config, return_history=False, \
                                  attention_visualize=True,last_visualize_layers=7,raw_image_path=test_image,target_aspect_ratio=target_aspect_ratio)
@@ -458,6 +459,65 @@ def generate_video(image, prompt, max_tokens):
     generation_output = response
     raw_image_path = image

+    attentions_tensors = []
+    for tok_ in generation_output["attentions"]:
+        attentions_tensors.append([])
+        for lay_ in tok_ :
+            attentions_tensors[-1].append(lay_.detach().cpu().type(torch.float).numpy())
+    attention_scores = attentions_tensors
+    query_ = tokenizer(query)
+    start_img_token_index = int(np.where(np.array(query_["input_ids"])==tokenizer("<img>")["input_ids"][0])[0]+1)
+    end_img_token_index = int(np.where(np.array(query_["input_ids"])==tokenizer("</img>")["input_ids"][0])[0]-256)
+    if end_img_token_index - start_img_token_index == 0 :
+        end_img_token_index = int(np.where(np.array(query_["input_ids"])==tokenizer("</img>")["input_ids"][0])[0])
+
+    # Read the original image
+    image = cv2.imread(raw_image_path)
+    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    # Resize the image to a smaller size to reduce the GIF file size
+    scale_factor = 1.  # reduce size by 50%
+    alpha = 0.4
+    # List of GIF frames
+    visualization_frames = []
+    # Generated text so far
+    generated_text = ""
+    frame_step = 1
+    input_token = ""
+
+    params_for_text = []
+    params_for_hidden = []
+    heatmap_imgs = []
+    top_visual_tokens_focus_tables = []
+    # Loop over each token
+    for index_focus in tqdm.tqdm(range(0, generation_output.sequences.shape[1], frame_step)):
+        predict_token_text = tokenizer.decode(generation_output.sequences[0, index_focus])
+        generated_text += predict_token_text  # Append the decoded token
+        # Build the averaged heatmap from the attention layers
+        heat_maps, top_visual_tokens_focus = visualize_attention_hiddenstate(attention_scores[index_focus], head=None,
+                                                start_img_token_index=start_img_token_index, end_img_token_index=end_img_token_index,
+                                                target_aspect_ratio=target_aspect_ratio)
+
+        heatmap = np.array(heat_maps[0])
+        # Resize the heatmap to the original image size
+        heatmap = cv2.resize(heatmap, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_CUBIC)
+        # Smooth the heatmap
+        heatmap_smooth = gaussian_filter(heatmap, sigma=1)
+        # Normalize the heatmap to 0-255
+        heatmap_norm = cv2.normalize(heatmap_smooth, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
+        heatmap_color = cv2.applyColorMap(heatmap_norm, cv2.COLORMAP_JET)
+        heatmap_color = cv2.cvtColor(heatmap_color, cv2.COLOR_BGR2RGB)
+        # Overlay the heatmap on the original image
+        overlay = cv2.addWeighted(image, 1 - alpha, heatmap_color, alpha, 0)
+
+        prev_text = generated_text[:-len(input_token)-len(predict_token_text)] + " "
+        params_for_text.append((prev_text, input_token, predict_token_text))
+
+        hidden_tabel = extract_next_token_table_data(model, tokenizer, generation_output, index_focus)
+        params_for_hidden.append((hidden_tabel, predict_token_text))
+
+        input_token = predict_token_text
+        heatmap_imgs.append(overlay)
+
     return "path_to_generated_video.mp4"

 with gr.Blocks() as demo:
@@ -468,8 +528,8 @@ with gr.Blocks() as demo:
     image = gr.Image(label="Upload your image", type = 'filepath')
     prompt = gr.Textbox(label="Describe your prompt", value="List all the text." )
     max_tokens = gr.Slider(label="Max token output (⚠️ Choose <100 for faster response)", minimum=1, maximum=512, value=50)
-    btn = gr.Button("Attenion Video")
-    video = gr.Video(label="Attenion Video")
+    btn = gr.Button("Inference")
+    video = gr.Video(label="Visualization Video")

     btn.click(fn=generate_video, inputs=[image, prompt, max_tokens], outputs=video)
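Note on the first hunk: the split into two lines works because load_image() here appears to return a (pixel_values, target_aspect_ratio) tuple rather than a tensor, so dtype and device conversion must happen after unpacking. A minimal sketch of that pattern, assuming a load_image() with that return shape (the helper itself lives elsewhere in app.py):

import torch

def prepare_inputs(image_path, max_num=6):
    # load_image() is assumed to return (pixel_values, target_aspect_ratio);
    # calling .to()/.cuda() on the tuple itself would raise AttributeError.
    pixel_values, target_aspect_ratio = load_image(image_path, max_num=max_num)
    pixel_values = pixel_values.to(torch.bfloat16)
    if torch.cuda.is_available():
        pixel_values = pixel_values.cuda()
    return pixel_values, target_aspect_ratio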
 
 
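For reference, a self-contained sketch of the attention-collection step added in the second hunk: each generated token carries a list of per-layer attention tensors, which are detached and converted to float32 NumPy arrays on the CPU. The tensor shapes below are dummy placeholders, not the model's real ones:

import torch

# Dummy stand-in: 3 generated tokens, 7 layers, attention of shape (1, heads, q, k).
attentions = [[torch.rand(1, 8, 1, 32) for _ in range(7)] for _ in range(3)]

attention_scores = [
    [layer.detach().cpu().to(torch.float32).numpy() for layer in token_attentions]
    for token_attentions in attentions
]
print(len(attention_scores), attention_scores[0][0].shape)  # 3 (1, 8, 1, 32)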
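The <img>/</img> bookkeeping in the same hunk locates the span of visual tokens inside the prompt with np.where. A toy illustration with made-up token ids (the real ids come from the model's tokenizer):

import numpy as np

IMG_START, IMG_END = 90, 91                       # illustrative ids for <img> and </img>
input_ids = np.array([1, 5, 90, 7, 7, 7, 7, 91, 2])

start_img_token_index = int(np.where(input_ids == IMG_START)[0][0]) + 1
end_img_token_index = int(np.where(input_ids == IMG_END)[0][0])
visual_token_ids = input_ids[start_img_token_index:end_img_token_index]  # the four 7s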
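The per-token frame rendering in the loop boils down to: resize the attention map to the image, smooth it, normalize to 0-255, colour-map, and alpha-blend. A standalone sketch using the same cv2/scipy calls, with a dummy image and map standing in for the real inputs:

import cv2
import numpy as np
from scipy.ndimage import gaussian_filter

def overlay_heatmap(image_rgb, heatmap, alpha=0.4):
    # Resize the raw attention map to the image resolution.
    heatmap = cv2.resize(heatmap, (image_rgb.shape[1], image_rgb.shape[0]), interpolation=cv2.INTER_CUBIC)
    # Smooth, then normalize to the 0-255 range expected by applyColorMap.
    heatmap = gaussian_filter(heatmap, sigma=1)
    heatmap = cv2.normalize(heatmap, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    heatmap = cv2.cvtColor(cv2.applyColorMap(heatmap, cv2.COLORMAP_JET), cv2.COLOR_BGR2RGB)
    # Alpha-blend the colour-mapped attention onto the original image.
    return cv2.addWeighted(image_rgb, 1 - alpha, heatmap, alpha, 0)

# Dummy 256x256 RGB image and a 16x16 attention map.
frame = overlay_heatmap(np.zeros((256, 256, 3), np.uint8), np.random.rand(16, 16).astype(np.float32))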
  return "path_to_generated_video.mp4"
522
 
523
  with gr.Blocks() as demo:
 
528
  image = gr.Image(label="Upload your image", type = 'filepath')
529
  prompt = gr.Textbox(label="Describe your prompt", value="List all the text." )
530
  max_tokens = gr.Slider(label="Max token output (⚠️ Choose <100 for faster response)", minimum=1, maximum=512, value=50)
531
+ btn = gr.Button("Inference")
532
+ video = gr.Video(label="Visualization Video")
533
 
534
  btn.click(fn=generate_video, inputs=[image, prompt, max_tokens], outputs=video)
535