Spaces: Running on Zero
Update app.py
Browse files
app.py
CHANGED
@@ -450,7 +450,8 @@ tokenizer = AutoTokenizer.from_pretrained("khang119966/Vintern-1B-v3_5-explainab
|
|
450 |
@spaces.GPU
|
451 |
def generate_video(image, prompt, max_tokens):
|
452 |
print(image)
|
453 |
-
pixel_values, target_aspect_ratio = load_image(image, max_num=6)
|
|
|
454 |
generation_config = dict(max_new_tokens= int(max_tokens), do_sample=False, num_beams = 3, repetition_penalty=2.5)
|
455 |
response, query = model.chat(tokenizer, pixel_values, '<image>\n'+prompt, generation_config, return_history=False, \
|
456 |
attention_visualize=True,last_visualize_layers=7,raw_image_path=test_image,target_aspect_ratio=target_aspect_ratio)
|
@@ -458,6 +459,65 @@ def generate_video(image, prompt, max_tokens):
|
|
458 |
generation_output = response
|
459 |
raw_image_path = image
|
460 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
461 |
return "path_to_generated_video.mp4"
|
462 |
|
463 |
with gr.Blocks() as demo:
|
@@ -468,8 +528,8 @@ with gr.Blocks() as demo:
|
|
468 |
image = gr.Image(label="Upload your image", type = 'filepath')
|
469 |
prompt = gr.Textbox(label="Describe your prompt", value="List all the text." )
|
470 |
max_tokens = gr.Slider(label="Max token output (⚠️ Choose <100 for faster response)", minimum=1, maximum=512, value=50)
|
471 |
-
btn = gr.Button("
|
472 |
-
video = gr.Video(label="
|
473 |
|
474 |
btn.click(fn=generate_video, inputs=[image, prompt, max_tokens], outputs=video)
|
475 |
|
|
|
450 |
@spaces.GPU
|
451 |
def generate_video(image, prompt, max_tokens):
|
452 |
print(image)
|
453 |
+
pixel_values, target_aspect_ratio = load_image(image, max_num=6)
|
454 |
+
pixel_values = pixel_values.to(torch.bfloat16).cuda()
|
455 |
generation_config = dict(max_new_tokens= int(max_tokens), do_sample=False, num_beams = 3, repetition_penalty=2.5)
|
456 |
response, query = model.chat(tokenizer, pixel_values, '<image>\n'+prompt, generation_config, return_history=False, \
|
457 |
attention_visualize=True,last_visualize_layers=7,raw_image_path=test_image,target_aspect_ratio=target_aspect_ratio)
|
|
|
459 |
generation_output = response
|
460 |
raw_image_path = image
|
461 |
|
462 |
+
attentions_tensors = []
|
463 |
+
for tok_ in generation_output["attentions"]:
|
464 |
+
attentions_tensors.append([])
|
465 |
+
for lay_ in tok_ :
|
466 |
+
attentions_tensors[-1].append(lay_.detach().cpu().type(torch.float).numpy())
|
467 |
+
attention_scores = attentions_tensors
|
468 |
+
query_ = tokenizer(query)
|
469 |
+
start_img_token_index = int(np.where(np.array(query_["input_ids"])==tokenizer("<img>")["input_ids"][0])[0]+1)
|
470 |
+
end_img_token_index = int(np.where(np.array(query_["input_ids"])==tokenizer("</img>")["input_ids"][0])[0]-256)
|
471 |
+
if end_img_token_index - start_img_token_index == 0 :
|
472 |
+
end_img_token_index = int(np.where(np.array(query_["input_ids"])==tokenizer("</img>")["input_ids"][0])[0])
|
473 |
+
|
474 |
+
# Đọc ảnh gốc
|
475 |
+
image = cv2.imread(raw_image_path)
|
476 |
+
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
477 |
+
# Resize ảnh nhỏ hơn để giảm dung lượng GIF
|
478 |
+
scale_factor = 1. # Scale factor for the frames; 1.0 means no size reduction
|
479 |
+
alpha = 0.4
|
480 |
+
# Lưu danh sách frames GIF
|
481 |
+
visualization_frames = []
|
482 |
+
# Chuỗi sinh ra
|
483 |
+
generated_text = ""
|
484 |
+
frame_step = 1
|
485 |
+
input_token = ""
|
486 |
+
|
487 |
+
params_for_text = []
|
488 |
+
params_for_hidden = []
|
489 |
+
heatmap_imgs = []
|
490 |
+
top_visual_tokens_focus_tables = []
|
491 |
+
# Lặp qua từng token
|
492 |
+
for index_focus in tqdm.tqdm(range(0, generation_output.sequences.shape[1], frame_step)):
|
493 |
+
predict_token_text = tokenizer.decode(generation_output.sequences[0, index_focus])
|
494 |
+
generated_text += predict_token_text # Ghép chữ lại
|
495 |
+
# Tạo heatmap trung bình từ các lớp attention
|
496 |
+
heat_maps, top_visual_tokens_focus = visualize_attention_hiddenstate(attention_scores[index_focus], head=None,
|
497 |
+
start_img_token_index=start_img_token_index, end_img_token_index=end_img_token_index,
|
498 |
+
target_aspect_ratio=target_aspect_ratio)
|
499 |
+
|
500 |
+
heatmap = np.array(heat_maps[0])
|
501 |
+
# Resize heatmap về kích thước ảnh gốc
|
502 |
+
heatmap = cv2.resize(heatmap, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_CUBIC)
|
503 |
+
# Làm mượt heatmap
|
504 |
+
heatmap_smooth = gaussian_filter(heatmap, sigma=1)
|
505 |
+
# Chuẩn hóa heatmap về 0-255
|
506 |
+
heatmap_norm = cv2.normalize(heatmap_smooth, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
|
507 |
+
heatmap_color = cv2.applyColorMap(heatmap_norm, cv2.COLORMAP_JET)
|
508 |
+
heatmap_color = cv2.cvtColor(heatmap_color, cv2.COLOR_BGR2RGB)
|
509 |
+
# Overlay ảnh heatmap lên ảnh gốc
|
510 |
+
overlay = cv2.addWeighted(image, 1 - alpha, heatmap_color, alpha, 0)
|
511 |
+
|
512 |
+
prev_text = generated_text[:-len(input_token)-len(predict_token_text)] + " "
|
513 |
+
params_for_text.append((prev_text, input_token, predict_token_text))
|
514 |
+
|
515 |
+
hidden_tabel = extract_next_token_table_data(model, tokenizer, generation_output, index_focus)
|
516 |
+
params_for_hidden.append((hidden_tabel,predict_token_text))
|
517 |
+
|
518 |
+
input_token = predict_token_text
|
519 |
+
heatmap_imgs.append(overlay)
|
520 |
+
|
521 |
return "path_to_generated_video.mp4"
|
522 |
|
523 |
with gr.Blocks() as demo:
|
|
|
528 |
image = gr.Image(label="Upload your image", type = 'filepath')
|
529 |
prompt = gr.Textbox(label="Describe your prompt", value="List all the text." )
|
530 |
max_tokens = gr.Slider(label="Max token output (⚠️ Choose <100 for faster response)", minimum=1, maximum=512, value=50)
|
531 |
+
btn = gr.Button("Inference")
|
532 |
+
video = gr.Video(label="Visualization Video")
|
533 |
|
534 |
btn.click(fn=generate_video, inputs=[image, prompt, max_tokens], outputs=video)
|
535 |
|