Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -28,6 +28,8 @@ import spaces
|
|
28 |
import subprocess
|
29 |
import os
|
30 |
from moviepy.editor import VideoFileClip, AudioFileClip
|
|
|
|
|
31 |
|
32 |
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
|
33 |
|
@@ -446,7 +448,14 @@ model = AutoModel.from_pretrained(
|
|
446 |
trust_remote_code=True,
|
447 |
).eval().cuda()
|
448 |
tokenizer = AutoTokenizer.from_pretrained("khang119966/Vintern-1B-v3_5-explainableAI", trust_remote_code=True, use_fast=False)
|
449 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
450 |
@spaces.GPU
|
451 |
def generate_video(image, prompt, max_tokens):
|
452 |
print(image)
|
@@ -517,11 +526,52 @@ def generate_video(image, prompt, max_tokens):
|
|
517 |
|
518 |
input_token = predict_token_text
|
519 |
heatmap_imgs.append(overlay)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
520 |
|
521 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
522 |
|
523 |
with gr.Blocks() as demo:
|
524 |
-
gr.Markdown("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
525 |
|
526 |
with gr.Row():
|
527 |
with gr.Column():
|
|
|
28 |
import subprocess
|
29 |
import os
|
30 |
from moviepy.editor import VideoFileClip, AudioFileClip
|
31 |
+
import multiprocessing
|
32 |
+
import imageio
|
33 |
|
34 |
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
|
35 |
|
|
|
448 |
trust_remote_code=True,
|
449 |
).eval().cuda()
|
450 |
tokenizer = AutoTokenizer.from_pretrained("khang119966/Vintern-1B-v3_5-explainableAI", trust_remote_code=True, use_fast=False)
|
451 |
+
|
452 |
+
# Wrapper functions so the renderers can be passed to multiprocessing
|
453 |
+
def generate_text_img_wrapper(args):
    """Render one text frame from a packed argument sequence.

    ``args`` is unpacked into the positional arguments of
    ``generate_text_image_with_html2image``; the image width and minimum
    height are fixed at 500 and 1000. Taking a single packed argument lets
    this function be handed directly to ``Pool.map``.
    """
    render_options = {"image_width": 500, "min_height": 1000}
    return generate_text_image_with_html2image(*args, **render_options)
|
455 |
+
|
456 |
+
def generate_hidden_img_wrapper(args):
    """Render one next-token table frame from a packed argument sequence.

    ``args`` is unpacked into the positional arguments of
    ``render_next_token_table_image`` and its result is returned unchanged.
    Taking a single packed argument lets this function be handed directly
    to ``Pool.map``.
    """
    table_image = render_next_token_table_image(*args)
    return table_image
|
458 |
+
|
459 |
@spaces.GPU
|
460 |
def generate_video(image, prompt, max_tokens):
|
461 |
print(image)
|
|
|
526 |
|
527 |
input_token = predict_token_text
|
528 |
heatmap_imgs.append(overlay)
|
529 |
+
|
530 |
+
# Dùng multiprocessing
|
531 |
+
with multiprocessing.Pool(processes=20) as pool:
|
532 |
+
text_imgs = pool.map(generate_text_img_wrapper, params_for_text)
|
533 |
+
hidden_imgs = pool.map(generate_hidden_img_wrapper, params_for_hidden)
|
534 |
+
|
535 |
+
for i in range(len(text_imgs)):
|
536 |
+
overlay = heatmap_imgs[i]
|
537 |
+
text_img = text_imgs[i]
|
538 |
+
predict_hidden_states = hidden_imgs[i]
|
539 |
+
overlay_adjusted = adjust_overlay(overlay, text_img)
|
540 |
+
predict_hidden_states = adjust_overlay(predict_hidden_states, text_img)
|
541 |
+
combined_image = np.hstack((overlay_adjusted, text_img, predict_hidden_states))
|
542 |
+
visualization_frames.append(combined_image)
|
543 |
|
544 |
+
resized_visualization_frames = []
|
545 |
+
for frame in visualization_frames:
|
546 |
+
frame = cv2.resize(frame,(visualization_frames[0].shape[1],visualization_frames[0].shape[0]))
|
547 |
+
resized_visualization_frames.append(frame)
|
548 |
+
|
549 |
+
# Lưu thành video MP4 bằng imageio
|
550 |
+
imageio.mimsave(
|
551 |
+
'heatmap_animation.mp4',
|
552 |
+
resized_visualization_frames, # dạng RGB
|
553 |
+
fps=5
|
554 |
+
)
|
555 |
+
|
556 |
+
return "heatmap_animation.mp4"
|
557 |
|
558 |
with gr.Blocks() as demo:
|
559 |
+
gr.Markdown("""## 🎥 Visualizing How Multimodal Models Think
|
560 |
+
This tool generates a video to **visualize how a multimodal model (image + text)** attends to different parts of an image while generating text.
|
561 |
+
### 📌 What it does:
|
562 |
+
- Takes an input image and a text prompt.
|
563 |
+
- Shows how the model’s attention shifts on the image for each generated token.
|
564 |
+
- Helps explain the model’s behavior and decision-making.
|
565 |
+
### 🖼️ Video layout (per frame):
|
566 |
+
Each frame in the video includes:
|
567 |
+
1. 🔥 **Heatmap over image**: Shows which area the model focuses on.
|
568 |
+
2. 📝 **Generated text**: With old context, current token highlighted.
|
569 |
+
3. 📊 **Token prediction table**: Shows the model’s top next-token guesses.
|
570 |
+
### 🎯 Use cases:
|
571 |
+
- Research explainability of vision-language models.
|
572 |
+
- Debugging or interpreting model outputs.
|
573 |
+
- Creating educational visualizations.
|
574 |
+
""")
|
575 |
|
576 |
with gr.Row():
|
577 |
with gr.Column():
|