khang119966 committed
Commit 20fac9c · verified · 1 Parent(s): 3d50453

Update app.py

Files changed (1): app.py (+7 -8)
app.py CHANGED

```diff
@@ -30,6 +30,8 @@ import os
 from moviepy.editor import VideoFileClip, AudioFileClip
 import multiprocessing
 import imageio
+import tqdm
+
 
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
@@ -557,14 +559,11 @@ def generate_video(image, prompt, max_tokens):
     return "heatmap_animation.mp4"
 
 with gr.Blocks() as demo:
-    gr.Markdown("""## 🎥 Visualizing How Multimodal Models Think
-This tool generates a video to **visualize how a multimodal model (image + text)** attends to different parts of an image while generating text.
-### 📌 What it does:
-- Takes an input image and a text prompt. - Shows how the model’s attention shifts on the image for each generated token. - Helps explain the model’s behavior and decision-making.
-### 🖼️ Video layout (per frame):
-Each frame in the video includes: 1. 🔥 **Heatmap over image**: Shows which area the model focuses on. 2. 📝 **Generated text**: With old context, current token highlighted. 3. 📊 **Token prediction table**: Shows the model’s top next-token guesses.
-### 🎯 Use cases:
-- Research explainability of vision-language models. - Debugging or interpreting model outputs. - Creating educational visualizations.
+    gr.Markdown("""# 🎥 Visualizing How Multimodal Models Think
+- This tool generates a video to **visualize how a multimodal model (image + text)** attends to different parts of an image while generating text.
+📌 What it does: - Takes an input image and a text prompt. - Shows how the model’s attention shifts on the image for each generated token. - Helps explain the model’s behavior and decision-making.
+🖼️ Video layout (per frame): Each frame in the video includes: 1. 🔥 **Heatmap over image**: Shows which area the model focuses on. 2. 📝 **Generated text**: With old context, current token highlighted. 3. 📊 **Token prediction table**: Shows the model’s top next-token guesses.
+🎯 Use cases: Research explainability of vision-language models. - Debugging or interpreting model outputs. - Creating educational visualizations.
     """)
 
     with gr.Row():
```
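A side note on the first hunk: `subprocess.run(..., env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)` hands the child process only that single variable, because `env=` replaces the inherited environment rather than extending it. A minimal sketch of a more defensive variant (my suggestion, not part of this commit):

```python
import os
import subprocess

# Same runtime flash-attn install as in app.py, but merged with os.environ so
# PATH, HOME, CUDA settings, etc. survive in the child process.
# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE asks flash-attn's setup to skip
# compiling the CUDA extension from source.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': 'TRUE'},
    shell=True,
    check=True,  # fail loudly if the install does not succeed
)
```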
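The second hunk's new description says each frame of `heatmap_animation.mp4` combines a heatmap overlay, the generated text so far, and a top-k prediction table. The rendering code itself is outside this diff, so as a rough illustration only, here is a minimal sketch of the heatmap-overlay ingredient using `imageio` and `tqdm` (both imported in the hunks above) plus `matplotlib`; every name below (`render_frame`, `attn_maps`, the dummy inputs) is hypothetical rather than taken from app.py:

```python
import numpy as np
import imageio.v2 as imageio
import tqdm
import matplotlib
matplotlib.use("Agg")  # headless rendering, as on a Space
import matplotlib.pyplot as plt

def render_frame(image, attn_map, token):
    """Blend one token's attention grid over the image and rasterize it."""
    fig, ax = plt.subplots(figsize=(4, 4), dpi=100)
    ax.imshow(image)
    # Stretch the coarse attention grid across the full image area.
    ax.imshow(attn_map, cmap="jet", alpha=0.5,
              extent=(0, image.shape[1], image.shape[0], 0))
    ax.set_title(f"token: {token!r}")
    ax.axis("off")
    fig.canvas.draw()
    frame = np.asarray(fig.canvas.buffer_rgba())[..., :3].copy()  # RGBA -> RGB
    plt.close(fig)
    return frame

# Dummy stand-ins for the real model outputs: one attention grid per token.
image = np.random.rand(224, 224, 3)
tokens = ["a", "cat", "on", "a", "mat"]
attn_maps = [np.random.rand(16, 16) for _ in tokens]

with imageio.get_writer("heatmap_animation.mp4", fps=2) as writer:
    for token, attn in tqdm.tqdm(list(zip(tokens, attn_maps))):
        writer.append_data(render_frame(image, attn, token))
```

Per the description, the real app additionally composites the generated-text panel and the token prediction table into each frame before writing it.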