khang119966 committed on
Commit
60b0804
·
verified ·
1 Parent(s): 1c43489

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -1
app.py CHANGED
@@ -490,9 +490,11 @@ def generate_video(image, prompt, max_tokens):
490
  with gr.Blocks() as demo:
491
  gr.Markdown("""# 🎥 Visualizing How Multimodal Models Think
492
  - This tool generates a video to **visualize how a multimodal model (image + text)** attends to different parts of an image while generating text.
 
493
  📌 What it does: - Takes an input image and a text prompt. - Shows how the model’s attention shifts on the image for each generated token. - Helps explain the model’s behavior and decision-making.
 
494
  🖼️ Video layout (per frame): Each frame in the video includes: 1. 🔥 **Heatmap over image**: Shows which area the model focuses on. 2. 📝 **Generated text**: With old context, current token highlighted. 3. 📊 **Token prediction table**: Shows the model’s top next-token guesses.
495
- 🎯 Use cases: Research explainability of vision-language models. - Debugging or interpreting model outputs. - Creating educational visualizations.
496
  """)
497
 
498
  with gr.Row():
 
490
  with gr.Blocks() as demo:
491
  gr.Markdown("""# 🎥 Visualizing How Multimodal Models Think
492
  - This tool generates a video to **visualize how a multimodal model (image + text)** attends to different parts of an image while generating text.
493
+
494
  📌 What it does: - Takes an input image and a text prompt. - Shows how the model’s attention shifts on the image for each generated token. - Helps explain the model’s behavior and decision-making.
495
+
496
  🖼️ Video layout (per frame): Each frame in the video includes: 1. 🔥 **Heatmap over image**: Shows which area the model focuses on. 2. 📝 **Generated text**: With old context, current token highlighted. 3. 📊 **Token prediction table**: Shows the model’s top next-token guesses.
497
+
498
  """)
499
 
500
  with gr.Row():