multimodalart HF Staff committed on
Commit ce90309 · verified · 1 Parent(s): 47fc4a0

Update app.py

Files changed (1)
  1. app.py +435 -649
app.py CHANGED
@@ -1,96 +1,62 @@
 
1
  import torch
2
- # import numpy as np # Not strictly needed anymore
3
  import gradio as gr
4
  import spaces
5
- from transformers import AutoTokenizer, AutoModel
6
  import time
7
- import re # Keep for parsing constraints
8
-
9
- # Use try-except for space deployment vs local
 
 
 
 
10
  try:
11
- # Used for spaces deployment with GPU
12
- gpu_check = spaces.GPU
13
- print("Running in Gradio Spaces with GPU environment.")
14
  except AttributeError:
15
- # Fallback for local execution or environments without spaces.GPU
16
- print("Running in local environment or without spaces.GPU.")
17
- # Define a dummy decorator if spaces.GPU is not available
18
- def gpu_check(func):
19
- return func
20
-
21
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
22
- print(f"Using device: {device}")
23
-
24
- # --- Load DREAM Model and Tokenizer ---
25
- # Ensure sufficient VRAM, Dream 7B needs ~16GB+ VRAM in bfloat16
26
- model_path = "Dream-org/Dream-v0-Instruct-7B"
27
- print(f"Loading model: {model_path}...")
28
  try:
29
- model = AutoModel.from_pretrained(
30
- model_path,
31
- torch_dtype=torch.bfloat16, # Use bfloat16 for efficiency
32
- trust_remote_code=True,
33
- # device_map='auto' # Consider if running into OOM errors, might split across GPUs/CPU
34
- ).to(device).eval()
35
- tokenizer = AutoTokenizer.from_pretrained(
36
- model_path,
37
- trust_remote_code=True
38
- )
39
- print("Model and tokenizer loaded successfully.")
40
- except Exception as e:
41
- print(f"Error loading model or tokenizer: {e}")
42
- print("Please ensure you have enough GPU memory and the model files are accessible.")
43
- # Exit or raise if loading fails
44
- raise e
45
-
46
-
47
- # --- Constants for DREAM ---
48
- # Find the mask token and ID from the DREAM tokenizer
49
- if tokenizer.mask_token is None:
50
- print("Warning: Mask token not found in tokenizer. Attempting to add '[MASK]'.")
51
- # This might require retraining or fine-tuning if the model didn't see this token
52
- num_added = tokenizer.add_special_tokens({'mask_token': '[MASK]'})
53
- if num_added > 0:
54
- print(f"Added '{tokenizer.mask_token}' to tokenizer.")
55
- # Resize model embeddings if vocab changed
56
- model.resize_token_embeddings(len(tokenizer))
57
- print("Resized model token embeddings.")
58
- else:
59
- # Fallback or error if adding failed or mask token still None
60
- # It's possible a different token serves this purpose in DREAM's training
61
- print("Error: Could not set a mask token. Visualization might be inaccurate.")
62
- # You might need to identify which token ID DREAM uses internally for masking tasks
63
- # For now, we'll proceed but this is a potential issue.
64
- MASK_TOKEN = "<?>" # Placeholder symbol
65
- MASK_ID = -1 # Invalid ID indicates issue
66
- if tokenizer.mask_token is None:
67
- raise ValueError("Could not set a mask token for the tokenizer.")
68
-
69
- MASK_TOKEN = tokenizer.mask_token
70
- MASK_ID = tokenizer.mask_token_id
71
- print(f"Using MASK_TOKEN='{MASK_TOKEN}' with ID={MASK_ID}")
72
-
73
- # Identify other special tokens to potentially hide/show
74
- eos_token_id = tokenizer.eos_token_id
75
- pad_token_id = tokenizer.pad_token_id
76
- special_token_ids_set = {MASK_ID} # Start with Mask ID
77
- if eos_token_id is not None:
78
- special_token_ids_set.add(eos_token_id)
79
- print(f"EOS token ID: {eos_token_id} ({tokenizer.decode([eos_token_id])})")
80
- if pad_token_id is not None:
81
- special_token_ids_set.add(pad_token_id)
82
- print(f"PAD token ID: {pad_token_id} ({tokenizer.decode([pad_token_id])})")
83
- # Add other common special tokens if needed (e.g., BOS, UNK)
84
- if tokenizer.bos_token_id is not None:
85
- special_token_ids_set.add(tokenizer.bos_token_id)
86
- print(f"BOS token ID: {tokenizer.bos_token_id} ({tokenizer.decode([tokenizer.bos_token_id])})")
87
- if tokenizer.unk_token_id is not None:
88
- special_token_ids_set.add(tokenizer.unk_token_id)
89
- print(f"UNK token ID: {tokenizer.unk_token_id} ({tokenizer.decode([tokenizer.unk_token_id])})")
90
-
91
- print(f"Identified special token IDs: {special_token_ids_set}")
92
-
93
- # --- Helper Functions (Constraint Parsing, History Formatting) ---
94
 
95
  def parse_constraints(constraints_text):
96
  """Parse constraints in format: 'position:word, position:word, ...'"""
@@ -100,674 +66,494 @@ def parse_constraints(constraints_text):
100
 
101
  parts = constraints_text.split(',')
102
  for part in parts:
103
- part = part.strip() # Trim whitespace
104
  if ':' not in part:
105
  continue
106
  try:
107
  pos_str, word = part.split(':', 1)
108
  pos = int(pos_str.strip())
109
  word = word.strip()
110
- # Allow empty words if needed? Forcing empty seems odd. Let's require a word.
111
  if word and pos >= 0:
112
- constraints[pos] = word
 
 
 
 
113
  except ValueError:
114
- print(f"Warning: Could not parse constraint part: '{part}'")
 
 
115
  continue
116
 
117
  return constraints
118
 
119
  def format_chat_history(history):
120
  """
121
- Format chat history for the DREAM model (standard messages format)
122
 
123
  Args:
124
  history: List of [user_message, assistant_message] pairs
125
 
126
  Returns:
127
- Formatted conversation for the model (list of dictionaries)
128
  """
129
  messages = []
130
- # Check if a system prompt is appropriate for Dream-Instruct
131
- # From demo_completion.py example, it seems it uses system prompt via template
132
- # messages.append({"role": "system", "content": "You are a helpful assistant."})
 
 
 
133
  for user_msg, assistant_msg in history:
134
- if user_msg is not None: # Handle potential None message if clearing failed
135
  messages.append({"role": "user", "content": user_msg})
136
- if assistant_msg: # Skip if None (for the latest user message awaiting response)
137
  messages.append({"role": "assistant", "content": assistant_msg})
138
 
139
  return messages
140
 
141
- # --- Core Generation Logic for DREAM with Visualization ---
142
 
143
- @gpu_check # Use the potentially dummy decorator
144
- @torch.no_grad() # Disable gradient calculations for inference
145
- def dream_generate_response_with_visualization(
146
- messages,
147
- gen_length=128,
148
- steps=128, # Default based on DREAM examples
 
 
 
 
 
 
149
  constraints=None,
150
- temperature=0.6, # Default based on DREAM examples
151
- top_p=0.95, # Default based on DREAM examples
152
- alg="entropy", # Default based on DREAM examples
153
- alg_temp=0.1, # Default based on DREAM examples
154
  ):
155
  """
156
- Generate text with DREAM model with visualization using the generation hook.
157
 
158
  Args:
159
- messages: List of message dictionaries with 'role' and 'content'
160
- gen_length: Length of text to generate (max_new_tokens)
161
- steps: Number of diffusion steps
162
- constraints: Dictionary mapping positions (relative to response start) to words
163
- temperature: Sampling temperature
164
- top_p: Nucleus sampling p
165
- alg: Remasking algorithm ('origin', 'maskgit_plus', 'topk_margin', 'entropy')
166
- alg_temp: Temperature for confidence-based algorithms
167
 
168
  Returns:
169
  Tuple: (List of visualization states, final generated text string)
170
  """
171
- print("\n--- Starting DREAM Generation ---")
172
- print(f"Params: len={gen_length}, steps={steps}, temp={temperature}, top_p={top_p}, alg='{alg}', alg_temp={alg_temp}")
173
- print(f"Constraints: {constraints}")
174
-
175
- # --- Input Preparation ---
176
  if constraints is None:
177
  constraints = {}
178
 
179
- # Convert word constraints to token IDs (handle multi-token words)
180
- processed_constraints = {}
181
- constraint_token_lengths = {} # Store length for multi-token constraints
182
- print("Processing constraints:")
183
- for pos, word in constraints.items():
184
- # Prepend space for potentially better tokenization consistency
185
- # (though apply_chat_template should handle spacing)
186
- tokens = tokenizer.encode(" " + word, add_special_tokens=False)
187
- if not tokens:
188
- print(f" Warning: Could not tokenize constraint word '{word}' at position {pos}. Skipping.")
189
- continue
190
- print(f" Pos {pos}, Word '{word}' -> Tokens {tokens} ({tokenizer.convert_ids_to_tokens(tokens)})")
191
- constraint_token_lengths[pos] = len(tokens)
192
- for i, token_id in enumerate(tokens):
193
- target_pos = pos + i
194
- if target_pos in processed_constraints:
195
- print(f" Warning: Overlapping constraint token at position {target_pos}. Keeping first constraint's token ({processed_constraints[target_pos]}).")
196
- else:
197
- processed_constraints[target_pos] = token_id
198
 
199
  # Prepare the prompt using chat template
 
200
  try:
201
  inputs = tokenizer.apply_chat_template(
202
  messages,
203
  return_tensors="pt",
204
- return_dict=True,
205
- add_generation_prompt=True # Crucial for Dream-Instruct
206
  )
207
- input_ids = inputs.input_ids.to(device=device)
208
- # Use the attention mask generated by the template
209
- attention_mask = inputs.attention_mask.to(device=device)
210
- prompt_length = input_ids.shape[1]
211
- print(f"Input prompt length: {prompt_length}")
212
- # print(f"Input IDs: {input_ids}")
213
- # print(f"Attention Mask: {attention_mask}") # Verify mask covers prompt
214
  except Exception as e:
215
  print(f"Error applying chat template: {e}")
216
- return [([("Error applying chat template.", "red")],)], f"Error: {e}"
217
-
218
- # Check context length (DREAM uses 2048 default)
219
- model_max_length = getattr(model.config, 'max_position_embeddings', 2048)
220
- if prompt_length + gen_length > model_max_length:
221
- print(f"Warning: Requested length ({prompt_length + gen_length}) exceeds model max length ({model_max_length}). Truncating gen_length.")
222
- gen_length = model_max_length - prompt_length
223
- if gen_length <= 0:
224
- print("Error: Prompt is already too long.")
225
- return [([("Prompt too long.", "red")],)], "Error: Prompt too long."
226
-
227
- # --- State for Visualization Hook ---
228
- visualization_states = []
229
- last_x = None # Store the full sequence (prompt + generation) from the previous step
230
-
231
- # Initial state: Prompt + all masks for generation part
232
- initial_gen_part = torch.full((1, gen_length), MASK_ID, dtype=torch.long, device=device)
233
- # Apply initial constraints to the masked part *before* the first visualization state
234
- for pos, token_id in processed_constraints.items():
235
- absolute_pos = pos # Position relative to start of generation
236
- if 0 <= absolute_pos < gen_length:
237
- initial_gen_part[0, absolute_pos] = token_id
238
-
239
- # Create the first visualization state (only the generation part)
240
- initial_state_vis = []
241
- for i in range(gen_length):
242
- token_id = initial_gen_part[0, i].item()
243
- if token_id == MASK_ID:
244
- initial_state_vis.append((MASK_TOKEN, "#444444")) # Mask color
245
- else:
246
- # This must be a constraint applied initially
247
- # Decode without skipping special to see raw constraint if needed
248
- token_str = tokenizer.decode([token_id], skip_special_tokens=False).strip()
249
- initial_state_vis.append((token_str if token_str else "?", "#800080")) # Constraint color (purple)
250
- visualization_states.append(initial_state_vis)
251
 
252
  # --- Define the Hook Function ---
253
  def generation_tokens_hook_func(step, x, logits):
254
- nonlocal last_x, visualization_states # Allow modification of outer scope variables
255
- # print(f"Hook step {step}") # Keep console less noisy
256
-
257
- current_x = x.clone() # Full sequence (prompt + generation) at this step
258
-
259
- # 1. Apply Constraints to the current sequence
260
- constrained_x = current_x.clone()
261
- current_prompt_len = current_x.shape[1] - gen_length # Recalculate prompt length based on current x
262
- if current_prompt_len < 0:
263
- print(f"Warning: prompt_len {current_prompt_len} negative in hook step {step}, skipping constraints/vis.")
264
- return current_x # Return unmodified if something is wrong
265
-
266
- for pos, token_id in processed_constraints.items():
267
- # pos is relative to the start of the *generation* part
268
- absolute_pos = current_prompt_len + pos
269
- # Ensure position is within the bounds of the *current* sequence 'x'
270
- if current_prompt_len <= absolute_pos < current_x.shape[1]:
271
- if constrained_x[0, absolute_pos] != token_id:
272
- constrained_x[0, absolute_pos] = token_id
273
- # print(f" Constraint enforced at pos {pos} ({absolute_pos}) -> {token_id}")
274
-
275
-
276
- # 2. Generate Visualization State for *this* step (generation part only)
277
- current_state_vis = []
278
- # Compare current_x (before explicit constraint application in *this* hook call)
279
- # with last_x (state from *previous* hook call / initial state)
280
- gen_part_current = current_x[0, current_prompt_len:]
281
- # Ensure last_x exists and has the same shape for comparison
282
- gen_part_last = last_x[0, current_prompt_len:] if (last_x is not None and last_x.shape == current_x.shape) else None
283
-
284
- for i in range(gen_length):
285
- # Ensure index i is valid for the current generation part
286
- if i >= gen_part_current.shape[0]:
287
- print(f"Warning: Index {i} out of bounds for gen_part_current (shape {gen_part_current.shape}) in step {step}.")
288
- continue # Skip if index is invalid
289
-
290
- current_token_id = gen_part_current[i].item()
291
- # Handle case where last_x was None or had different shape
292
- last_token_id = gen_part_last[i].item() if gen_part_last is not None and i < gen_part_last.shape[0] else MASK_ID # Assume mask initially
293
-
294
- is_constrained = i in processed_constraints
295
- is_special = current_token_id in special_token_ids_set
296
- is_mask = current_token_id == MASK_ID
297
- was_mask = last_token_id == MASK_ID or last_x is None # Treat first step as coming from mask
298
-
299
- display_token = ""
300
- color = ""
301
-
302
- # Determine display token and color based on state transitions
303
- if is_mask:
304
- display_token = MASK_TOKEN
305
- color = "#444444" # Dark Gray
306
- elif is_constrained and processed_constraints[i] == current_token_id:
307
- # Always show the constrained token, color purple
308
- # Decide whether to show raw special tokens when constrained
309
- raw_decode = tokenizer.decode([current_token_id], skip_special_tokens=False).strip()
310
- display_token = raw_decode if raw_decode else "?"
311
- color = "#800080" # Purple
312
- elif is_special:
313
- if was_mask:
314
- # Newly revealed special token: Show its representation once
315
- display_token = tokenizer.decode([current_token_id], skip_special_tokens=False).strip() # Show raw special
316
- color = "#FF8C00" # DarkOrange
317
  else:
318
- # Already revealed special token: Hide it by showing a space
319
- display_token = " " # Effectively hides it
320
- color = "#6699CC" # Use 'Old' color (Light Blue) but content is hidden space
321
- elif was_mask:
322
- # Newly revealed normal token
323
- display_token = tokenizer.decode([current_token_id], skip_special_tokens=True).strip()
324
- color = "#66CC66" # Light Green
325
- else:
326
- # Previously revealed normal token
327
- display_token = tokenizer.decode([current_token_id], skip_special_tokens=True).strip()
328
- color = "#6699CC" # Light Blue
329
-
330
- # Fallback for empty decodes of non-special, non-mask tokens
331
- if not display_token and not is_mask and not (is_special and not was_mask):
332
- display_token = "?" # Use question mark for unexpected empty decodes
333
-
334
- current_state_vis.append((display_token, color))
335
-
336
- visualization_states.append(current_state_vis)
337
-
338
- # 3. Update last_x for the *next* step's comparison
339
- # Store the state *after* applying constraints for accurate comparison next time
340
- last_x = constrained_x.clone()
341
-
342
- # 4. Return the sequence with constraints applied for the model's next step
343
- return constrained_x # Return the sequence with constraints enforced
344
-
345
-
346
- # --- Run DREAM Generation ---
 
 
 
 
347
  try:
348
- print("Calling model.diffusion_generate...")
349
- # Make sure last_x is initialized correctly before the first hook call
350
- # It should represent the state *before* the first diffusion step.
351
- initial_full_x = torch.cat([input_ids, initial_gen_part], dim=1)
352
- last_x = initial_full_x.clone() # Initialize last_x with prompt + initial masked/constrained gen part
353
-
354
  output = model.diffusion_generate(
355
- input_ids=input_ids,
356
- attention_mask=attention_mask, # Pass the correct attention mask
357
- max_new_tokens=gen_length,
358
- output_history=False, # We build history in the hook
359
  return_dict_in_generate=True,
360
  steps=steps,
361
  temperature=temperature,
362
  top_p=top_p,
363
  alg=alg,
364
- # alg_temp is only relevant for confidence-based algs (not 'origin')
365
- alg_temp=alg_temp if alg != "origin" else 0.0,
366
  generation_tokens_hook_func=generation_tokens_hook_func
367
- # Ensure generation doesn't run past eos_token if not desired
368
- # eos_token_id=eos_token_id, # This might stop generation early
369
- # pad_token_id=tokenizer.eos_token_id # Often pad is same as eos for LLMs
370
  )
371
- print("model.diffusion_generate finished.")
372
-
373
- # Extract final generated sequence (response part only)
374
- # The hook ensures the returned sequence has constraints applied
375
- final_sequence = output.sequences[0]
376
- # Handle potential length mismatch if generation stopped early
377
- actual_gen_len = final_sequence.shape[0] - prompt_length
378
- response_token_ids = final_sequence[prompt_length:]
379
-
380
- # Decode the final response, skipping special tokens like EOS/PAD
381
- final_text = tokenizer.decode(
382
- response_token_ids,
383
- skip_special_tokens=True,
384
- clean_up_tokenization_spaces=True # Recommended for cleaner output
385
- ).strip()
386
- print(f"Final generated text: '{final_text}'")
387
-
388
- # Add the very final state to visualization if the hook didn't capture it
389
- # (Mainly a safeguard, hook should run 'steps' times or until completion)
390
- if len(visualization_states) <= steps: # Hook might run 'steps' times
391
- final_state_vis = []
392
- final_gen_part = response_token_ids # Use the extracted response tokens
393
-
394
- for i in range(len(final_gen_part)): # Iterate over actual generated tokens
395
- token_id = final_gen_part[i].item()
396
- is_constrained = i in processed_constraints
397
- is_special = token_id in special_token_ids_set
398
- is_mask = token_id == MASK_ID # Should not happen in final output
399
-
400
- display_token = ""
401
- color = ""
402
-
403
- if is_mask: color = "#444444"; display_token = MASK_TOKEN
404
- elif is_constrained and processed_constraints.get(i) == token_id:
405
- raw_decode = tokenizer.decode([token_id], skip_special_tokens=False).strip()
406
- display_token = raw_decode if raw_decode else "?"; color = "#800080" # Purple
407
- elif is_special:
408
- # Hide special tokens in the *final* display state for cleaner look
409
- display_token = " "; color = "#6699CC" # Hide as 'Old' blue
410
- else:
411
- display_token = tokenizer.decode([token_id], skip_special_tokens=True).strip()
412
- color = "#6699CC" # Final state uses 'Old' blue
413
-
414
- if not display_token: display_token = "?" # Fallback
415
- final_state_vis.append((display_token, color))
416
-
417
- # Pad the final state visualization if actual gen len < requested gen_length
418
- # This shouldn't be necessary if HighlightedText handles shorter lists
419
- # while len(final_state_vis) < gen_length:
420
- # final_state_vis.append((" ", "#FFFFFF")) # Add empty space
421
-
422
- if final_state_vis: # Only append if we generated something
423
- visualization_states.append(final_state_vis)
424
 
 
 
 
425
 
426
  except Exception as e:
427
- print(f"\n--- Error during generation ---")
428
  import traceback
429
  traceback.print_exc()
430
- # Add error message to visualization
431
- error_msg = f"Generation Error: Check Logs"
432
- # Append error to visualization states if possible
433
- visualization_states.append([("Error", "red")])
434
- final_text = f"Generation failed: {e}"
 
 
 
 
435
 
436
- print("--- DREAM Generation Finished ---\n")
437
- # Ensure we always return a list (even if empty) and a string
438
- if not isinstance(visualization_states, list): visualization_states = []
439
- if not isinstance(final_text, str): final_text = str(final_text)
440
 
441
- return visualization_states, final_text
442
 
 
443
 
444
- # --- Gradio UI Setup ---
 
445
 
446
  css = '''
447
  .category-legend{display:none}
448
- /* Increase overall base font size */
449
- body, .gradio-container { font-size: 105%; }
450
- /* Make buttons slightly larger */
451
- /* button { min-height: 40px; } */
452
- .small_btn {
453
- min-width: 60px; /* Adjust as needed */
454
- max-width: 100px;
455
- height: 42px; /* Adjust height */
456
- flex-grow: 0 !important; /* Prevent button from growing */
457
- margin-left: 5px !important; /* Add some space */
458
- font-size: 100%; /* Match button font size */
459
- padding: 0 10px !important; /* Adjust padding */
460
- }
461
- .chat-input-row {
462
- display: flex;
463
- align-items: center; /* Vertically align items */
464
- margin-top: 10px; /* Add space above input row */
465
- }
466
- /* Ensure Textbox takes up space */
467
- .chat-input-row .gr-textbox {
468
- flex-grow: 1;
469
- margin-right: 5px;
470
- }
471
- /* Chatbot styling */
472
- .gr-chatbot .message {
473
- font-size: 100%; /* Ensure chat message font size is reasonable */
474
- padding: 10px !important;
475
- border-radius: 8px !important;
476
- }
477
- .gr-chatbot .message.user { background-color: #E0F7FA !important; align-self: flex-end; } /* Light cyan for user */
478
- .gr-chatbot .message.bot { background-color: #F1F8E9 !important; align-self: flex-start; } /* Light green for bot */
479
- /* HighlightedText styling */
480
- .gr-highlightedtext span {
481
- padding: 1px 2px; /* Minimal padding */
482
- margin: 0 1px; /* Minimal margin */
483
- border-radius: 3px;
484
- font-family: monospace; /* Use monospace font for better alignment */
485
- font-size: 95%; /* Slightly smaller font for dense vis */
486
- line-height: 1.4; /* Adjust line spacing */
487
- }
488
- .gr-highlightedtext {
489
- padding: 10px;
490
- border: 1px solid #E0E0E0;
491
- border-radius: 5px;
492
- background-color: #FAFAFA; /* Light background for the container */
493
- }
494
- /* Legend Styling */
495
- .legend {
496
- font-size: 90%;
497
- margin-top: 5px;
498
- color: #555;
499
- }
500
- .legend span {
501
- display: inline-block; /* Keep legend items inline */
502
- margin-right: 10px;
503
- white-space: nowrap; /* Prevent wrapping */
504
- }
505
- .legend span::before { /* Style the color square */
506
- content: '■';
507
- display: inline-block;
508
- margin-right: 4px;
509
- font-size: 120%; /* Make square slightly larger */
510
- vertical-align: middle; /* Align square with text */
511
- }
512
  '''
513
  def create_chatbot_demo():
514
- with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
515
- gr.Markdown("## Dream 7B - Diffusion Language Model Demo")
516
- gr.Markdown("Interact with the Dream 7B instruction-tuned model and watch the diffusion process unfold step-by-step. "
517
- "You can optionally constrain specific words at certain positions.")
518
- with gr.Row():
519
- gr.Markdown("[Model Card](https://huggingface.co/Dream-org/Dream-v0-Instruct-7B)", scale=1)
520
- gr.Markdown("[Blog Post](https://hkunlp.github.io/blog/2025/dream/)", scale=1)
521
 
522
  # STATE MANAGEMENT
523
- chat_history = gr.State([]) # Stores conversation [[user, bot], ...]
524
 
525
- # UI LAYOUT
526
  with gr.Row():
527
- # Left Column: Chat Interface
528
  with gr.Column(scale=3):
529
- chatbot_ui = gr.Chatbot(
530
- label="Conversation",
531
- height=550,
532
- bubble_full_width=False,
533
- show_copy_button=True,
534
- render=False # Rendered explicitly later for streaming
535
- )
536
- chatbot_ui.render() # Manually render after setting parameters
537
 
538
- # Message input Row
539
- with gr.Row(elem_classes="chat-input-row"):
 
540
  user_input = gr.Textbox(
541
  label="Your Message",
542
- placeholder="Type your message and press Enter, or click Send...",
543
- scale=4, # Give textbox more space relative to button
544
- container=False,
545
- show_label=False
546
  )
547
- send_btn = gr.Button("Send", scale=1, elem_classes="small_btn", variant="primary")
548
 
549
  constraints_input = gr.Textbox(
550
  label="Word Constraints (Optional)",
551
- info="Force words at positions (0-indexed from response start). Format: 'pos:word, pos:word'. Example: '0:Once, 5:upon, 10:time'",
552
- placeholder="e.g., 0:Hello, 6:world",
553
- lines=1
554
  )
555
-
556
- # Right Column: Visualization and Settings
557
  with gr.Column(scale=2):
558
- gr.Markdown("### Denoising Process Visualization")
559
  output_vis = gr.HighlightedText(
560
- label="Generation Steps",
561
- show_label=False, # Label provided by Markdown above
562
  combine_adjacent=False,
563
- show_legend=False, # Using custom HTML legend below
564
- # color_map is not directly used due to show_legend=False, but useful for reference
565
- color_map={
566
- "Mask": "#444444",
567
- "New": "#66CC66",
568
- "Old": "#6699CC",
569
- "Constraint": "#800080",
570
- "Special (New)": "#FF8C00",
571
- "Error": "red"
572
- }
573
  )
574
- # Custom HTML Legend
575
- gr.HTML(
576
- """
577
- <div class='legend'>
578
- <span style="color:#444444;">■ Mask</span> |
579
- <span style='color:#66CC66;'>■ New</span> |
580
- <span style='color:#FF8C00;'>■ Special (New)</span> |
581
- <span style='color:#6699CC;'>■ Old</span> |
582
- <span style='color:#800080;'>■ Constraint</span>
583
- </div>
584
- """,
585
- elem_id="legend-html"
586
  )
587
 
588
- # Generation Settings Accordion
589
- with gr.Accordion("Generation Settings", open=False):
590
- gen_length = gr.Slider(
591
- minimum=16, maximum=512, value=128, step=16,
592
- label="Max New Tokens", info="Max response length."
593
- )
594
- steps = gr.Slider(
595
- minimum=8, maximum=512, value=128, step=8,
596
- label="Diffusion Steps", info="More steps = finer generation (potentially slower)."
597
- )
598
- temperature = gr.Slider(
599
- minimum=0.0, maximum=1.5, value=0.6, step=0.05,
600
- label="Temperature", info="Controls randomness. Lower=more deterministic."
601
- )
602
- top_p = gr.Slider(
603
- minimum=0.0, maximum=1.0, value=0.95, step=0.05,
604
- label="Top-P (Nucleus)", info="Filters vocabulary probabilistically. Lower=less diverse."
605
- )
606
- # Map UI choices to DREAM's alg parameters
607
- remasking_strategy = gr.Radio(
608
- choices=[
609
- ("Random", "origin"), # User friendly name -> actual param
610
- ("Entropy", "entropy"),
611
- ("MaskGit+", "maskgit_plus"),
612
- ("TopK Margin", "topk_margin"),
613
- ],
614
- value="entropy", # Default
615
- label="Generation Order Strategy (alg)",
616
- info="How the model decides which tokens to generate first."
617
- )
618
- alg_temp = gr.Slider(
619
- minimum=0.0, maximum=1.0, value=0.1, step=0.05,
620
- label="Order Randomness (alg_temp)" ,
621
- info="Adds randomness to confidence-based strategies (Entropy, MaskGit+, TopK). Ignored for Random."
622
- )
623
- visualization_delay = gr.Slider(
624
- minimum=0.0, maximum=0.5, value=0.05, step=0.01,
625
- label="Visualization Delay (sec)", info="Pause between steps in visualization."
626
- )
627
-
628
- # Clear button Row
629
- with gr.Row():
630
- clear_btn = gr.Button("Clear Conversation", variant="stop", icon="🗑️")
631
-
632
-
633
- # --- Event Handlers ---
634
-
635
- # Helper to add message to history state
636
- def add_message_to_history(history_state, user_message, bot_message):
637
- # history_state is the raw list from gr.State
638
- history_state.append([user_message, bot_message])
639
- return history_state
640
-
641
- # Function when user submits message (Enter or Send button)
642
- def handle_user_message(message, history_state):
643
- print(f"User submitted: '{message}'")
644
- if not message or not message.strip():
645
- print("Empty message submitted, doing nothing.")
646
- # Return unchanged state if message is empty
647
- # Need to return values for all outputs of the .submit/.click
648
- # history_state, chatbot_ui, user_input, output_vis
649
- return history_state, history_state, "", [] # No change to chatbot UI yet
650
-
651
- # Add user message to history state (with None for bot response initially)
652
- updated_history_state = add_message_to_history(history_state, message, None)
653
-
654
- # Prepare updated history for display in Chatbot UI
655
- # We only display the user message now, bot response comes later
656
- chatbot_display = updated_history_state.copy()
657
-
658
- # Clear the input textbox and visualization
659
- return updated_history_state, chatbot_display, "", []
660
-
661
- # Function to generate bot response (triggered after user message is handled)
662
- # Uses yield for streaming visualization updates
663
- def generate_bot_response(
664
- history_state, # The current state [[user, None], ...]
665
- gen_length_val, steps_val, constraints_text, delay_val,
666
- temperature_val, top_p_val, alg_val, alg_temp_val
667
- ):
668
- print("\n--- Streaming Bot Response ---")
669
- if not history_state or history_state[-1][1] is not None:
670
- print("History empty or last message already has response. Skipping generation.")
671
- # Yield current state if called unnecessarily
672
- yield history_state, [] # Chatbot UI, Visualization
673
- return
674
-
675
- # Get the conversation history in the format the model expects
676
- messages_for_model = format_chat_history(history_state) # Includes the latest user query
677
-
678
- # Parse constraints from the textbox
679
- parsed_constraints = parse_constraints(constraints_text)
680
-
681
- # Generate response with visualization (this function handles the core logic)
682
- vis_states, response_text = dream_generate_response_with_visualization(
683
- messages_for_model,
684
- gen_length=gen_length_val,
685
- steps=steps_val,
686
- constraints=parsed_constraints,
687
- temperature=temperature_val,
688
- top_p=top_p_val,
689
- alg=alg_val,
690
- alg_temp=alg_temp_val
691
  )
692
 
693
- # Update the history state with the final bot response (critical!)
694
- history_state[-1][1] = response_text.strip()
695
 
696
- # Stream the updates
697
- if vis_states:
698
- # Yield the initial visualization state first
699
- yield history_state, vis_states[0] # Update chatbot UI (implicitly via history), update visualization
 
700
 
701
- # Then animate through the rest of the visualization states
702
- for state in vis_states[1:]:
703
- time.sleep(delay_val)
704
- yield history_state, state # Update chatbot UI, update visualization
705
- else:
706
- # Handle case where generation failed or produced no visualization
707
- print("Warning: No visualization states generated.")
708
- yield history_state, [("No visualization generated.", "orange")] # Update chatbot UI, show warning in vis
709
-
710
- print("--- Streaming Complete ---")
711
-
712
-
713
- # Function to clear everything
714
- def clear_conversation_state():
715
- print("Clearing conversation.")
716
- # Reset state and UI components
717
- return [], [], "", [] # chat_history (State), chatbot_ui, user_input, output_vis
718
-
719
- # --- Wire UI elements to functions ---
720
-
721
- # Define shared inputs for generation to avoid repetition
722
- generation_inputs = [
723
- chat_history, gen_length, steps, constraints_input, visualization_delay,
724
- temperature, top_p, remasking_strategy, alg_temp
725
- ]
726
- # Define shared outputs for streaming
727
- streaming_outputs = [chatbot_ui, output_vis]
728
-
729
- # Typing in Textbox and pressing Enter
730
- user_input.submit(
731
- fn=handle_user_message,
732
- inputs=[user_input, chat_history],
733
- outputs=[chat_history, chatbot_ui, user_input, output_vis], # Update history state, chatbot display, clear input, clear vis
734
- queue=False # Process user input immediately
735
- ).then(
736
- fn=generate_bot_response,
737
- inputs=generation_inputs,
738
- outputs=streaming_outputs, # Stream updates to chatbot and visualization
739
- #api_name="generate_stream" # Optional: Name for API endpoint
740
- )
741
 
742
- # Clicking the Send button
743
- send_btn.click(
744
- fn=handle_user_message,
745
- inputs=[user_input, chat_history],
746
- outputs=[chat_history, chatbot_ui, user_input, output_vis],
747
- queue=False
748
- ).then(
749
- fn=generate_bot_response,
750
- inputs=generation_inputs,
751
- outputs=streaming_outputs,
752
- # api_name="generate_stream_click" # Optional
753
- )
 
 
 
 
754
 
755
- # Clicking the Clear button
 
 
 
 
756
  clear_btn.click(
757
- fn=clear_conversation_state,
758
  inputs=[],
759
- outputs=[chat_history, chatbot_ui, user_input, output_vis],
760
- queue=False # Clearing should be instant
761
  )
762
 
763
  return demo
764
 
765
- # --- Launch the Gradio App ---
766
  if __name__ == "__main__":
767
- print("Creating Gradio demo...")
768
- gradio_demo = create_chatbot_demo()
769
- print("Launching Gradio demo...")
770
- # Use queue() for handling concurrent users and potentially long generation times
771
- # share=True generates a public link (useful for Colab/Spaces)
772
- # debug=True provides helpful Gradio logs in the console
773
- gradio_demo.queue().launch(share=True, debug=False) # Set debug=True for more verbose logs if needed
 
1
+ # dream_app.py
2
  import torch
3
+ import numpy as np
4
  import gradio as gr
5
  import spaces
 
6
  import time
7
+ import re
8
+ from transformers import AutoModel, AutoTokenizer
9
+ from threading import Lock
10
+ from queue import Queue, Empty  # Empty is raised by Queue.get_nowait() on an empty queue
11
+
12
+ # --- Configuration ---
13
+ MODEL_PATH = "Dream-org/Dream-v0-Instruct-7B"
14
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
15
+ print(f"Using device: {DEVICE}")
16
+
17
+ # --- Load Model and Tokenizer ---
18
+ print("Loading model and tokenizer...")
19
+ # Need configuration files for trust_remote_code
20
+ # Make sure config.json, configuration_dream.py, modeling_dream.py,
21
+ # generation_utils.py, generation_config.json are in the same directory
22
+ # or accessible in the Hugging Face cache.
23
+ model = AutoModel.from_pretrained(
24
+ MODEL_PATH,
25
+ torch_dtype=torch.bfloat16,
26
+ trust_remote_code=True
27
+ ).to(DEVICE).eval()
28
+ tokenizer = AutoTokenizer.from_pretrained(
29
+ MODEL_PATH,
30
+ trust_remote_code=True
31
+ )
32
+ print("Model and tokenizer loaded.")
33
+
34
+ # --- Constants ---
35
+ # Get IDs from tokenizer/config if possible, otherwise hardcode from provided files
36
+ MASK_TOKEN = tokenizer.mask_token # Should be "<|mask|>"
37
  try:
38
+ MASK_ID = tokenizer.mask_token_id # Should be 151666
39
+ if MASK_ID is None: raise AttributeError # Handle case where it might not be set directly
 
40
  except AttributeError:
41
+ print("Warning: Could not directly get mask_token_id, using hardcoded value 151666.")
42
+ MASK_ID = 151666
43
+
 
 
 
 
44
  try:
45
+ EOS_ID = tokenizer.eos_token_id # Should be 151643
46
+ PAD_ID = tokenizer.pad_token_id # Should be 151643
47
+ if EOS_ID is None or PAD_ID is None: raise AttributeError
48
+ except AttributeError:
49
+ print("Warning: Could not directly get eos/pad_token_id, using hardcoded value 151643.")
50
+ EOS_ID = 151643
51
+ PAD_ID = 151643
52
+
53
+ # Ensure MASK_TOKEN and MASK_ID are valid
54
+ if MASK_TOKEN is None or MASK_ID is None:
55
+ raise ValueError("Mask token or ID is not defined correctly.")
56
+ if EOS_ID is None or PAD_ID is None:
57
+ raise ValueError("EOS/PAD token or ID is not defined correctly.")
58
+
59
+ # --- Helper Functions ---
 
 
 
 
60
 
61
  def parse_constraints(constraints_text):
62
  """Parse constraints in format: 'position:word, position:word, ...'"""
 
66
 
67
  parts = constraints_text.split(',')
68
  for part in parts:
 
69
  if ':' not in part:
70
  continue
71
  try:
72
  pos_str, word = part.split(':', 1)
73
  pos = int(pos_str.strip())
74
  word = word.strip()
 
75
  if word and pos >= 0:
76
+ # Tokenize the word - handle potential multi-token words
77
+ # Add space prefix for consistency, similar to how model might see words mid-sentence
78
+ tokens = tokenizer.encode(" " + word, add_special_tokens=False)
79
+ for i, token_id in enumerate(tokens):
80
+ constraints[pos + i] = token_id
81
  except ValueError:
82
+ continue
83
+ except Exception as e:
84
+ print(f"Error parsing constraint part '{part}': {e}")
85
  continue
86
 
87
  return constraints
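For illustration, the format described in the docstring maps each constrained word onto one or more consecutive response positions; a sketch of the expected output (the actual token IDs depend on the Dream tokenizer loaded above):

# Hypothetical call; the IDs shown are placeholders, not real vocabulary entries.
constraints = parse_constraints("0:Once, 5:upon")
# -> {0: <id of " Once">, 5: <id of " upon">}
# A word that tokenizes into two pieces at position 3 would fill positions 3 and 4.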
88
 
89
  def format_chat_history(history):
90
  """
91
+ Format chat history for the Dream model using its chat template logic.
92
 
93
  Args:
94
  history: List of [user_message, assistant_message] pairs
95
 
96
  Returns:
97
+ Formatted list of message dictionaries for the model
98
  """
99
  messages = []
100
+ # Add system prompt if history is empty or doesn't start with system
101
+ if not history or history[0][0].lower() != 'system':
102
+ # Check if the tokenizer's template expects an explicit system message
103
+ # The template provided in tokenizer_config.json handles adding a default one
104
+ pass # Let apply_chat_template handle the default system message
105
+
106
  for user_msg, assistant_msg in history:
107
+ if user_msg: # Handle potential initial system message possibility if needed
108
  messages.append({"role": "user", "content": user_msg})
109
+ if assistant_msg is not None: # Skip if None (for the latest user message)
110
  messages.append({"role": "assistant", "content": assistant_msg})
111
 
112
  return messages
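As a concrete example of the structure returned above (the pending turn whose assistant reply is still None is passed through as a bare user message):

history = [["Hello", "Hi! How can I help?"], ["Write a haiku", None]]
format_chat_history(history)
# -> [{"role": "user", "content": "Hello"},
#     {"role": "assistant", "content": "Hi! How can I help?"},
#     {"role": "user", "content": "Write a haiku"}]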
113
 
114
+ # --- Core Generation Logic with Visualization ---
115
 
116
+ # Use a thread-safe queue to pass visualization states from the hook
117
+ vis_queue = Queue()
118
+ # Lock to prevent race conditions when accessing shared state like previous_x
119
+ state_lock = Lock()
120
+ # Store the previous state for comparison in the hook
121
+ previous_x_shared = None
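In outline, this is a small producer/consumer arrangement: the generation hook pushes one visualization snapshot per diffusion step, and the caller drains the queue once generation returns.

# Producer, inside the hook:            vis_queue.put(current_vis_state)
# Consumer, after diffusion_generate:
#     while not vis_queue.empty():
#         visualization_states.append(vis_queue.get_nowait())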
122
+
123
+ @spaces.GPU
124
+ def generate_response_with_visualization(
125
+ messages, # List of message dicts from format_chat_history
126
+ max_new_tokens=64,
127
+ steps=64, # Default steps based on README example
128
  constraints=None,
129
+ temperature=0.6, # Default from demo_token_control
130
+ top_p=0.95, # Default from demos
131
+ alg="entropy", # Default from demos
132
+ alg_temp=0.1, # Default from demo_multiturn_chat
133
  ):
134
  """
135
+ Generate text with Dream model and capture visualization states using a hook.
136
 
137
  Args:
138
+ messages: List of message dictionaries with 'role' and 'content'.
139
+ max_new_tokens: Max tokens to generate.
140
+ steps: Diffusion steps.
141
+ constraints: Dictionary mapping positions (relative to response start) to token IDs.
142
+ temperature: Sampling temperature.
143
+ top_p: Nucleus sampling p.
144
+ alg: Remasking algorithm ('origin', 'entropy', 'maskgit_plus', 'topk_margin').
145
+ alg_temp: Temperature for confidence-based algorithms.
146
 
147
  Returns:
148
  Tuple: (List of visualization states, final generated text string)
149
  """
150
+ global previous_x_shared, vis_queue
 
 
 
 
151
  if constraints is None:
152
  constraints = {}
153
 
154
+ visualization_states = []
155
+
156
+ # Clear the queue for a new generation
157
+ while not vis_queue.empty():
158
+ try:
159
+ vis_queue.get_nowait()
160
+ except Queue.Empty:
161
+ break
 
 
 
 
162
 
163
  # Prepare the prompt using chat template
164
+ # The template automatically adds the generation prompt like "<|im_start|>assistant\n"
165
  try:
166
  inputs = tokenizer.apply_chat_template(
167
  messages,
168
  return_tensors="pt",
169
+ add_generation_prompt=True,
170
+ return_dict=True
171
  )
172
+ input_ids = inputs.input_ids.to(device=DEVICE)
173
+ # Dream doesn't seem to explicitly use attention_mask in simple demos,
174
+ # but it's good practice if padding were involved.
175
+ # For now, assume no padding in this interactive demo.
176
+ attention_mask = inputs.attention_mask.to(device=DEVICE) if 'attention_mask' in inputs else None
177
+
 
178
  except Exception as e:
179
  print(f"Error applying chat template: {e}")
180
+ # Provide a fallback or error state
181
+ error_state = [("Error in chat formatting.", "red")]
182
+ return [error_state], f"Error: Could not format chat history. {e}"
183
+
184
+ prompt_length = input_ids.shape[1]
185
+ total_length = prompt_length + max_new_tokens
 
 
 
 
186
 
187
  # --- Define the Hook Function ---
188
  def generation_tokens_hook_func(step, x, logits):
189
+ global previous_x_shared, vis_queue
190
+ with state_lock: # Ensure thread safety if needed, though hooks might run sequentially
191
+ current_x = x.clone() # Shape: (batch_size, total_length)
192
+
193
+ # --- Apply Constraints ---
194
+ # Constraints are relative to the start of the *response*
195
+ for rel_pos, token_id in constraints.items():
196
+ abs_pos = prompt_length + rel_pos
197
+ if 0 <= abs_pos < current_x.shape[1]:
198
+ # Ensure constraint application doesn't go out of bounds
199
+ # Apply constraint for the first batch element (batch size is 1 here)
200
+ current_x[0, abs_pos] = token_id
201
+
202
+ # --- Create Visualization State ---
203
+ current_vis_state = []
204
+ x_response = current_x[0, prompt_length:] # Get the response part for batch 0
205
+ prev_x_response = previous_x_shared[0, prompt_length:] if previous_x_shared is not None else None
206
+
207
+ for i in range(max_new_tokens):
208
+ current_token_id = x_response[i].item()
209
+ token_str = tokenizer.decode([current_token_id], skip_special_tokens=False) # Keep special tokens for vis
210
+
211
+ # Clean up visual representation of special tokens
212
+ if token_str == tokenizer.eos_token or token_str == tokenizer.pad_token:
213
+ token_str = "[EOS/PAD]" # Make it visually distinct
214
+ elif token_str == tokenizer.mask_token:
215
+ token_str = "[MASK]"
216
+ elif token_str.strip() == "": # Handle empty strings from decoding potentially odd tokens
217
+ token_str = "[UNK/SPACE]"
218
+
219
+
220
+ color = "#DDDDDD" # Default background
221
+
222
+ if current_token_id == MASK_ID:
223
+ color = "#444444" # Dark gray for masks
224
+ elif prev_x_response is not None and prev_x_response[i].item() == MASK_ID:
225
+ # Token was mask, now it's revealed in this step
226
+ # Use green for newly revealed
227
+ color = "#66CC66" # Light green
 
 
 
 
228
  else:
229
+ # Token was already revealed in a previous step or is a constraint
230
+ # Check if it's a constraint applied *now*
231
+ is_constraint = i in constraints and \
232
+ constraints[i] == current_token_id
233
+
234
+ if is_constraint:
235
+ color = "#FFD700" # Gold for constraints
236
+ else:
237
+ color = "#6699CC" # Light blue for previously revealed
238
+
239
+ current_vis_state.append((token_str, color))
240
+
241
+ # --- Update shared state and put vis state in queue ---
242
+ previous_x_shared = current_x.clone() # Update for the *next* step's comparison
243
+ vis_queue.put(current_vis_state)
244
+
245
+ # The hook must return the potentially modified tensor `x`
246
+ return current_x
247
+ # --- End of Hook Function ---
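The hook contract assumed here (based on how it is used in this file) is simply: receive the step index, the current token tensor x, and the logits, and return a tensor of the same shape; a do-nothing hook would be:

# Minimal no-op hook, shown only to illustrate the expected signature and return value.
def identity_hook(step, x, logits):
    return x  # returning x unchanged leaves the diffusion step untouched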
248
+
249
+ # Initialize previous_x_shared before generation starts
250
+ # Create initial masked state for visualization
251
+ initial_x = input_ids.clone()
252
+ if initial_x.shape[1] < total_length:
253
+ padding = torch.full((1, total_length - initial_x.shape[1]), MASK_ID, dtype=torch.long, device=DEVICE)
254
+ initial_x = torch.cat([initial_x, padding], dim=1)
255
+ else:
256
+ initial_x = initial_x[:, :total_length] # Truncate if prompt is too long
257
+
258
+ # Apply initial constraints to the starting state
259
+ for rel_pos, token_id in constraints.items():
260
+ abs_pos = prompt_length + rel_pos
261
+ if 0 <= abs_pos < initial_x.shape[1]:
262
+ initial_x[0, abs_pos] = token_id
263
+
264
+ with state_lock:
265
+ previous_x_shared = initial_x.clone()
266
+
267
+ # Add the initial all-masked state (or with constraints) to the visualization queue
268
+ initial_vis_state = []
269
+ initial_x_response = initial_x[0, prompt_length:]
270
+ for i in range(max_new_tokens):
271
+ token_id = initial_x_response[i].item()
272
+ if token_id == MASK_ID:
273
+ initial_vis_state.append((MASK_TOKEN, "#444444"))
274
+ else:
275
+ # Must be a pre-applied constraint
276
+ token_str = tokenizer.decode([token_id], skip_special_tokens=False)
277
+ if token_str == tokenizer.eos_token or token_str == tokenizer.pad_token:
278
+ token_str = "[EOS/PAD]"
279
+ elif token_str.strip() == "":
280
+ token_str = "[UNK/SPACE]"
281
+ initial_vis_state.append((token_str, "#FFD700")) # Gold for constraints
282
+ vis_queue.put(initial_vis_state)
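Each item placed on the queue is just a list of (token_text, color) tuples, which is the value format gr.HighlightedText receives downstream; for a four-token response after one step it might look like:

# Illustrative snapshot only; real states contain max_new_tokens entries.
[("[MASK]", "#444444"), ("Once", "#66CC66"), ("[MASK]", "#444444"), ("[MASK]", "#444444")]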
283
+
284
+
285
+ # --- Run Generation ---
286
  try:
287
+ # output_history=False because the hook handles state capture
288
+ # return_dict_in_generate=True to get the GenerationOutput object
 
 
 
 
289
  output = model.diffusion_generate(
290
+ initial_x, # Start with the potentially constraint-applied tensor
291
+ attention_mask=None, # Assuming no padding needed for interactive use
292
+ max_new_tokens=max_new_tokens, # This might not be strictly needed if total_length is fixed
293
+ output_history=False,
294
  return_dict_in_generate=True,
295
  steps=steps,
296
  temperature=temperature,
297
  top_p=top_p,
298
  alg=alg,
299
+ alg_temp=alg_temp if alg != 'origin' else None, # alg_temp only for confidence algs
 
300
  generation_tokens_hook_func=generation_tokens_hook_func
 
 
 
301
  )
 
 
 
 
302
 
303
+ final_sequence = output.sequences[0] # Batch size 1
304
+
305
+ # Decode the final response text, cleaning up special tokens
306
+ response_tokens = final_sequence[prompt_length:]
307
+ # Filter out EOS/PAD tokens for the final text display
308
+ response_tokens_filtered = [tok for tok in response_tokens.tolist() if tok != EOS_ID and tok != PAD_ID]
309
+ final_text = tokenizer.decode(response_tokens_filtered,
310
+ skip_special_tokens=True,
311
+ clean_up_tokenization_spaces=True) # Standard cleanup
312
 
313
  except Exception as e:
314
+ print(f"Error during generation: {e}")
315
  import traceback
316
  traceback.print_exc()
317
+ # Provide error state
318
+ error_state = [("Generation Error.", "red")]
319
+ visualization_states.append(error_state)
320
+ final_text = f"Error: Generation failed. {e}"
321
+ # Add any states captured before the error
322
+ while not vis_queue.empty():
323
+ try:
324
+ visualization_states.append(vis_queue.get_nowait())
325
+ except Empty:
326
+ break
327
+ return visualization_states, final_text
328
+
329
+ # Retrieve all visualization states captured by the hook
330
+ while not vis_queue.empty():
331
+ try:
332
+ visualization_states.append(vis_queue.get_nowait())
333
+ except Empty:
334
+ break
335
 
336
+ # If somehow no states were captured, add the initial one
337
+ if not visualization_states:
338
+ visualization_states.append(initial_vis_state)
 
339
 
 
340
 
341
+ return visualization_states, final_text.strip()
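A usage sketch for this function outside the UI (assumes the model and tokenizer loaded above and a GPU with enough memory):

# msgs = [{"role": "user", "content": "Write a haiku about spring."}]
# states, text = generate_response_with_visualization(msgs, max_new_tokens=64, steps=64)
# len(states)  -> roughly one snapshot per diffusion step, plus the initial masked state
# text         -> the final decoded response string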
342
 
343
+
344
+ # --- Gradio UI ---
345
 
346
  css = '''
347
  .category-legend{display:none}
348
+ button{height: 60px}
 
 
 
 
349
  '''
350
  def create_chatbot_demo():
351
+ with gr.Blocks(css=css) as demo:
352
+ gr.Markdown("# Dream 7B - Diffusion Language Model Demo")
353
+ gr.Markdown("Chat with the Dream 7B Instruct model and visualize the diffusion generation process.")
354
+ gr.Markdown("Model: [Dream-org/Dream-v0-Instruct-7B](https://huggingface.co/Dream-org/Dream-v0-Instruct-7B)")
 
 
 
355
 
356
  # STATE MANAGEMENT
357
+ chat_history = gr.State([])
358
 
359
+ # UI COMPONENTS
360
  with gr.Row():
 
361
  with gr.Column(scale=3):
362
+ chatbot_ui = gr.Chatbot(label="Conversation", height=500, avatar_images=["user.png", "robot.png"])
 
 
 
 
 
 
 
363
 
364
+ # Message input
365
+ with gr.Group():
366
+ with gr.Row():
367
  user_input = gr.Textbox(
368
  label="Your Message",
369
+ placeholder="Type your message here...",
370
+ show_label=False,
371
+ scale=9
 
372
  )
373
+ send_btn = gr.Button("Send", scale=1)
374
 
375
  constraints_input = gr.Textbox(
376
  label="Word Constraints (Optional)",
377
+ info="Place words at specific positions (0-indexed from response start). Format: 'pos:word, pos:word,...'. Example: '0:Once, 5:upon, 10:a'",
378
+ placeholder="0:Once, 5:upon, 10:a",
379
+ value=""
380
  )
 
 
381
  with gr.Column(scale=2):
 
382
  output_vis = gr.HighlightedText(
383
+ label="Diffusion Process Visualization",
 
384
  combine_adjacent=False,
385
+ show_legend=True, # Keep legend hidden via CSS if desired
386
+ height=560 # Adjust height to match chatbot area
 
 
 
 
 
 
 
 
387
  )
388
+ # Legend (colors defined in generate_response_with_visualization)
389
+ gr.Markdown(
390
+ "<small>Color Legend: <span style='background-color:#444444; color:white;'>[MASK]</span>"
391
+ " <span style='background-color:#66CC66;'>Newly Revealed</span>"
392
+ " <span style='background-color:#6699CC;'>Previously Revealed</span>"
393
+ " <span style='background-color:#FFD700;'>Constraint</span>"
394
+ " <span style='background-color:#DDDDDD;'>[EOS/PAD/UNK]</span></small>"
 
 
 
 
 
395
  )
396
 
397
+ # Advanced generation settings
398
+ with gr.Accordion("Generation Settings", open=False):
399
+ max_new_tokens_slider = gr.Slider(
400
+ minimum=16, maximum=512, value=128, step=16, # Increased default/max
401
+ label="Max New Tokens (Generation Length)"
402
+ )
403
+ steps_slider = gr.Slider(
404
+ minimum=8, maximum=512, value=128, step=8, # Increased default/max
405
+ label="Diffusion Steps"
406
+ )
407
+ temp_slider = gr.Slider(
408
+ minimum=0.0, maximum=1.0, value=0.6, step=0.05, # Finer steps for temp
409
+ label="Temperature"
410
+ )
411
+ top_p_slider = gr.Slider(
412
+ minimum=0.0, maximum=1.0, value=0.95, step=0.05,
413
+ label="Top-P (Nucleus Sampling)"
414
+ )
415
+ alg_radio = gr.Radio(
416
+ # Choices from README
417
+ choices=['origin', 'entropy', 'maskgit_plus', 'topk_margin'],
418
+ value='entropy',
419
+ label="Remasking Algorithm"
420
+ )
421
+ alg_temp_slider = gr.Slider(
422
+ minimum=0.0, maximum=1.0, value=0.1, step=0.05,
423
+ label="Algorithm Temperature (for confidence-based algs)"
424
+ )
425
+ vis_delay_slider = gr.Slider(
426
+ minimum=0.0, maximum=0.5, value=0.03, step=0.01, # Faster default delay
427
+ label="Visualization Delay (seconds)"
 
 
 
 
428
  )
429
 
430
+ # Clear button
431
+ clear_btn = gr.Button("Clear Conversation")
432
 
433
+ # HELPER FUNCTIONS (UI Logic)
434
+ def add_message_to_history(history, message, response):
435
+ """Add a message pair to the history state"""
436
+ new_history = history + [[message, response]]
437
+ return new_history
438
 
439
+ def user_message_submitted(message, history):
440
+ """ Handle user sending a message: update history, clear input """
441
+ if not message or message.strip() == "":
442
+ return history, history, "", [] # No change if empty
 
 
 
 
443
 
444
+ # Add user message, response is initially None
445
+ new_history = add_message_to_history(history, message, None)
446
+
447
+ # Prepare display version (immediately shows user message)
448
+ display_history = new_history
449
+
450
+ # Clear input box
451
+ message_out = ""
452
+
453
+ # Clear visualization
454
+ vis_out = []
455
+
456
+ return new_history, display_history, message_out, vis_out
457
+
458
+ def bot_response_generator(history, constraints_str, max_tokens, steps, temp, top_p, alg, alg_temp, delay):
459
+ """ Generator function to stream bot response and visualization """
460
+ if not history or history[-1][1] is not None: # Ensure there's a user msg waiting for response
461
+ print("Warning: Bot response triggered without pending user message.")
462
+ yield history, [], "Error: No user message to respond to." # Send error state back?
463
+ return
464
+
465
+ # Get the full conversation history formatted for the model
466
+ last_user_message = history[-1][0]
467
+ messages_for_model = format_chat_history(history[:-1]) # History *before* the last user msg
468
+ messages_for_model.append({"role": "user", "content": last_user_message})
469
+
470
+ # Parse constraints
471
+ try:
472
+ parsed_constraints = parse_constraints(constraints_str)
473
+ except Exception as e:
474
+ print(f"Error parsing constraints: {e}")
475
+ yield history, [("Constraint Error", "red")], f"Error: Failed to parse constraints: {e}"
476
+ return
477
+
478
+ # Generate response and visualization states
479
+ try:
480
+ vis_states, final_response_text = generate_response_with_visualization(
481
+ messages_for_model,
482
+ max_new_tokens=max_tokens,
483
+ steps=steps,
484
+ constraints=parsed_constraints,
485
+ temperature=temp,
486
+ top_p=top_p,
487
+ alg=alg,
488
+ alg_temp=alg_temp
489
+ )
490
+ except Exception as e:
491
+ print(f"Error in generate_response_with_visualization: {e}")
492
+ import traceback
493
+ traceback.print_exc()
494
+ yield history, [("Generation Error", "red")], f"Error: Generation failed: {e}"
495
+ return
496
 
497
+ # Update the history state with the final response *once*
498
+ history[-1][1] = final_response_text # Update the None placeholder
499
+
500
+ # Yield initial state immediately
501
+ if vis_states:
502
+ yield history, vis_states[0]
503
+ else:
504
+ yield history, [] # Should not happen if generation worked
505
+
506
+ # Stream intermediate visualization states
507
+ for state in vis_states[1:]:
508
+ time.sleep(delay)
509
+ yield history, state
510
+
511
+ # Final yield ensures the chatbot UI has the complete history
512
+ # The last state in vis_states should already be yielded by the loop
513
+ # yield history, vis_states[-1] if vis_states else []
514
+
515
+
516
+ def clear_conversation():
517
+ """Clear the conversation history and visualization"""
518
+ return [], [], "", [] # history, chatbot_ui, user_input, output_vis
519
+
520
+ # EVENT HANDLERS
521
+
522
+ # User presses Enter or Send button
523
+ submit_args = {
524
+ "fn": user_message_submitted,
525
+ "inputs": [user_input, chat_history],
526
+ "outputs": [chat_history, chatbot_ui, user_input, output_vis]
527
+ }
528
+ user_input.submit(**submit_args)
529
+ send_btn.click(**submit_args)
530
+
531
+ # After user message is submitted, trigger bot response generation
532
+ generate_args = {
533
+ "fn": bot_response_generator,
534
+ "inputs": [
535
+ chat_history, constraints_input, max_new_tokens_slider, steps_slider,
536
+ temp_slider, top_p_slider, alg_radio, alg_temp_slider, vis_delay_slider
537
+ ],
538
+ "outputs": [chatbot_ui, output_vis] # Update chatbot history and visualization
539
+ }
540
+ # Trigger generation after submit OR click
541
+ user_input.submit(None, None, None, queue=True).then(**generate_args)
542
+ send_btn.click(None, None, None, queue=True).then(**generate_args)
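For reference, a more common Gradio pattern chains the generator directly off the same listener so the two steps always run in order; a sketch only, not what this file does:

# user_input.submit(
#     user_message_submitted,
#     inputs=[user_input, chat_history],
#     outputs=[chat_history, chatbot_ui, user_input, output_vis],
# ).then(
#     bot_response_generator,
#     inputs=[chat_history, constraints_input, max_new_tokens_slider, steps_slider,
#             temp_slider, top_p_slider, alg_radio, alg_temp_slider, vis_delay_slider],
#     outputs=[chatbot_ui, output_vis],
# )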
543
+
544
+
545
+ # Clear button handler
546
  clear_btn.click(
547
+ fn=clear_conversation,
548
  inputs=[],
549
+ outputs=[chat_history, chatbot_ui, user_input, output_vis]
 
550
  )
551
 
552
  return demo
553
 
554
+ # Launch the demo
555
  if __name__ == "__main__":
556
+ demo = create_chatbot_demo()
557
+ # queue() allows streaming and handling multiple users
558
+ # share=True creates a public link (use with caution)
559
+ demo.queue().launch(share=True, debug=True)