Fixing proper UTF-8 representation
Files changed:
- app.py (+154 -39)
- bytelatent/plotting/entropy_figure_via_matplot_lib.py (+105 -55)
app.py CHANGED
@@ -1,3 +1,4 @@
+from collections import defaultdict
 import spaces
 import math
 import os
@@ -31,7 +32,7 @@ class Config:
 
     # Bytelatent Specific
    BLT_WEIGHTS_DIR: str = "hf-weights"
-    BLT_MAX_BYTES_FOR_DEMO:
+    BLT_MAX_BYTES_FOR_DEMO: int = 512
 
     # Gradio
     DEFAULT_PROMPT: str = "Daenerys Targaryen is in Game of Thrones, a fantasy epic by George R.R. Martin."
@@ -119,45 +120,141 @@ class BytelatentProcessor:
             logging.warning("Skipping Bytelatent setup as libraries are unavailable.")
 
     def _create_highlight_data(self, patch_lengths: torch.Tensor, tokens: torch.Tensor) -> Tuple[List[Tuple[str, str]], int]:
-        """Generates data for gr.HighlightedText based on bytelatent patches
+        """Generates data for gr.HighlightedText based on bytelatent patches,
+        formatting each byte's display text as 'char-byte_index'."""
+
+        if not self.is_available or self.tokenizer is None:
+            return [("Bytelatent processing unavailable.", "Error")], 0
+        if patch_lengths.numel() == 0 and tokens.numel() == 0:  # No data at all
+            return [("No tokens or patches.", "Info")], 0
+        if tokens.numel() == 0:  # No tokens to process
+            # Count patches even if no tokens, as per original logic for patch_count
+            actual_patch_count = 0
+            for length in patch_lengths.tolist():
+                if length > 0:
+                    actual_patch_count += 1
+            return [("No tokens provided to highlight.", "Info")], actual_patch_count
 
         patch_lengths_list = patch_lengths.tolist()
-        all_token_ids = tokens.tolist()
-            patch_token_ids = all_token_ids[current_token_index : current_token_index + length]
-            if not patch_token_ids: continue
-            highlighted_data.append((patch_text, patch_label))
-            patch_count += 1
-            current_token_index += length
-
-        # Handle remainder tokens if any
-        if current_token_index < len(all_token_ids):
-            remaining_tokens = all_token_ids[current_token_index:]
-            try:
-                remaining_text = self.tokenizer.decode(remaining_tokens)
-                label = "BL Remainder"
-            except Exception:
-                remaining_text = f"[Decode Error: {len(remaining_tokens)} remaining tokens]"
-                label = "Error"
-            highlighted_data.append((remaining_text, label))
-            logging.warning(f"Bytelatent token mismatch. Consumed {current_token_index}, total {len(all_token_ids)}. Remainder added.")
+        all_token_ids = tokens.tolist()  # These are byte representations (integer IDs)
+
+        highlighted_data: List[Tuple[str, str]] = []
+
+        # Calculate original patch count (number of non-empty patches)
+        actual_patch_count = 0
+        for length in patch_lengths_list:
+            if length > 0:
+                actual_patch_count += 1
+
+        # Create a map from global token index to its patch label
+        token_to_patch_label = [""] * len(all_token_ids)
+        current_token_processed_for_patches = 0
+        patch_idx_counter = 0
+        for length in patch_lengths_list:
+            if length <= 0:
+                continue
+            patch_label = f"BL Patch {patch_idx_counter + 1}"
+            patch_idx_counter += 1
+            for _ in range(length):
+                if current_token_processed_for_patches < len(all_token_ids):
+                    token_to_patch_label[current_token_processed_for_patches] = patch_label
+                    current_token_processed_for_patches += 1
+
+        # Handle remainder tokens label
+        if current_token_processed_for_patches < len(all_token_ids):
+            remainder_label = "BL Remainder"
+            logging.warning(
+                f"Bytelatent patch lengths sum ({current_token_processed_for_patches}) "
+                f"is less than total tokens ({len(all_token_ids)}). "
+                f"Remainder tokens will be labelled '{remainder_label}'."
+            )
+            for k in range(current_token_processed_for_patches, len(all_token_ids)):
+                token_to_patch_label[k] = remainder_label
+        elif current_token_processed_for_patches > len(all_token_ids) and len(all_token_ids) > 0:
+            logging.warning(
+                f"Bytelatent patch lengths sum ({current_token_processed_for_patches}) "
+                f"exceeds total tokens ({len(all_token_ids)}). "
+                f"Patch label mapping might be affected."
+            )
+
+        global_token_idx = 0
+        while global_token_idx < len(all_token_ids):
+            char_representation = ""
+            decoded_byte_ids: List[int] = []
+
+            # Handle the special case for token ID 1, often representing '<' or similar
+            # This assumes token ID 1 should always be treated as a single character '<'.
+            # Adjust if your tokenizer handles ID 1 differently or if it can be part of a multi-byte sequence.
+            if all_token_ids[global_token_idx] == 1:
+                char_representation = "<"  # As per user's original code snippet's implication
+                decoded_byte_ids = [1]
+            else:
+                # Iteratively try to decode a character (1 to 4 bytes for UTF-8)
+                for length_to_try in range(1, 5):
+                    if global_token_idx + length_to_try > len(all_token_ids):
+                        break  # Not enough tokens left for this length
+
+                    current_ids_to_try = all_token_ids[global_token_idx : global_token_idx + length_to_try]
+
+                    try:
+                        temp_decode_text = self.tokenizer.decode(current_ids_to_try)
+
+                        if temp_decode_text:  # Successfully decoded something
+                            # This means `current_ids_to_try` forms a valid character(s).
+                            # We take the first successful decode, assuming it's the shortest complete char.
+                            char_representation = temp_decode_text
+                            decoded_byte_ids = current_ids_to_try
+                            break  # Found a character
+                    except Exception as e:
+                        # Decoding failed (e.g., incomplete sequence for this length_to_try).
+                        # Log this if it's unexpected for a particular tokenizer.
+                        # logging.debug(f"Decode attempt failed for {current_ids_to_try}: {e}")
+                        pass  # Continue to try with more bytes.
+
+            # After trying to decode:
+            if char_representation and decoded_byte_ids:
+                num_bytes_in_char = len(decoded_byte_ids)
+                # Ensure char_representation is treated as a single conceptual unit here.
+                # If tokenizer.decode can return multiple characters for a short byte sequence,
+                # this might need adjustment. For UTF-8, one char is expected.
+                processed_char_text = char_representation.splitlines()[0]  # Take first char if multiple, or clean up
+
+                for j in range(num_bytes_in_char):
+                    current_byte_abs_idx = global_token_idx + j
+                    # Boundary check, though loop structure should prevent out-of-bounds
+                    if current_byte_abs_idx < len(all_token_ids):
+                        label = token_to_patch_label[current_byte_abs_idx] if current_byte_abs_idx < len(token_to_patch_label) else "Error: Label Missing"
+                        display_text = f"{processed_char_text}-{j+1}".replace(" ", "_")
+                        highlighted_data.append((display_text, label))
+                    else:  # Should ideally not be reached
+                        logging.error(f"Critical: Token index {current_byte_abs_idx} out of bounds for labeling.")
+                global_token_idx += num_bytes_in_char
+            else:
+                # Fallback: Could not form a character starting at global_token_idx.
+                # Treat the current byte as a standalone problematic byte.
+                current_byte_abs_idx = global_token_idx
+                label = token_to_patch_label[current_byte_abs_idx] if current_byte_abs_idx < len(token_to_patch_label) else "Error: Label Missing"
+
+                problem_byte_id = all_token_ids[current_byte_abs_idx]
+                display_text = f"err_byte({problem_byte_id})-1"
+
+                # Attempt to get a direct representation if tokenizer can provide one for the single byte
+                try:
+                    single_byte_char_attempt = self.tokenizer.decode([problem_byte_id])
+                    if single_byte_char_attempt and single_byte_char_attempt != "\ufffd":  # Replacement char
+                        display_text = f"{single_byte_char_attempt}-1"
+                except Exception:
+                    pass  # Stick with the err_byte display_text
+
+                highlighted_data.append((display_text.replace(" ", "_"), label))
+                logging.warning(
+                    f"Token ID {problem_byte_id} at index {current_byte_abs_idx} "
+                    f"could not be part of a validly decoded character using iterative decode. Fallback: '{display_text}'."
+                )
+                global_token_idx += 1
+
+        return highlighted_data, actual_patch_count
 
     def process(self, prompt: str, max_bytes: float) -> Tuple[Optional[matplotlib.figure.Figure], List[Tuple[str, str]], int, str]:
         """Processes the prompt using the loaded Bytelatent model."""
@@ -204,7 +301,13 @@
             # Run Bytelatent patching
             try:
                 logging.info(f"Running Bytelatent entropy model patching on {len(prompt_bl.encode('utf-8'))} bytes...")
-                results = patcher_nocache(
+                results = patcher_nocache(
+                    [prompt_bl],
+                    tokenizer=self.tokenizer,
+                    patcher=self.patcher,
+                    max_prompt_len=512,
+                    max_gen_len=256,
+                )
                 status += "Bytelatent patching executed.\n"
 
                 if not results:
@@ -216,7 +319,12 @@
             patch_lengths, scores, tokens = batch_patch_lengths[0], batch_scores[0], batch_tokens[0]
 
             # Create highlighted text data
+            _highlighted_data, patch_count = self._create_highlight_data(patch_lengths, tokens)
+            ind_highlighted_data = [(text.replace("-1", ""), label) for text, label in _highlighted_data]
+            grouped_data = defaultdict(str)
+            for text, label in ind_highlighted_data:
+                grouped_data[label] += text
+            highlighted_data = [(text, label) for label, text in grouped_data.items()]
 
             # Create plot
             fig = None
@@ -228,7 +336,14 @@
                 logging.warning(f"Error decoding full BLT token sequence for plot: {decode_err}. Using (truncated) input prompt for plot axis.")
                 decoded_output_for_plot = prompt_bl
 
-                fig = plot_entropies(patch_lengths, scores, decoded_output_for_plot, threshold=self.patcher.threshold)
+                # fig = plot_entropies(patch_lengths, scores, decoded_output_for_plot, threshold=self.patcher.threshold)
+                fig = plot_entropies(
+                    patch_lengths,
+                    scores,
+                    tokens,
+                    chars=decoded_output_for_plot,
+                    threshold=self.patcher.threshold
+                )
                 status += f"Bytelatent plot generated. Found {patch_count} patches.\n"
             else:
                 status += "Plotting unavailable.\n"
@@ -418,7 +533,7 @@ with gr.Blocks(theme=Config.GRADIO_THEME) as iface:
                 placeholder="Enter text here...",
                 # Max length is for UI input; Bytelatent truncation happens in backend
                 lines=5,
-                info=
+                info=f"Note: Entropy-based Patcher processing is limited to {Config.BLT_MAX_BYTES_FOR_DEMO} bytes for this demo."
             )
             submit_button = gr.Button("Generate Visualizations", variant="primary")
             status_output = gr.Textbox(label="Processing Status", interactive=False, lines=10)  # More space for detailed status
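Note on the approach: the heart of this commit is the iterative decode loop in _create_highlight_data. Starting at each byte position, it tries to decode 1, 2, 3, then 4 consecutive byte tokens and accepts the shortest prefix that yields a valid character, so a multi-byte UTF-8 character produces one display entry per byte ('char-byte_index'). Below is a minimal standalone sketch of that idea, not part of the commit, assuming each token ID is one raw UTF-8 byte and using Python's built-in bytes.decode in place of self.tokenizer.decode:

    # Sketch only: groups a byte sequence into UTF-8 characters the way the
    # commit's loop does; byte_ids stands in for BLT byte-token IDs (assumption).
    def group_bytes_into_chars(byte_ids):
        """Returns (character, byte_count) pairs; undecodable bytes become U+FFFD."""
        out = []
        i = 0
        while i < len(byte_ids):
            decoded = None
            for width in range(1, 5):  # UTF-8 characters are 1 to 4 bytes long
                if i + width > len(byte_ids):
                    break  # not enough bytes left for this width
                try:
                    decoded = (bytes(byte_ids[i:i + width]).decode("utf-8"), width)
                    break  # shortest valid prefix wins
                except UnicodeDecodeError:
                    continue  # prefix incomplete or invalid; try one byte more
            if decoded is None:
                decoded = ("\ufffd", 1)  # nothing decoded; emit replacement, skip one byte
            out.append(decoded)
            i += decoded[1]
        return out

    print(group_bytes_into_chars(list("héllo".encode("utf-8"))))
    # [('h', 1), ('é', 2), ('l', 1), ('l', 1), ('o', 1)]

Once these per-byte entries come back, process strips the '-1' suffix and concatenates entries that share a patch label via defaultdict(str), so each patch is rendered as one contiguous highlighted span.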
bytelatent/plotting/entropy_figure_via_matplot_lib.py CHANGED
@@ -1,73 +1,123 @@
-import os
 import torch
-import matplotlib.pyplot as plt
 import numpy as np
+import matplotlib.pyplot as plt
+import os
 
+def plot_entropies(  # Renamed from plot_entropies_revised for final output
+    patch_lengths: torch.Tensor,
+    scores: torch.Tensor,
+    tokens: torch.Tensor,  # Length used via scores. Content implicitly for UTF-8 assumption.
+    chars: str,
+    threshold: float
+):
     patch_lengths_np = patch_lengths.cpu().numpy().flatten()
     scores_np = scores.cpu().float().numpy().flatten()
-    chars = chars.replace(" ", "_")
-    tokens_np = np.array([char for char in "<"+chars])
-
-    if len(scores_np) != len(tokens_np):
-        raise ValueError("Length of scores and tokens tensors must be the same.")
-    if patch_lengths_np.sum() != len(tokens_np):
-        raise ValueError(f"Sum of patch_lengths ({patch_lengths_np.sum()}) "
-                         f"does not match the length of tokens/scores ({len(tokens_np)}).")
-
-    x_indices = np.arange(len(tokens_np))
 
+    num_total_bytes_from_scores = len(scores_np)
+
+    # Prepare display string (prepend '<', replace spaces with '_')
+    display_string_processed_chars = chars.replace(" ", "_")
+    display_string = "<" + display_string_processed_chars
+    display_chars_list = list(display_string)
+    num_display_chars = len(display_chars_list)
+
+    if num_display_chars == 0 and num_total_bytes_from_scores == 0:
+        fig, ax = plt.subplots(figsize=(15, 5))
+        ax.text(0.5, 0.5, "No data to plot.", ha='center', va='center', fontsize=12)
+        ax.set_xlabel("Characters (on underlying byte sequence)")
+        ax.set_ylabel("Entropy of Next Byte")
+        ax.set_ylim(bottom=0)
+        ax.set_xlim(left=-0.5, right=0.5)  # Default xlim for empty plot
+        return fig
+    elif num_display_chars == 0 and num_total_bytes_from_scores > 0:
+        # Edge case: scores exist but no characters to map them to (implies an issue)
+        # For now, proceed with byte plot but no char labels. Or raise error.
+        # Assuming display_chars_list should not be empty if scores_np is not.
+        # This case should ideally be caught by byte_counts_per_display_char validation if it were run.
+        # If display_chars_list is truly empty but scores are not, an error should be raised by validation.
+        pass  # Will be caught by validation if sum(byte_counts) != len(scores)
+
+    # Calculate byte counts for each character in the display string (assuming UTF-8)
+    try:
+        byte_counts_per_display_char = [len(c.encode('utf-8')) for c in display_chars_list]
+    except UnicodeEncodeError as e:
+        raise ValueError(
+            f"Could not encode characters in 'chars' string using UTF-8. "
+            f"Problematic part: '{display_string_processed_chars}'. Error: {e}"
+        )
+
+    # --- Validations ---
+    if sum(byte_counts_per_display_char) != num_total_bytes_from_scores:
+        # This condition also handles num_display_chars == 0 but num_total_bytes_from_scores > 0
+        raise ValueError(
+            f"Mismatch in byte counts: Sum of UTF-8 bytes for display_string "
+            f"('{display_string}' -> {sum(byte_counts_per_display_char)} bytes) "
+            f"does not match length of scores tensor ({num_total_bytes_from_scores}). "
+            f"Ensure 'chars' (and the prepended '<') correctly correspond to the byte sequence "
+            f"represented by 'scores'/'tokens'."
+        )
+
+    if patch_lengths_np.sum() != num_total_bytes_from_scores:
+        raise ValueError(
+            f"Sum of patch_lengths ({patch_lengths_np.sum()}) "
+            f"does not match length of scores ({num_total_bytes_from_scores})."
+        )
+
+    # --- Plotting Setup ---
+    fig, ax = plt.subplots(figsize=(15, 5))  # Fixed size as requested
+    x_byte_indices = np.arange(num_total_bytes_from_scores)
+
+    # --- Plot Scores (Horizontally per byte) ---
+    # Original plot line style from user's code: marker='.', linestyle='-'
+    ax.plot(x_byte_indices, scores_np, marker='.', linestyle='-', color='steelblue', label='Scores per byte')
+
+    # --- Plot Vertical Patch Boundary Lines ---
+    # Using (cumulative_length - 0.5) logic for lines between byte elements.
+    # This matches the intent of `boundary - 1 + 0.5` from user's original code snippet.
+    patch_end_byte_cumulative_lengths = np.cumsum(patch_lengths_np)
+    for boundary_len in patch_end_byte_cumulative_lengths[:-1]:  # Exclude the last boundary (end of all data)
+        ax.axvline(x=boundary_len, color='grey', linestyle='--', linewidth=1)
+
+    # --- Horizontal Threshold Line and Annotation ---
+    ax.axhline(y=threshold, color='red', linestyle='--', linewidth=1)
+    ax.annotate(f'Entropy Threshold',            # Original text from user's code
+                xy=(0.05, threshold),            # Original xy from user's code
+                xytext=(0.05, threshold + 0.1),  # Original xytext from user's code
+                xycoords='axes fraction',        # Original xycoords
+                textcoords='data',               # Original textcoords
                 color='red'
                 )
 
+    # --- X-axis Ticks and Labels (Character labels at start of their byte sequences) ---
+    char_label_positions = []
+    char_labels_for_ticks = []
+    current_byte_tracker = 0
+    if num_display_chars > 0:  # Ensure byte_counts_per_display_char is not empty
+        for i_char in range(num_display_chars):
+            char_label_positions.append(current_byte_tracker)
+            char_labels_for_ticks.append(display_chars_list[i_char])
+            current_byte_tracker += byte_counts_per_display_char[i_char]
 
+    ax.set_xticks(char_label_positions)
+    ax.set_xticklabels(char_labels_for_ticks, rotation=0, fontsize=8)  # User's original rotation and fontsize
 
-    ax.set_ylabel("Entropy of Next Byte", fontsize=12)
-    ax.set_xlabel("Tokens", fontsize=12)
+    # --- Axes Configuration ---
+    ax.set_ylabel("Entropy of Next Byte", fontsize=12)  # User's original
+    ax.set_xlabel("Characters (on underlying byte sequence)", fontsize=12)  # Descriptive X-axis label
 
+    ax.set_ylim(bottom=0)  # User's original y-axis bottom limit
+    # Set x-axis limits to show all bytes clearly from -0.5 to last_byte_idx + 0.5
+    if num_total_bytes_from_scores > 0:
+        ax.set_xlim(left=-0.5, right=num_total_bytes_from_scores - 0.5)
+    else:  # Handle case of no bytes (e.g. if chars was empty and scores was empty)
+        ax.set_xlim(left=-0.5, right=0.5)
 
+    # Spines (as per user's original code removing top and right)
     ax.spines['top'].set_visible(False)
     ax.spines['right'].set_visible(False)
 
+    # Grid: User's original code did not explicitly add grid lines.
+
     plt.tight_layout()
     return fig
-    # output_filename = "token_score_plot.png"
-    # fig.savefig(output_filename, dpi=300, bbox_inches='tight')  # Save the figure
-    # print(f"Plot saved to {os.path.abspath(output_filename)}")  # Print confirmation with full path
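Note on the plotting invariant: scores are per byte, so the rewritten plot_entropies checks that the per-character UTF-8 byte counts of the display string sum to len(scores_np), then places each character's tick label at the first byte index it occupies. A small standalone sketch of that mapping, not part of the commit, assuming plain str.encode as the byte-count source (the same assumption the commit makes):

    # Sketch only: x-tick positions so each character labels its first byte,
    # mirroring the char_label_positions loop in plot_entropies above.
    def char_tick_positions(display_string):
        positions, labels = [], []
        byte_cursor = 0
        for ch in display_string:
            positions.append(byte_cursor)  # tick at the character's first byte
            labels.append(ch)
            byte_cursor += len(ch.encode("utf-8"))
        return positions, labels, byte_cursor  # byte_cursor must equal len(scores_np)

    positions, labels, total_bytes = char_tick_positions("<héllo")
    print(positions)    # [0, 1, 2, 4, 5, 6]  ('é' occupies byte indices 2 and 3)
    print(total_bytes)  # 7 bytes for 6 characters

If the byte total and the score length disagree, the function raises a ValueError instead of silently mislabeling the axis, which is what the byte-count validation above enforces.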