luca-peric committed
Commit 41ea791 · 1 Parent(s): 1b67cbe

Visualisation working on CPU via CUDA_VISIBLE_DEVICES=-1 python demo_patcher.py 'Daenerys Targaryen is in Game of Thrones, a fantasy epic by George R.R. Martin.'

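For context, CUDA_VISIBLE_DEVICES=-1 hides all GPUs from PyTorch so everything falls back to CPU. The same effect can be had in-process; a minimal sketch (not part of this commit), where the variable must be set before CUDA is first initialised:

import os

# Hide all GPUs so PyTorch falls back to CPU. This must run before anything
# initialises CUDA (simplest: before importing torch).
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import torch

print(torch.cuda.is_available())  # False when no GPU is visible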
bytelatent/entropy_model.py CHANGED
@@ -27,8 +27,8 @@ def load_entropy_model(entropy_model_checkpoint_dir, state_dict_path, device="cpu"):
             max_seqlen=model_params["max_seqlen"],
             ffn_dim_multiplier=model_params["ffn_dim_multiplier"],
             vocab_size=model_params["vocab_size"],
-            attn_bias_type="local_block_causal",
-            attn_impl="xformers",
+            attn_bias_type="causal",
+            attn_impl="sdpa",
             sliding_window=512,
         )
     )
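Switching the entropy model from attn_impl="xformers" to "sdpa" with a plain "causal" bias appears to be what lets it run on CPU, since PyTorch's built-in scaled_dot_product_attention has a CPU path while the xformers block-causal kernels are GPU-only. A minimal sketch of the equivalent attention call; the tensor names and shapes below are illustrative, not the repo's:

import torch
import torch.nn.functional as F

# Toy shapes: (batch, heads, seq_len, head_dim); values are illustrative only.
q = torch.randn(1, 4, 16, 32)
k = torch.randn(1, 4, 16, 32)
v = torch.randn(1, 4, 16, 32)

# "sdpa" + "causal" roughly corresponds to letting PyTorch apply the causal
# mask itself; this works on CPU as well as GPU.
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(out.shape)  # torch.Size([1, 4, 16, 32])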
bytelatent/generate_patcher.py ADDED
@@ -0,0 +1,130 @@
+import logging
+import os
+from typing import Tuple
+
+import torch
+
+from bytelatent.args import EvalArgs
+from bytelatent.config_parser import parse_args_to_pydantic_model
+from bytelatent.data.file_util import get_fs
+from bytelatent.data.patcher import Patcher
+from bytelatent.distributed import (
+    DistributedArgs,
+    dist_max,
+    dist_min,
+    dist_sum,
+    get_device_mesh,
+    setup_torch_distributed,
+)
+from bytelatent.generate import load_consolidated_model_and_tokenizer
+from bytelatent.model.blt import ByteLatentTransformer
+from bytelatent.tokenizers.blt_tokenizer import BltTokenizer
+
+logger = logging.getLogger()
+
+
+def get_max_length(input_tokens: list[list[int]] | None) -> int:
+    # Reduce the max prompt length over all processes so that each process
+    # makes an equal number of calls under FSDP.
+    if input_tokens is None:
+        max_length = 0
+    else:
+        max_length = max([len(t) for t in input_tokens])
+    if torch.distributed.is_initialized():
+        max_length = int(dist_max(max_length))
+    return max_length
+
+
+def get_min_length(input_tokens: list[list[int]] | None) -> int:
+    # Reduce the min prompt length over all processes so that each process
+    # makes an equal number of calls under FSDP.
+    if input_tokens is None:
+        # TODO: Double check this change from int(1e9) is correct
+        min_length = 0
+    else:
+        min_length = min([len(t) for t in input_tokens])
+    if torch.distributed.is_initialized():
+        min_length = int(dist_min(min_length))
+    return min_length
+
+
+def get_generation_range(
+    prompt_tokens: list[list[int]] | None, max_gen_len: int
+) -> tuple[int, int]:
+    batch_min_prompt_length = get_min_length(prompt_tokens)
+    batch_max_prompt_length = get_max_length(prompt_tokens)
+    return batch_min_prompt_length, batch_max_prompt_length + max_gen_len
+
+
+def sample_top_k(probs, k):
+    topk_value, _ = torch.topk(probs, k)  # batch_sz x topk
+    min_value_top_k = topk_value[:, [-1]]
+    probs[probs < min_value_top_k] = 0.0
+    probs.div_(probs.sum(dim=-1, keepdim=True))
+    next_token = torch.multinomial(probs, num_samples=1)
+    return next_token
+
+
+def sample_top_p(probs, p):
+    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
+    probs_sum = torch.cumsum(probs_sort, dim=-1)
+    mask = probs_sum - probs_sort > p
+    probs_sort[mask] = 0.0
+    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
+    next_token = torch.multinomial(probs_sort, num_samples=1)
+    next_token = torch.gather(probs_idx, -1, next_token)
+    return next_token
+
+
+@torch.inference_mode()
+def patcher_nocache(
+    prompts: list[str] | None,
+    *,
+    tokenizer: BltTokenizer,
+    patcher: Patcher,
+    max_prompt_len: int = 256,
+    max_gen_len: int = 256,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None:
+    assert (
+        patcher.realtime_patching
+    ), "generate_nocache requires patcher.realtime_patching=True"
+    if prompts is None:
+        prompt_tokens = None
+        n_truncated_prompts = 0
+        total_truncated_prompts = 0
+    else:
+        prompt_tokens = [tokenizer.encode(t, add_eos=False) for t in prompts]
+        n_truncated_prompts = sum([max_prompt_len < len(t) for t in prompt_tokens])
+        if torch.distributed.is_initialized():
+            total_truncated_prompts = dist_sum(n_truncated_prompts)
+        else:
+            total_truncated_prompts = n_truncated_prompts
+
+        # Truncation: keep the rightmost max_prompt_len tokens of each prompt
+        prompt_tokens = [
+            t if len(t) < max_prompt_len else t[len(t) - max_prompt_len :]
+            for t in prompt_tokens
+        ]
+
+    if total_truncated_prompts > 0:
+        logger.info(
+            f"There are {total_truncated_prompts} prompts that are truncated on the left, "
+            f"length greater than max_prompt_len = {max_prompt_len}, "
+            f"maximum prompt length = {get_max_length(prompt_tokens)} across all gpus."
+        )
+
+    if prompt_tokens is None:
+        prompt_tokens = [[tokenizer.bos_id] for _ in range(end_pos)]
+
+    start_pos, end_pos = get_generation_range(prompt_tokens, max_gen_len)
+    batch_size = len(prompt_tokens)
+    tokens = (
+        torch.full((batch_size, end_pos), tokenizer.pad_id).to(patcher.device).long()
+    )
+
+    # Copy inputs into the tensor that would hold generated tokens
+    for i, row_tokens in enumerate(prompt_tokens):
+        tokens[i, : len(row_tokens)] = torch.tensor(row_tokens).long()
+
+    for i, curr_pos in enumerate(range(start_pos, end_pos)):
+        current_tokens = tokens[:, :curr_pos]
+        patch_lengths, scores = patcher.patch(current_tokens, include_next_token=False)
+        # Return immediately since we are not generating token t+1
+        return patch_lengths, scores, current_tokens
+    return None
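sample_top_k and sample_top_p are carried over from the original generation loop and are not called by patcher_nocache, which returns after the first patching step. (Note also that the prompt_tokens-is-None fallback references end_pos before it is assigned; the demo below always supplies a prompt, so that branch is never taken.) A minimal sketch of what the two samplers do on a toy distribution; the tensor values are illustrative, not from the repo:

import torch

from bytelatent.generate_patcher import sample_top_k, sample_top_p

# One "batch" row with most of the probability mass on token 2.
probs = torch.tensor([[0.10, 0.20, 0.60, 0.10]])

# Top-k: zero out everything outside the k most likely tokens, renormalise,
# then sample. Both helpers mutate their input, hence the .clone() calls.
print(sample_top_k(probs.clone(), k=2))   # index 1 or 2

# Top-p (nucleus): keep the smallest prefix of sorted tokens whose cumulative
# mass covers p, renormalise, then sample from that nucleus.
print(sample_top_p(probs.clone(), p=0.8))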
bytelatent/plotting/entropy_figure_via_matplot_lib.py ADDED
@@ -0,0 +1,67 @@
+import os
+
+import torch
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+def plot_entropies(
+    patch_lengths: torch.Tensor, scores: torch.Tensor, chars: str, threshold: float
+):
+    patch_lengths_np = patch_lengths.cpu().numpy().flatten()
+    scores_np = scores.cpu().float().numpy().flatten()
+    chars = chars.replace(" ", "_")
+    tokens_np = np.array([char for char in "<" + chars])
+
+    if len(scores_np) != len(tokens_np):
+        raise ValueError("Length of scores and tokens tensors must be the same.")
+    if patch_lengths_np.sum() != len(tokens_np):
+        raise ValueError(
+            f"Sum of patch_lengths ({patch_lengths_np.sum()}) "
+            f"does not match the length of tokens/scores ({len(tokens_np)})."
+        )
+
+    x_indices = np.arange(len(tokens_np))
+
+    # Cumulative sums of patch lengths give the *end* index of each patch;
+    # these are used as the positions of the vertical boundary lines.
+    patch_boundaries = np.cumsum(patch_lengths_np)
+
+    # --- Plotting ---
+    fig, ax = plt.subplots(figsize=(15, 5))  # Adjust figure size as needed
+
+    # Plot the scores as a blue line with markers
+    ax.plot(x_indices, scores_np, marker='.', linestyle='-', color='steelblue', label='Scores')
+
+    # Plot a vertical dashed line after each patch; the last boundary is the
+    # end of the data, so it is skipped.
+    for boundary in patch_boundaries[:-1]:
+        ax.axvline(x=boundary, color='grey', linestyle='--', linewidth=1)
+
+    # Horizontal line marking the patching entropy threshold
+    ax.axhline(y=threshold, color='red', linestyle='--', linewidth=1)
+
+    # Set x-axis ticks and labels (one tick per byte/character)
+    ax.set_xticks(x_indices)
+    ax.set_xticklabels(tokens_np, rotation=0, fontsize=8)
+
+    # Axis labels
+    ax.set_ylabel("Entropy of Next Byte", fontsize=12)
+    ax.set_xlabel("Tokens", fontsize=12)
+
+    # Axis limits: start y at 0 and pad the x-axis slightly
+    ax.set_ylim(bottom=0)
+    ax.set_xlim(left=x_indices[0] - 1.0, right=x_indices[-1] + 1.0)
+
+    # Optional grid lines
+    # ax.grid(True, axis='y', linestyle=':', color='lightgrey')
+
+    # Remove the top and right spines for a cleaner look
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+
+    # Adjust layout and save the figure
+    plt.tight_layout()
+    output_filename = "token_score_plot.png"
+    fig.savefig(output_filename, dpi=300, bbox_inches='tight')
+    print(f"Plot saved to {os.path.abspath(output_filename)}")
+
+    # Close the figure to free memory
+    plt.close(fig)
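A minimal sketch of calling plot_entropies outside the demo, with hand-made tensors; the values are illustrative only (in the demo they come from patcher_nocache and tokenizer.decode):

import torch

from bytelatent.plotting.entropy_figure_via_matplot_lib import plot_entropies

text = "Hi!"  # decoded characters for one prompt
# plot_entropies prepends a "<" placeholder, so patch lengths and scores
# must cover len(text) + 1 positions.
patch_lengths = torch.tensor([1, 2, 1])       # sums to 4 == len("<" + text)
scores = torch.tensor([1.2, 0.4, 2.1, 0.7])   # one entropy value per byte
plot_entropies(patch_lengths, scores, text, threshold=1.33)  # writes token_score_plot.png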
demo_patcher.py ADDED
@@ -0,0 +1,49 @@
+import os
+
+import torch
+import typer
+
+from bytelatent.data.file_util import get_fs
+from bytelatent.distributed import DistributedArgs, setup_torch_distributed
+from bytelatent.generate_patcher import patcher_nocache
+from bytelatent.tokenizers.blt_tokenizer import BltTokenizer
+from bytelatent.plotting.entropy_figure_via_matplot_lib import plot_entropies
+
+
+def main(prompt: str, model_name: str = "blt-1b"):
+    from bytelatent.args import TrainArgs
+
+    consolidated_path = os.path.join("hf-weights", model_name)
+    train_args_path = os.path.join(consolidated_path, "params.json")
+    fs = get_fs(train_args_path)
+    train_args = TrainArgs.model_validate_json(fs.read_text(train_args_path))
+
+    tokenizer = train_args.data.tokenizer_args.build()
+    assert isinstance(tokenizer, BltTokenizer)
+    patcher_args = train_args.data.patcher_args.model_copy(deep=True)
+    patcher_args.realtime_patching = True
+    # NOTE: CPU currently unsupported due to reliance on xformers
+    patcher_args.patching_device = "cpu"
+    patcher_args.device = "cpu"
+    print("Loading entropy model and patcher")
+    patcher_args.entropy_model_checkpoint_dir = os.path.join(
+        consolidated_path, "entropy_model"
+    )
+    patcher = patcher_args.build()
+    prompts = [prompt]
+    results = patcher_nocache(prompts, tokenizer=tokenizer, patcher=patcher)
+    if not results:
+        raise Exception("Ruh roh")
+    batch_patch_lengths, batch_scores, batch_tokens = results
+    decoded_chars = [tokenizer.decode(row_tokens.tolist()) for row_tokens in batch_tokens]
+    plot_entropies(
+        batch_patch_lengths[0],
+        batch_scores[0],
+        decoded_chars[0],
+        threshold=patcher.threshold,
+    )
+
+
+if __name__ == "__main__":
+    typer.run(main)