# /// script
# [tool.marimo.runtime]
# auto_instantiate = false
# ///

import marimo

__generated_with = "0.13.0"
app = marimo.App(width="medium")
@app.cell
def _():
    import hashlib
    import math

    import altair as alt
    import marimo as mo
    import polars as pl
    import spacy
    from transformers import AutoTokenizer

    # Load spaCy models for English and Japanese
    nlp_en = spacy.load("en_core_web_md")
    nlp_ja = spacy.load("ja_core_news_md")

    # List of tokenizer models
    llm_model_choices = [
        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "google/gemma-3-27b-it",
        "deepseek-ai/DeepSeek-R1",
        "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        "Qwen/Qwen2.5-72B-Instruct",
        "google-bert/bert-large-uncased",
        "openai-community/gpt2",
    ]
    return (
        AutoTokenizer,
        alt,
        hashlib,
        llm_model_choices,
        math,
        mo,
        nlp_en,
        nlp_ja,
        pl,
    )
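# NOTE: The spaCy pipelines loaded above are not bundled with spaCy itself; they must
# be installed in the environment first, e.g.:
#   python -m spacy download en_core_web_md
#   python -m spacy download ja_core_news_md
# (The Japanese pipeline additionally pulls in SudachiPy for word segmentation.)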
@app.cell
def _(mo):
    mo.md("# Tokenization for English and Japanese")
    return
@app.cell
def _(mo):
    # Central state for the text input content
    get_text_content, set_text_content = mo.state("")
    return get_text_content, set_text_content
@app.cell
def _(mo):
    # Placeholder texts
    en_placeholder = """
Mrs. Ferrars died on the night of the 16th–17th September—a Thursday. I was sent for at eight o’clock on the morning of Friday the 17th. There was nothing to be done. She had been dead some hours.
""".strip()
    ja_placeholder = """
吾輩は猫である。名前はまだ無い。
どこで生れたかとんと見当がつかぬ。何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している。
""".strip()

    # Create UI element for language selection
    language_selector = mo.ui.radio(
        options=["English", "Japanese"], value="English", label="Language"
    )
    # Return selector and placeholders
    return en_placeholder, ja_placeholder, language_selector
@app.cell
def _(
    en_placeholder,
    get_text_content,
    ja_placeholder,
    language_selector,
    mo,
    set_text_content,
):
    # Define text_input dynamically based on language
    current_placeholder = (
        en_placeholder if language_selector.value == "English" else ja_placeholder
    )
    text_input = mo.ui.text_area(
        # Read value from state
        value=get_text_content(),
        label="Enter text",
        placeholder=current_placeholder,
        full_width=True,
        # Update state on user input
        on_change=lambda v: set_text_content(v),
    )
    return current_placeholder, text_input
@app.cell
def _(current_placeholder, mo, set_text_content):
    def apply_placeholder():
        set_text_content(current_placeholder)

    apply_placeholder_button = mo.ui.button(
        label="Use Placeholder Text", on_click=lambda _: apply_placeholder()
    )
    return (apply_placeholder_button,)
@app.cell
def _(apply_placeholder_button, language_selector, mo, text_input):
    mo.vstack(
        [
            text_input,
            mo.hstack([language_selector, apply_placeholder_button], justify="start"),
        ]
    )
    return
@app.cell
def _(get_text_content, language_selector, mo, nlp_en, nlp_ja):
    # Analyze text using spaCy based on selected language
    # Read text from state
    current_text = get_text_content()
    if language_selector.value == "English":
        doc = nlp_en(current_text)
    else:
        doc = nlp_ja(current_text)

    # Tokenized version and count
    tokenized_text = [token.text for token in doc]
    token_count = len(tokenized_text)

    mo.md(
        f"**Tokenized Text:** {' | '.join(tokenized_text)}\n\n**Token Count:** {token_count}"
    )
    return current_text, doc
@app.cell
def _(doc, mo, pl):
    # Create a polars DataFrame with token attributes
    token_data = pl.DataFrame(
        {
            "Token": [token.text for token in doc],
            "Lemma": [token.lemma_ for token in doc],
            "POS": [token.pos_ for token in doc],
            "Tag": [token.tag_ for token in doc],
            "Morph": [
                str(token.morph) for token in doc
            ],  # To be more precise, this should be merged back in via .to_dict()
            "Token Position": list(range(len(doc))),
            "Sentence Number": [
                i for i, sent in enumerate(doc.sents) for token in sent
            ],
        }
    )

    mo.ui.dataframe(token_data, page_size=50)
    return (token_data,)
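# NOTE: The "Sentence Number" column relies on doc.sents, which is only available
# because the *_core_*_md pipelines set sentence boundaries (via their parser);
# a pipeline without sentence segmentation would raise an error here.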
@app.cell
def _(mo):
    # Create UI element for selecting the column to visualize
    column_selector = mo.ui.dropdown(
        options=["POS", "Tag", "Lemma", "Token", "Morph"],
        value="POS",
        label="Select column to visualize",
    )
    column_selector
    return (column_selector,)
@app.cell
def _(alt, column_selector, mo, token_data):
    mo.stop(token_data.is_empty(), "Please set input text.")

    selected_column = column_selector.value
    # Calculate value counts for the selected column
    counts_df = (
        token_data[selected_column]
        .value_counts()
        .sort(by=["count", selected_column], descending=[True, False])
    )

    chart = (
        alt.Chart(counts_df)
        .mark_bar()
        .encode(
            x=alt.X("count", title="Frequency"),
            y=alt.Y(selected_column, title=selected_column, sort=None),
            tooltip=[selected_column, "count"],
        )
        .properties(title=f"{selected_column} Distribution")
        .interactive()
    )

    mo.ui.altair_chart(chart)
    return
@app.cell
def _(llm_model_choices, mo):
    # UI for selecting the LLM tokenizer model
    llm_tokenizer_selector = mo.ui.dropdown(
        options=llm_model_choices,
        value=llm_model_choices[-1],  # Default to gpt2 for faster loading initially
        label="Select LLM Tokenizer Model",
    )
    llm_tokenizer_selector
    return (llm_tokenizer_selector,)
@app.cell
def _(AutoTokenizer, llm_tokenizer_selector):
    # Load the selected tokenizer
    # Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
    # This cell re-runs whenever llm_tokenizer_selector.value changes; the Hugging Face
    # hub caches downloaded files locally, so previously used models reload quickly.
    selected_model_name = llm_tokenizer_selector.value
    tokenizer = AutoTokenizer.from_pretrained(selected_model_name)
    return (tokenizer,)
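# NOTE: Several of the selectable checkpoints (e.g. the meta-llama and google/gemma
# repositories) are gated on the Hugging Face Hub; loading their tokenizers requires
# accepting the model license and authenticating, e.g. via `huggingface-cli login`
# or an HF_TOKEN environment variable. The default (openai-community/gpt2) is ungated.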
@app.cell
def _(math):
    # Function to calculate token statistics
    def get_token_stats(tokens: list, original_text: str) -> dict:
        """Calculate enhanced statistics about the tokens."""
        if not tokens:
            return {  # Return default structure even for empty input
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
                    "compression_ratio": 0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
                    "unique_percentage": 0,
                },
                "length_stats": {
                    "avg_length": 0,
                    "std_dev": 0,
                    "min_length": 0,
                    "max_length": 0,
                    "median_length": 0,
                },
            }

        total_tokens = len(tokens)
        unique_tokens = len(set(tokens))
        # total_tokens > 0 is guaranteed here by the `if not tokens` check above,
        # but guard the divisions anyway
        avg_length = (
            sum(len(t) for t in tokens) / total_tokens if total_tokens > 0 else 0
        )
        compression_ratio = len(original_text) / total_tokens if total_tokens > 0 else 0

        # Token type analysis (Note: heuristics vary between tokenizers)
        # startswith(("Ġ", "▁")) covers the common space markers: BPE's "Ġ" and
        # SentencePiece's U+2581 ("▁")
        space_tokens = sum(1 for t in tokens if t.startswith(("Ġ", "▁")))
        # Check for common newline representations
        newline_tokens = sum(
            1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
        )
        # A broader definition for special tokens based on common patterns (control tokens)
        special_tokens = sum(
            1
            for t in tokens
            if (t.startswith("<") and t.endswith(">"))
            or (t.startswith("[") and t.endswith("]"))
        )
        # Simple punctuation check (might overlap with other categories; focuses on single-char punctuation)
        punctuation_tokens = sum(
            1
            for t in tokens
            if len(t) == 1 and not t.isalnum() and t not in [" ", "\n", "Ġ", "Ċ"]
        )

        # Length distribution
        lengths = [len(t) for t in tokens]
        if not lengths:  # Should not happen if tokens is not empty, but safe check
            return {
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
                    "compression_ratio": 0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
                    "unique_percentage": 0,
                },
                "length_stats": {
                    "avg_length": 0,
                    "std_dev": 0,
                    "min_length": 0,
                    "max_length": 0,
                    "median_length": 0,
                },
            }

        mean_length = sum(lengths) / len(lengths)
        # Population standard deviation (divides by N, not N - 1)
        variance = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
        std_dev = math.sqrt(variance)
        sorted_lengths = sorted(lengths)
        # Upper median for even-length lists
        median_length = sorted_lengths[len(lengths) // 2] if lengths else 0

        return {
            "basic_stats": {
                "total_tokens": total_tokens,
                "unique_tokens": unique_tokens,
                "compression_ratio": round(compression_ratio, 2),
                "space_tokens": space_tokens,
                "newline_tokens": newline_tokens,
                "special_tokens": special_tokens,
                "punctuation_tokens": punctuation_tokens,
                "unique_percentage": round(unique_tokens / total_tokens * 100, 1)
                if total_tokens > 0
                else 0,
            },
            "length_stats": {
                "avg_length": round(avg_length, 2),
                "std_dev": round(std_dev, 2),
                "min_length": min(lengths) if lengths else 0,
                "max_length": max(lengths) if lengths else 0,
                "median_length": median_length,
            },
        }

    return (get_token_stats,)
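# Illustrative example (assuming GPT-2-style tokens):
#   get_token_stats(["Hello", "Ġworld", "!"], "Hello world!")
# reports total_tokens=3, space_tokens=1 (the "Ġ"-prefixed token),
# punctuation_tokens=1, compression_ratio=4.0 (12 characters / 3 tokens),
# and avg_length=4.0 ((5 + 6 + 1) / 3).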
@app.cell
def _(hashlib):
    def get_varied_color(token: str) -> dict:
        """Generate vibrant colors with HSL for better visual distinction."""
        # MD5 is deterministic, so a given token always maps to the same color
        token_hash = hashlib.md5(token.encode()).hexdigest()
        hue = int(token_hash[:3], 16) % 360
        saturation = 70 + (int(token_hash[3:5], 16) % 20)  # Saturation between 70-90%
        lightness = 80 + (
            int(token_hash[5:7], 16) % 10
        )  # Lightness between 80-90% (light background)
        # Ensure text color contrasts well with the light background
        text_lightness = 20  # Dark text for light background
        return {
            "background": f"hsl({hue}, {saturation}%, {lightness}%)",
            "text": f"hsl({hue}, {saturation}%, {text_lightness}%)",
        }

    return (get_varied_color,)
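# Because the color is derived from a hash of the token string, repeated occurrences
# of the same token share the same background color; the colors carry no meaning
# beyond making adjacent tokens visually distinguishable.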
@app.function
def fix_token(token: str) -> str:
    """Fix token for display with improved space visualization."""
    # Replace the SentencePiece space marker U+2581 ("▁") with a middle dot
    token = token.replace("▁", "·")
    # Replace the BPE space marker "Ġ" with a middle dot
    if token.startswith("Ġ"):
        space_count = token.count("Ġ")
        return "·" * space_count + token[space_count:]
    # Replace newline markers for display
    token = token.replace(
        "Ċ", "↵\n"
    )  # Replace newline marker with symbol and actual newline
    token = token.replace("<0x0A>", "↵\n")  # Handle byte representation of newline
    return token
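# Quick examples of the display fix-ups above:
#   fix_token("Ġworld") -> "·world"
#   fix_token("▁world") -> "·world"
#   fix_token("Ċ")      -> "↵" followed by a real newline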
@app.function
def get_tokenizer_info(tokenizer):
    """
    Extract useful information from a tokenizer.
    Returns a dictionary with tokenizer details.
    """
    info = {}
    try:
        # Get vocabulary size (dictionary size)
        if hasattr(tokenizer, "vocab_size"):
            info["vocab_size"] = tokenizer.vocab_size
        elif hasattr(tokenizer, "get_vocab"):
            info["vocab_size"] = len(tokenizer.get_vocab())

        # Get model max length if available
        if (
            hasattr(tokenizer, "model_max_length")
            and tokenizer.model_max_length < 1000000
        ):  # Sanity check for realistic values
            info["model_max_length"] = tokenizer.model_max_length
        else:
            info["model_max_length"] = "Not specified or very large"

        # Check tokenizer type
        info["tokenizer_type"] = tokenizer.__class__.__name__

        # Get special tokens using the recommended attributes/methods
        special_tokens = {}

        # Prefer all_special_tokens if available
        if hasattr(tokenizer, "all_special_tokens"):
            for token in tokenizer.all_special_tokens:
                # Try to find the attribute name corresponding to the token value
                token_name = "unknown_special_token"  # Default name
                for attr_name in [
                    "pad_token",
                    "eos_token",
                    "bos_token",
                    "sep_token",
                    "cls_token",
                    "unk_token",
                    "mask_token",
                ]:
                    if (
                        hasattr(tokenizer, attr_name)
                        and getattr(tokenizer, attr_name) == token
                    ):
                        token_name = attr_name
                        break
                if token and str(token).strip():
                    special_tokens[token_name] = str(token)
        else:
            # Fallback to checking individual attributes
            for token_name in [
                "pad_token",
                "eos_token",
                "bos_token",
                "sep_token",
                "cls_token",
                "unk_token",
                "mask_token",
            ]:
                if (
                    hasattr(tokenizer, token_name)
                    and getattr(tokenizer, token_name) is not None
                ):
                    token_value = getattr(tokenizer, token_name)
                    if token_value and str(token_value).strip():
                        special_tokens[token_name] = str(token_value)

        info["special_tokens"] = special_tokens if special_tokens else "None found"
    except Exception as e:
        info["error"] = f"Error extracting tokenizer info: {str(e)}"

    return info
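# Illustrative output for the default openai-community/gpt2 tokenizer (exact values
# may differ across transformers versions):
#   {"vocab_size": 50257, "model_max_length": 1024,
#    "tokenizer_type": "GPT2TokenizerFast",
#    "special_tokens": {"eos_token": "<|endoftext|>"}}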
@app.cell
def _(mo):
    show_ids_switch = mo.ui.switch(label="Show Token IDs instead of Text", value=False)
    return (show_ids_switch,)
@app.cell
def _(
    current_text,
    get_token_stats,
    get_varied_color,
    llm_tokenizer_selector,
    mo,
    show_ids_switch,
    tokenizer,
):
    # --- Tokenization and Data Preparation ---

    # Get tokenizer metadata
    tokenizer_info = get_tokenizer_info(tokenizer)

    # Tokenize the input text
    # Use tokenize() to get string representations for analysis and display
    all_tokens = tokenizer.tokenize(current_text)
    total_token_count = len(all_tokens)

    # Limit the number of tokens for display to avoid browser slowdown
    display_limit = 1000
    display_tokens = all_tokens[:display_limit]
    display_limit_reached = total_token_count > display_limit

    # Generate data for visualization
    llm_token_data = []
    for idx, token in enumerate(display_tokens):
        colors = get_varied_color(token)
        fixed_token_display = fix_token(token)  # Apply fixes for display
        # Handle potential errors during ID conversion (e.g., unknown tokens if not handled by tokenizer)
        try:
            token_id = tokenizer.convert_tokens_to_ids(token)
        except KeyError:
            token_id = (
                tokenizer.unk_token_id if hasattr(tokenizer, "unk_token_id") else -1
            )  # Use UNK id or -1

        llm_token_data.append(
            {
                "original": token,
                "display": fixed_token_display,
                "colors": colors,
                "is_newline": "↵"
                in fixed_token_display,  # Check if it represents a newline
                "token_id": token_id,
                "token_index": idx,
            }
        )

    # Calculate statistics using the full token list
    token_stats = get_token_stats(all_tokens, current_text)

    # Construct HTML for colored tokens
    html_parts = []
    for item in llm_token_data:
        # Use pre-wrap to respect spaces and newlines within the token display
        style = f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
        # Add title attribute for hover info (original token + ID)
        title = f"Original: {item['original']}\nID: {item['token_id']}"
        display_content = (
            str(item["token_id"]) if show_ids_switch.value else item["display"]
        )
        html_parts.append(
            f'<span style="{style}" title="{title}">{display_content}</span>'
        )

    token_viz_html = mo.Html(
        f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>'
    )

    basic_stats = token_stats["basic_stats"]
    length_stats = token_stats["length_stats"]

    basic_stats_md = "**Basic Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in basic_stats.items()
    )
    length_stats_md = "**Length (Character) Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in length_stats.items()
    )

    mo.md(f"""# LLM tokenizer: {llm_tokenizer_selector.value}

{show_ids_switch}

## Tokenizer output

{mo.as_html(token_viz_html)}

## Token Statistics

{basic_stats_md}

{length_stats_md}
""")
    return
@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()