# /// script
# [tool.marimo.runtime]
# auto_instantiate = false
# ///
import marimo
__generated_with = "0.13.0"
app = marimo.App(width="medium")
@app.cell
def _():
    import hashlib
    import html
    import math
import altair as alt
import marimo as mo
import polars as pl
import spacy
from transformers import AutoTokenizer
    # Load spaCy models for English and Japanese (assumes the `en_core_web_md`
    # and `ja_core_news_md` models are installed, e.g. via
    # `python -m spacy download en_core_web_md`)
nlp_en = spacy.load("en_core_web_md")
nlp_ja = spacy.load("ja_core_news_md")
# List of tokenizer models
llm_model_choices = [
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"google/gemma-3-27b-it",
"deepseek-ai/DeepSeek-R1",
"mistralai/Mistral-Small-3.1-24B-Instruct-2503",
"Qwen/Qwen2.5-72B-Instruct",
"google-bert/bert-large-uncased",
"openai-community/gpt2",
]
    return (
        AutoTokenizer,
        alt,
        hashlib,
        html,
        llm_model_choices,
        math,
        mo,
        nlp_en,
        nlp_ja,
        pl,
    )
@app.cell
def _(mo):
mo.md("# Tokenization for English and Japanese")
return
@app.cell
def _(mo):
# Central state for the text input content
get_text_content, set_text_content = mo.state("")
return get_text_content, set_text_content
@app.cell
def _(mo):
# Placeholder texts
en_placeholder = """
Mrs. Ferrars died on the night of the 16th⁠–⁠17th September⁠—a Thursday. I was sent for at eight o’clock on the morning of Friday the 17th. There was nothing to be done. She had been dead some hours.
""".strip()
ja_placeholder = """
吾輩は猫である。名前はまだ無い。
 どこで生れたかとんと見当がつかぬ。何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している。
""".strip()
# Create UI element for language selection
language_selector = mo.ui.radio(
options=["English", "Japanese"], value="English", label="Language"
)
# Return selector and placeholders
return en_placeholder, ja_placeholder, language_selector
@app.cell
def _(
en_placeholder,
get_text_content,
ja_placeholder,
language_selector,
mo,
set_text_content,
):
# Define text_input dynamically based on language
current_placeholder = (
en_placeholder if language_selector.value == "English" else ja_placeholder
)
text_input = mo.ui.text_area(
# Read value from state
value=get_text_content(),
label="Enter text",
placeholder=current_placeholder,
full_width=True,
# Update state on user input
on_change=lambda v: set_text_content(v),
)
return current_placeholder, text_input
@app.cell
def _(current_placeholder, mo, set_text_content):
def apply_placeholder():
set_text_content(current_placeholder)
apply_placeholder_button = mo.ui.button(
label="Use Placeholder Text", on_click=lambda _: apply_placeholder()
)
return (apply_placeholder_button,)
@app.cell
def _(apply_placeholder_button, language_selector, mo, text_input):
mo.vstack(
[
text_input,
mo.hstack([language_selector, apply_placeholder_button], justify="start"),
]
)
return
@app.cell
def _(get_text_content, language_selector, mo, nlp_en, nlp_ja):
# Analyze text using spaCy based on selected language
# Read text from state
current_text = get_text_content()
if language_selector.value == "English":
doc = nlp_en(current_text)
else:
doc = nlp_ja(current_text)
# Tokenized version and count
tokenized_text = [token.text for token in doc]
token_count = len(tokenized_text)
mo.md(
f"**Tokenized Text:** {' | '.join(tokenized_text)}\n\n**Token Count:** {token_count}"
)
return current_text, doc
@app.cell
def _(doc, mo, pl):
# Create a polars DataFrame with token attributes
token_data = pl.DataFrame(
{
"Token": [token.text for token in doc],
"Lemma": [token.lemma_ for token in doc],
"POS": [token.pos_ for token in doc],
"Tag": [token.tag_ for token in doc],
"Morph": [
str(token.morph) for token in doc
], # To be more precise, this should be merged back in via .to_dict()
"Token Position": list(range(len(doc))),
"Sentence Number": [
i for i, sent in enumerate(doc.sents) for token in sent
],
}
)
mo.ui.dataframe(token_data, page_size=50)
return (token_data,)
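@app.cell
def _(doc, pl):
    # Hedged sketch: the Morph column above keeps each token's morphology as a
    # plain string. token.morph.to_dict() yields per-feature key/value pairs,
    # which can instead be expanded into one column per feature.
    # `_morph_expanded` is an illustrative, cell-local name, not part of the
    # app's UI flow.
    _morph_dicts = [token.morph.to_dict() for token in doc]
    _morph_expanded = (
        pl.from_dicts(_morph_dicts, infer_schema_length=None)
        if any(_morph_dicts)
        else pl.DataFrame()
    )
    _morph_expanded
    return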
@app.cell
def _(mo):
# Create UI element for selecting the column to visualize
column_selector = mo.ui.dropdown(
options=["POS", "Tag", "Lemma", "Token", "Morph"],
value="POS",
label="Select column to visualize",
)
column_selector
return (column_selector,)
@app.cell
def _(alt, column_selector, mo, token_data):
mo.stop(token_data.is_empty(), "Please set input text.")
selected_column = column_selector.value
# Calculate value counts for the selected column
counts_df = (
token_data[selected_column]
.value_counts()
.sort(by=["count", selected_column], descending=[True, False])
)
chart = (
alt.Chart(counts_df)
.mark_bar()
.encode(
x=alt.X("count", title="Frequency"),
y=alt.Y(selected_column, title=selected_column, sort=None),
tooltip=[selected_column, "count"],
)
.properties(title=f"{selected_column} Distribution")
.interactive()
)
mo.ui.altair_chart(chart)
return
@app.cell
def _(llm_model_choices, mo):
# UI for selecting the LLM tokenizer model
llm_tokenizer_selector = mo.ui.dropdown(
options=llm_model_choices,
value=llm_model_choices[-1], # Default to gpt2 for faster loading initially
label="Select LLM Tokenizer Model",
)
llm_tokenizer_selector
return (llm_tokenizer_selector,)
@app.cell
def _(AutoTokenizer, llm_tokenizer_selector):
# Load the selected tokenizer
# Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
    # This cell re-runs whenever llm_tokenizer_selector.value changes.
    # Hugging Face caches downloaded files on disk, so repeat loads of the
    # same model are fast (see the optional in-memory memoization sketch below)
selected_model_name = llm_tokenizer_selector.value
tokenizer = AutoTokenizer.from_pretrained(selected_model_name)
return (tokenizer,)
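@app.cell
def _(AutoTokenizer):
    # Optional sketch: switching models re-runs the loading cell each time.
    # Disk caching already makes repeat downloads cheap, but an in-memory
    # memoizer like the hypothetical helper below would also skip tokenizer
    # re-instantiation. It is not used by the cells above.
    from functools import lru_cache

    @lru_cache(maxsize=8)
    def _load_tokenizer_cached(model_name: str):
        return AutoTokenizer.from_pretrained(model_name)
    return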
@app.cell
def _(math):
# Function to calculate token statistics
def get_token_stats(tokens: list, original_text: str) -> dict:
"""Calculate enhanced statistics about the tokens."""
if not tokens:
return { # Return default structure even for empty input
"basic_stats": {
"total_tokens": 0,
"unique_tokens": 0,
"compression_ratio": 0,
"space_tokens": 0,
"newline_tokens": 0,
"special_tokens": 0,
"punctuation_tokens": 0,
"unique_percentage": 0,
},
"length_stats": {
"avg_length": 0,
"std_dev": 0,
"min_length": 0,
"max_length": 0,
"median_length": 0,
},
}
total_tokens = len(tokens)
unique_tokens = len(set(tokens))
# Handle potential division by zero if total_tokens is 0 (already checked by `if not tokens`)
avg_length = (
sum(len(t) for t in tokens) / total_tokens if total_tokens > 0 else 0
)
# Handle potential division by zero if total_tokens is 0
compression_ratio = len(original_text) / total_tokens if total_tokens > 0 else 0
# Token type analysis (Note: Heuristics might vary between tokenizers)
        # Using startswith(("Ġ", "▁")) covers the common space markers: BPE's 'Ġ' and SentencePiece's '▁' (U+2581)
        space_tokens = sum(1 for t in tokens if t.startswith(("Ġ", "▁")))
# Check for common newline representations
newline_tokens = sum(
1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
)
# A broader definition for special tokens based on common patterns (control tokens)
special_tokens = sum(
1
for t in tokens
if (t.startswith("<") and t.endswith(">"))
or (t.startswith("[") and t.endswith("]"))
)
        # Simple punctuation check (might overlap with other categories; focuses on single-char punctuation)
        punctuation_tokens = sum(
            1
            for t in tokens
            if len(t) == 1 and not t.isalnum() and t not in [" ", "\n", "Ġ", "▁", "Ċ"]
        )
# Length distribution
lengths = [len(t) for t in tokens]
if not lengths: # Should not happen if tokens is not empty, but safe check
return {
"basic_stats": {
"total_tokens": 0,
"unique_tokens": 0,
"compression_ratio": 0,
"space_tokens": 0,
"newline_tokens": 0,
"special_tokens": 0,
"punctuation_tokens": 0,
"unique_percentage": 0,
},
"length_stats": {
"avg_length": 0,
"std_dev": 0,
"min_length": 0,
"max_length": 0,
"median_length": 0,
},
}
mean_length = sum(lengths) / len(lengths)
variance = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
std_dev = math.sqrt(variance)
sorted_lengths = sorted(lengths)
# Handle case where lengths list might be empty after filtering, though unlikely here
median_length = sorted_lengths[len(lengths) // 2] if lengths else 0
return {
"basic_stats": {
"total_tokens": total_tokens,
"unique_tokens": unique_tokens,
"compression_ratio": round(compression_ratio, 2),
"space_tokens": space_tokens,
"newline_tokens": newline_tokens,
"special_tokens": special_tokens,
"punctuation_tokens": punctuation_tokens,
"unique_percentage": round(unique_tokens / total_tokens * 100, 1)
if total_tokens > 0
else 0,
},
"length_stats": {
"avg_length": round(avg_length, 2),
"std_dev": round(std_dev, 2),
"min_length": min(lengths) if lengths else 0,
"max_length": max(lengths) if lengths else 0,
"median_length": median_length,
},
}
return (get_token_stats,)
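@app.cell
def _(get_token_stats):
    # Quick illustration of the returned structure on a hand-made token list
    # (GPT-2-style 'Ġ' space marker assumed): 3 tokens over 12 characters
    # gives a compression ratio of 4.0, one space token, and one punctuation token
    get_token_stats(["Hello", "Ġworld", "!"], "Hello world!")
    return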
@app.cell
def _(hashlib):
def get_varied_color(token: str) -> dict:
"""Generate vibrant colors with HSL for better visual distinction."""
        # MD5 is deterministic, so the same token always maps to the same color across runs
        token_hash = hashlib.md5(token.encode()).hexdigest()
hue = int(token_hash[:3], 16) % 360
saturation = 70 + (int(token_hash[3:5], 16) % 20) # Saturation between 70-90%
lightness = 80 + (
int(token_hash[5:7], 16) % 10
) # Lightness between 80-90% (light background)
# Ensure text color contrasts well with the light background
text_lightness = 20 # Dark text for light background
return {
"background": f"hsl({hue}, {saturation}%, {lightness}%)",
"text": f"hsl({hue}, {saturation}%, {text_lightness}%)",
}
return (get_varied_color,)
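@app.cell
def _(get_varied_color):
    # Determinism check: the hash-based palette assigns each token a stable
    # color, so repeated calls agree (a property the token view relies on)
    get_varied_color("Ġexample")
    return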
@app.function
def fix_token(token: str) -> str:
"""Fix token for display with improved space visualization."""
    # Replace SentencePiece space marker '▁' (U+2581) with a middle dot
    token = token.replace("▁", "·")
# Replace BPE space marker 'Ġ' with a middle dot
if token.startswith("Ġ"):
space_count = token.count("Ġ")
return "·" * space_count + token[space_count:]
# Replace newline markers for display
token = token.replace(
"Ċ", "↵\n"
) # Replace newline marker with symbol and actual newline
token = token.replace("<0x0A>", "↵\n") # Handle byte representation of newline
return token
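@app.cell
def _():
    # fix_token at a glance: '▁'/'Ġ' space markers become '·'; newline
    # markers become '↵' plus a real newline
    [fix_token(t) for t in ["▁Hello", "ĠĠworld", "Ċ", "<0x0A>"]]
    return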
@app.function
def get_tokenizer_info(tokenizer):
"""
Extract useful information from a tokenizer.
Returns a dictionary with tokenizer details.
"""
info = {}
try:
# Get vocabulary size (dictionary size)
if hasattr(tokenizer, "vocab_size"):
info["vocab_size"] = tokenizer.vocab_size
elif hasattr(tokenizer, "get_vocab"):
info["vocab_size"] = len(tokenizer.get_vocab())
# Get model max length if available
if (
hasattr(tokenizer, "model_max_length")
and tokenizer.model_max_length < 1000000
): # Sanity check for realistic values
info["model_max_length"] = tokenizer.model_max_length
else:
info["model_max_length"] = "Not specified or very large"
# Check tokenizer type
info["tokenizer_type"] = tokenizer.__class__.__name__
# Get special tokens using the recommended attributes/methods
special_tokens = {}
# Prefer all_special_tokens if available
if hasattr(tokenizer, "all_special_tokens"):
for token in tokenizer.all_special_tokens:
# Try to find the attribute name corresponding to the token value
token_name = "unknown_special_token" # Default name
for attr_name in [
"pad_token",
"eos_token",
"bos_token",
"sep_token",
"cls_token",
"unk_token",
"mask_token",
]:
if (
hasattr(tokenizer, attr_name)
and getattr(tokenizer, attr_name) == token
):
token_name = attr_name
break
if token and str(token).strip():
special_tokens[token_name] = str(token)
else:
# Fallback to checking individual attributes
for token_name in [
"pad_token",
"eos_token",
"bos_token",
"sep_token",
"cls_token",
"unk_token",
"mask_token",
]:
if (
hasattr(tokenizer, token_name)
and getattr(tokenizer, token_name) is not None
):
token_value = getattr(tokenizer, token_name)
if token_value and str(token_value).strip():
special_tokens[token_name] = str(token_value)
info["special_tokens"] = special_tokens if special_tokens else "None found"
except Exception as e:
info["error"] = f"Error extracting tokenizer info: {str(e)}"
return info
@app.cell
def _(mo):
show_ids_switch = mo.ui.switch(label="Show Token IDs instead of Text", value=False)
return (show_ids_switch,)
@app.cell
def _(
    current_text,
    get_token_stats,
    get_varied_color,
    html,
    llm_tokenizer_selector,
    mo,
    show_ids_switch,
    tokenizer,
):
# --- Tokenization and Data Preparation ---
# Get tokenizer metadata
tokenizer_info = get_tokenizer_info(tokenizer)
# Tokenize the input text
# Use tokenize to get string representations for analysis and display
all_tokens = tokenizer.tokenize(current_text)
total_token_count = len(all_tokens)
# Limit the number of tokens for display to avoid browser slowdown
display_limit = 1000
display_tokens = all_tokens[:display_limit]
display_limit_reached = total_token_count > display_limit
# Generate data for visualization
llm_token_data = []
for idx, token in enumerate(display_tokens):
colors = get_varied_color(token)
fixed_token_display = fix_token(token) # Apply fixes for display
# Handle potential errors during ID conversion (e.g., unknown tokens if not handled by tokenizer)
try:
token_id = tokenizer.convert_tokens_to_ids(token)
except KeyError:
token_id = (
tokenizer.unk_token_id if hasattr(tokenizer, "unk_token_id") else -1
) # Use UNK id or -1
llm_token_data.append(
{
"original": token,
"display": fixed_token_display,
"colors": colors,
"is_newline": "↵"
in fixed_token_display, # Check if it represents a newline
"token_id": token_id,
"token_index": idx,
}
)
# Calculate statistics using the full token list
token_stats = get_token_stats(all_tokens, current_text)
    # Construct HTML for colored tokens
    html_parts = []
    for item in llm_token_data:
        # Use pre-wrap to respect spaces and newlines within the token display
        style = f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
        # Escape token text so markup-like tokens (e.g. "<s>") render literally
        title = html.escape(f"Original: {item['original']}\nID: {item['token_id']}")
        display_content = (
            str(item["token_id"])
            if show_ids_switch.value
            else html.escape(item["display"])
        )
        html_parts.append(
            f'<span style="{style}" title="{title}">{display_content}</span>'
        )
token_viz_html = mo.Html(
f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>'
)
    basic_stats = token_stats["basic_stats"]
    length_stats = token_stats["length_stats"]
    # Note when the token view was truncated (display_limit_reached is computed above)
    limit_note = (
        f"*Showing the first {display_limit:,} of {total_token_count:,} tokens.*"
        if display_limit_reached
        else ""
    )
    # Render the tokenizer metadata collected above alongside the statistics
    tokenizer_info_md = "**Tokenizer Info:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in tokenizer_info.items()
    )
    basic_stats_md = "**Basic Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in basic_stats.items()
    )
    length_stats_md = "**Length (Character) Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in length_stats.items()
    )
    mo.md(f"""# LLM tokenizer: {llm_tokenizer_selector.value}

{show_ids_switch}

## Tokenizer output

{mo.as_html(token_viz_html)}

{limit_note}

## Tokenizer Info

{tokenizer_info_md}

## Token Statistics

{basic_stats_md}

{length_stats_md}
""")
return
@app.cell
def _():
return
if __name__ == "__main__":
app.run()