# Space: bor/counting_words (status: Running) - File size: 18,881 Bytes

# /// script
# [tool.marimo.runtime]
# auto_instantiate = false
# ///

import marimo

# Version string recorded by the tool that generated this notebook file.
__generated_with = "0.13.0"
# The notebook application object; every @app.cell / @app.function below is
# attached to it, and it is what gets run at the bottom of the file.
app = marimo.App(width="medium")


@app.cell
def _():
    # Setup cell: imports the libraries used by the notebook, loads the two
    # spaCy pipelines, and defines the list of selectable tokenizer models.
    # The names in the returned tuple are the ones other cells receive as
    # parameters.
    import hashlib
    import math

    import altair as alt
    import marimo as mo
    import polars as pl
    import spacy
    from transformers import AutoTokenizer

    # Load spaCy models for English and Japanese
    nlp_en = spacy.load("en_core_web_md")
    nlp_ja = spacy.load("ja_core_news_md")

    # List of tokenizer models
    # (the last entry is used as the default selection by the dropdown cell)
    llm_model_choices = [
        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "google/gemma-3-27b-it",
        "deepseek-ai/DeepSeek-R1",
        "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        "Qwen/Qwen2.5-72B-Instruct",
        "google-bert/bert-large-uncased",
        "openai-community/gpt2",
    ]

    return (
        AutoTokenizer,
        alt,
        hashlib,
        llm_model_choices,
        math,
        mo,
        nlp_en,
        nlp_ja,
        pl,
    )


@app.cell
def _(mo):
    # Notebook title, rendered as a level-1 Markdown heading.
    mo.md("# Tokenization for English and Japanese")
    return


@app.cell
def _(mo):
    # Central state for the text input content
    # `mo.state("")` returns a getter/setter pair, initialised here to the
    # empty string. The text area cell reads and writes it, the placeholder
    # button writes it, and the spaCy analysis cell reads it.
    get_text_content, set_text_content = mo.state("")
    return get_text_content, set_text_content


@app.cell
def _(mo):
    # Defines one sample text per language plus the language radio selector.
    # Placeholder texts
    # `.strip()` removes the leading/trailing newline and indentation that the
    # triple-quoted layout would otherwise add.
    en_placeholder = """
    Mrs. Ferrars died on the night of the 16th⁠–⁠17th September⁠—a Thursday. I was sent for at eight o’clock on the morning of Friday the 17th. There was nothing to be done. She had been dead some hours.
    """.strip()
    ja_placeholder = """
    吾輩は猫である。名前はまだ無い。
    　どこで生れたかとんと見当がつかぬ。何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している。
    """.strip()

    # Create UI element for language selection
    # (defaults to English; other cells compare `.value` against "English")
    language_selector = mo.ui.radio(
        options=["English", "Japanese"], value="English", label="Language"
    )

    # Return selector and placeholders
    return en_placeholder, ja_placeholder, language_selector


@app.cell
def _(
    en_placeholder,
    get_text_content,
    ja_placeholder,
    language_selector,
    mo,
    set_text_content,
):
    # Builds the text area. Its placeholder follows the selected language and
    # its value is kept in the shared text state.
    # Define text_input dynamically based on language
    current_placeholder = (
        en_placeholder if language_selector.value == "English" else ja_placeholder
    )
    text_input = mo.ui.text_area(
        # Read value from state
        value=get_text_content(),
        label="Enter text",
        placeholder=current_placeholder,
        full_width=True,
        # Update state on user input
        on_change=lambda v: set_text_content(v),
    )
    return current_placeholder, text_input


@app.cell
def _(current_placeholder, mo, set_text_content):
    # Button that copies the current language's placeholder text into the
    # shared text state.
    def apply_placeholder():
        # Overwrite the stored text with the placeholder of the active language.
        set_text_content(current_placeholder)

    # The click callback's argument is ignored; only the side effect matters.
    apply_placeholder_button = mo.ui.button(
        label="Use Placeholder Text", on_click=lambda _: apply_placeholder()
    )
    return (apply_placeholder_button,)


@app.cell
def _(apply_placeholder_button, language_selector, mo, text_input):
    # Input layout: the text area on top, with the language selector and the
    # placeholder button side by side (left-aligned) underneath.
    mo.vstack(
        [
            text_input,
            mo.hstack([language_selector, apply_placeholder_button], justify="start"),
        ]
    )
    return


@app.cell
def _(get_text_content, language_selector, mo, nlp_en, nlp_ja):
    # Analyze text using spaCy based on selected language
    # Read text from state
    current_text = get_text_content()
    # Anything other than "English" is processed with the Japanese pipeline.
    if language_selector.value == "English":
        doc = nlp_en(current_text)
    else:
        doc = nlp_ja(current_text)

    # Tokenized version and count
    tokenized_text = [token.text for token in doc]
    token_count = len(tokenized_text)

    # Cell output: the tokens joined with " | " and the total token count.
    mo.md(
        f"**Tokenized Text:** {' | '.join(tokenized_text)}\n\n**Token Count:** {token_count}"
    )
    # `current_text` and `doc` are passed on to the table and LLM tokenizer cells.
    return current_text, doc


@app.cell
def _(doc, mo, pl):
    # Create a polars DataFrame with token attributes
    # One row per spaCy token; every column is built by iterating `doc`.
    token_data = pl.DataFrame(
        {
            "Token": [token.text for token in doc],
            "Lemma": [token.lemma_ for token in doc],
            "POS": [token.pos_ for token in doc],
            "Tag": [token.tag_ for token in doc],
            "Morph": [
                str(token.morph) for token in doc
            ],  # To be more precise, this should be merged back in via .to_dict()
            "Token Position": list(range(len(doc))),
            # Emits the sentence index once per token of that sentence.
            # NOTE(review): this column only lines up with the others if the
            # sentences together cover every token of `doc` - confirm.
            "Sentence Number": [
                i for i, sent in enumerate(doc.sents) for token in sent
            ],
        }
    )

    # Cell output: the table shown through marimo's dataframe widget, 50 rows per page.
    mo.ui.dataframe(token_data, page_size=50)
    return (token_data,)


@app.cell
def _(mo):
    # Create UI element for selecting the column to visualize
    # The options are column names of the token table; "POS" is the default.
    column_selector = mo.ui.dropdown(
        options=["POS", "Tag", "Lemma", "Token", "Morph"],
        value="POS",
        label="Select column to visualize",
    )

    # Bare expression: this is what the cell displays.
    column_selector
    return (column_selector,)


@app.cell
def _(alt, column_selector, mo, token_data):
    # Bar chart of how often each value of the selected column occurs.
    # Halt the cell with a hint when there are no tokens to count.
    mo.stop(token_data.is_empty(), "Please set input text.")

    selected_column = column_selector.value
    # Calculate value counts for the selected column
    # Sorted by descending count, ties broken by ascending column value.
    counts_df = (
        token_data[selected_column]
        .value_counts()
        .sort(by=["count", selected_column], descending=[True, False])
    )

    # `sort=None` on the y axis keeps the row order of `counts_df` instead of
    # letting the chart re-sort the categories.
    chart = (
        alt.Chart(counts_df)
        .mark_bar()
        .encode(
            x=alt.X("count", title="Frequency"),
            y=alt.Y(selected_column, title=selected_column, sort=None),
            tooltip=[selected_column, "count"],
        )
        .properties(title=f"{selected_column} Distribution")
        .interactive()
    )
    # Cell output: the chart wrapped in marimo's Altair widget.
    mo.ui.altair_chart(chart)
    return


@app.cell
def _(llm_model_choices, mo):
    # UI for selecting the LLM tokenizer model
    llm_tokenizer_selector = mo.ui.dropdown(
        options=llm_model_choices,
        value=llm_model_choices[-1],  # Default to gpt2 for faster loading initially
        label="Select LLM Tokenizer Model",
    )
    # Bare expression: this is what the cell displays.
    llm_tokenizer_selector
    return (llm_tokenizer_selector,)


@app.cell
def _(AutoTokenizer, llm_tokenizer_selector):
    # Load the selected tokenizer
    # Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
    # This cell will re-run when llm_tokenizer_selector.value changes
    # NOTE(review): an earlier comment here said marimo caches the result
    # implicitly; nothing in this cell sets up caching, so confirm whether
    # `from_pretrained` is invoked again on every re-run.
    selected_model_name = llm_tokenizer_selector.value
    tokenizer = AutoTokenizer.from_pretrained(selected_model_name)
    return (tokenizer,)


@app.cell
def _(math):
    # Function to calculate token statistics
    def get_token_stats(tokens: list, original_text: str) -> dict:
        """Calculate summary statistics about a list of token strings.

        Args:
            tokens: Token strings, e.g. the output of ``tokenizer.tokenize``.
            original_text: The text the tokens came from. Only its character
                length is used, for the compression ratio.

        Returns:
            A dict with two sub-dicts:

            * ``basic_stats``: ``total_tokens``, ``unique_tokens``,
              ``compression_ratio`` (characters per token), ``space_tokens``,
              ``newline_tokens``, ``special_tokens``, ``punctuation_tokens``
              and ``unique_percentage``.
            * ``length_stats``: ``avg_length``, ``std_dev`` (population
              standard deviation), ``min_length``, ``max_length`` and
              ``median_length`` of the token lengths in characters.

            Every value is ``0`` when ``tokens`` is empty. The token-type
            counts are heuristics and may overlap.
        """
        if not tokens:
            # Same structure as the normal result so callers can render it
            # without special-casing empty input.
            return {
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
                    "compression_ratio": 0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
                    "unique_percentage": 0,
                },
                "length_stats": {
                    "avg_length": 0,
                    "std_dev": 0,
                    "min_length": 0,
                    "max_length": 0,
                    "median_length": 0,
                },
            }

        # From here on `tokens` is non-empty, so no division can hit zero.
        total_tokens = len(tokens)
        unique_tokens = len(set(tokens))
        lengths = [len(t) for t in tokens]
        avg_length = sum(lengths) / total_tokens
        compression_ratio = len(original_text) / total_tokens

        # Token type analysis (Note: Heuristics might vary between tokenizers)
        # Space markers: byte-level BPE's 'Ġ', SentencePiece's U+2581 (written
        # as an escape so it cannot be confused with an ordinary space), and a
        # literal leading space.
        space_markers = ("Ġ", "\u2581", " ")
        space_tokens = sum(1 for t in tokens if t.startswith(space_markers))
        # Common newline representations: BPE's 'Ċ', a raw newline, and the
        # byte-fallback token for 0x0A.
        newline_tokens = sum(
            1 for t in tokens if "Ċ" in t or t in ("\n", "<0x0A>")
        )
        # Control-style tokens such as <s>, <|endoftext|>, [CLS].
        special_tokens = sum(
            1
            for t in tokens
            if (t.startswith("<") and t.endswith(">"))
            or (t.startswith("[") and t.endswith("]"))
        )
        # Single-character, non-alphanumeric tokens that are not themselves a
        # whitespace or whitespace-marker character.
        not_punctuation = (" ", "\n", "Ġ", "Ċ", "\u2581")
        punctuation_tokens = sum(
            1
            for t in tokens
            if len(t) == 1 and not t.isalnum() and t not in not_punctuation
        )

        # Length distribution (population variance: divide by N, not N - 1).
        variance = sum((x - avg_length) ** 2 for x in lengths) / total_tokens
        std_dev = math.sqrt(variance)
        sorted_lengths = sorted(lengths)
        middle = total_tokens // 2
        if total_tokens % 2:
            median_length = sorted_lengths[middle]
        else:
            # Even count: the median is the mean of the two middle values,
            # not the upper one.
            median_length = (sorted_lengths[middle - 1] + sorted_lengths[middle]) / 2

        return {
            "basic_stats": {
                "total_tokens": total_tokens,
                "unique_tokens": unique_tokens,
                "compression_ratio": round(compression_ratio, 2),
                "space_tokens": space_tokens,
                "newline_tokens": newline_tokens,
                "special_tokens": special_tokens,
                "punctuation_tokens": punctuation_tokens,
                "unique_percentage": round(unique_tokens / total_tokens * 100, 1),
            },
            "length_stats": {
                "avg_length": round(avg_length, 2),
                "std_dev": round(std_dev, 2),
                "min_length": sorted_lengths[0],
                "max_length": sorted_lengths[-1],
                "median_length": median_length,
            },
        }

    return (get_token_stats,)


@app.cell
def _(hashlib):
    def get_varied_color(token: str) -> dict:
        """Derive a light background / dark text HSL colour pair from a token.

        The colours come from the MD5 hex digest of the token, so a given
        token always maps to the same pair.

        Args:
            token: The token string to colour.

        Returns:
            A dict with ``"background"`` and ``"text"`` CSS ``hsl(...)``
            strings sharing one hue and saturation.
        """
        digest = hashlib.md5(token.encode()).hexdigest()

        def channel(start: int, stop: int, modulus: int) -> int:
            # Read a slice of the hex digest as an integer, folded into range.
            return int(digest[start:stop], 16) % modulus

        hue = channel(0, 3, 360)  # 0-359 degrees
        saturation = 70 + channel(3, 5, 20)  # 70-89 %
        lightness = 80 + channel(5, 7, 10)  # 80-89 %, i.e. a light background
        text_lightness = 20  # dark text so it contrasts with the background

        background = f"hsl({hue}, {saturation}%, {lightness}%)"
        text = f"hsl({hue}, {saturation}%, {text_lightness}%)"
        return {"background": background, "text": text}

    return (get_varied_color,)


@app.function
def fix_token(token: str) -> str:
    """Rewrite a raw tokenizer token so whitespace is visible on screen.

    Space markers become a middle dot and newline markers become a return
    symbol followed by a real newline. All markers are replaced wherever they
    occur in the token, so a token mixing space and newline markers (for
    example ``"ĠĊ"``) has both rewritten.

    Args:
        token: A token string as returned by ``tokenizer.tokenize``.

    Returns:
        The token with space and newline markers replaced for display.
    """
    # Space markers: SentencePiece's U+2581 (written as an escape so it cannot
    # be mistaken for an ordinary space), a literal space, and byte-level
    # BPE's 'Ġ'.
    for space_marker in ("\u2581", " ", "Ġ"):
        token = token.replace(space_marker, "·")
    # Newline markers: byte-level BPE's 'Ċ' and the byte-fallback token for 0x0A.
    for newline_marker in ("Ċ", "<0x0A>"):
        token = token.replace(newline_marker, "↵\n")
    return token


@app.function
def get_tokenizer_info(tokenizer):
    """
    Extract useful information from a tokenizer.

    Args:
        tokenizer: A tokenizer object. Attributes are probed with
            ``hasattr``/``getattr``, so missing ones are tolerated.

    Returns:
        A dictionary with tokenizer details:

        * ``vocab_size``: from ``tokenizer.vocab_size`` or, failing that,
          ``len(tokenizer.get_vocab())``; absent if neither exists.
        * ``model_max_length``: the tokenizer's value when it is below
          1,000,000, otherwise ``"Not specified or very large"``.
        * ``tokenizer_type``: the tokenizer's class name.
        * ``special_tokens``: a dict mapping role names (``pad_token``,
          ``eos_token``, ...) to token strings, or ``"None found"``. A token
          that fills several roles is listed under each of them. Tokens
          without a known role are stored as ``unknown_special_token``,
          ``unknown_special_token_2``, ... so none of them is overwritten.
        * ``error``: present only if extraction raised; holds the message.
    """
    # Attribute names checked for special tokens, in probing order.
    role_attrs = (
        "pad_token",
        "eos_token",
        "bos_token",
        "sep_token",
        "cls_token",
        "unk_token",
        "mask_token",
    )

    info = {}
    try:
        # Get vocabulary size (dictionary size)
        if hasattr(tokenizer, "vocab_size"):
            info["vocab_size"] = tokenizer.vocab_size
        elif hasattr(tokenizer, "get_vocab"):
            info["vocab_size"] = len(tokenizer.get_vocab())

        # Get model max length if available
        if (
            hasattr(tokenizer, "model_max_length")
            and tokenizer.model_max_length < 1000000
        ):  # Sanity check for realistic values
            info["model_max_length"] = tokenizer.model_max_length
        else:
            info["model_max_length"] = "Not specified or very large"

        # Check tokenizer type
        info["tokenizer_type"] = tokenizer.__class__.__name__

        special_tokens = {}
        if hasattr(tokenizer, "all_special_tokens"):
            # Prefer the full list, and label each token with every role
            # attribute whose value equals it.
            unnamed_count = 0
            for token in tokenizer.all_special_tokens:
                if not (token and str(token).strip()):
                    continue  # skip None / empty / whitespace-only entries
                roles = [
                    attr_name
                    for attr_name in role_attrs
                    if getattr(tokenizer, attr_name, None) == token
                ]
                if roles:
                    for role in roles:
                        special_tokens[role] = str(token)
                    continue
                # No known role: give each such token its own key. The first
                # keeps the bare name, later ones get a numeric suffix.
                unnamed_count += 1
                key = (
                    "unknown_special_token"
                    if unnamed_count == 1
                    else f"unknown_special_token_{unnamed_count}"
                )
                special_tokens[key] = str(token)
        else:
            # Fallback to checking individual attributes
            for token_name in role_attrs:
                token_value = getattr(tokenizer, token_name, None)
                if token_value and str(token_value).strip():
                    special_tokens[token_name] = str(token_value)

        info["special_tokens"] = special_tokens if special_tokens else "None found"

    except Exception as e:
        # Deliberately best-effort: report the failure in the result rather
        # than raising into the caller.
        info["error"] = f"Error extracting tokenizer info: {str(e)}"

    return info


@app.cell
def _(mo):
    # Toggle, off by default, read by the LLM tokenizer cell to choose between
    # showing token text and showing token IDs.
    show_ids_switch = mo.ui.switch(label="Show Token IDs instead of Text", value=False)
    return (show_ids_switch,)


@app.cell
def _(
    current_text,
    get_token_stats,
    get_varied_color,
    llm_tokenizer_selector,
    mo,
    show_ids_switch,
    tokenizer,
):
    # Tokenizes the current text with the selected LLM tokenizer and renders
    # the tokens as coloured chips followed by summary statistics.

    # Token text comes from user input and is placed inside HTML, so it must
    # be escaped before interpolation.
    import html

    # --- Tokenization and Data Preparation ---

    # Get tokenizer metadata
    # (computed here but not rendered by this cell)
    tokenizer_info = get_tokenizer_info(tokenizer)

    # Tokenize the input text
    # Use tokenize to get string representations for analysis and display
    all_tokens = tokenizer.tokenize(current_text)
    total_token_count = len(all_tokens)

    # Limit the number of tokens for display to avoid browser slowdown
    display_limit = 1000
    display_tokens = all_tokens[:display_limit]
    display_limit_reached = total_token_count > display_limit

    # Generate data for visualization
    llm_token_data = []
    for idx, token in enumerate(display_tokens):
        colors = get_varied_color(token)
        fixed_token_display = fix_token(token)  # Apply fixes for display
        # Handle potential errors during ID conversion (e.g., unknown tokens if not handled by tokenizer)
        try:
            token_id = tokenizer.convert_tokens_to_ids(token)
        except KeyError:
            token_id = (
                tokenizer.unk_token_id if hasattr(tokenizer, "unk_token_id") else -1
            )  # Use UNK id or -1

        llm_token_data.append(
            {
                "original": token,
                "display": fixed_token_display,
                "colors": colors,
                "is_newline": "↵"
                in fixed_token_display,  # Check if it represents a newline
                "token_id": token_id,
                "token_index": idx,
            }
        )

    # Calculate statistics using the full token list
    token_stats = get_token_stats(all_tokens, current_text)

    # Construct HTML for colored tokens
    html_parts = []
    for item in llm_token_data:
        # Use pre-wrap to respect spaces and newlines within the token display
        style = f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
        # Add title attribute for hover info (original token + ID).
        # Escaped (quotes included) so characters such as '"', '<' or '&' in a
        # token cannot terminate the attribute or open a tag.
        title = html.escape(f"Original: {item['original']}\nID: {item['token_id']}")
        display_content = html.escape(
            str(item["token_id"]) if show_ids_switch.value else item["display"]
        )
        html_parts.append(
            f'<span style="{style}" title="{title}">{display_content}</span>'
        )

    # Tell the reader when only a prefix of the tokens is shown; the
    # statistics below still cover the full token list.
    truncation_notice = (
        f"<p><em>Showing the first {display_limit} of {total_token_count} tokens.</em></p>"
        if display_limit_reached
        else ""
    )

    token_viz_html = mo.Html(
        f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>{truncation_notice}'
    )

    basic_stats = token_stats["basic_stats"]
    length_stats = token_stats["length_stats"]

    # Turn each stats dict into a Markdown bullet list ("Total Tokens: `12`").
    basic_stats_md = "**Basic Stats:**\n\n" + "\n".join(
        f"-   **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in basic_stats.items()
    )

    length_stats_md = "**Length (Character) Stats:**\n\n" + "\n".join(
        f"-   **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in length_stats.items()
    )

    # Cell output: must stay the last expression so it is what gets rendered.
    mo.md(f"""# LLM tokenizer: {llm_tokenizer_selector.value}

    {show_ids_switch}

    ## Tokenizer output

    {mo.as_html(token_viz_html)}

    ## Token Statistics

    {basic_stats_md}

    {length_stats_md}

    """)
    return


@app.cell
def _():
    # Empty cell: defines nothing and returns nothing.
    return


# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()