Spaces:

bor
/

counting_words

Running

App Files Files Community

Bor Hodošček committed 7 days ago

Commit

4d52104

unverified ·

1 Parent(s): 75e7ffd

feat: fix dockerfile

Browse files

Files changed (4) hide show

Dockerfile +1 -1
app.py +488 -383
pyproject.toml +19 -0
requirements.txt +0 -5

Dockerfile CHANGED Viewed

@@ -13,4 +13,4 @@ RUN uv sync
 COPY --chown=user . /app
 USER user
-CMD ["marimo", "run", "app.py", "--include-code", "--host", "0.0.0.0", "--port", "7860"]

 COPY --chown=user . /app
 USER user
+CMD ["uv", "run", "marimo", "run", "app.py", "--include-code", "--host", "0.0.0.0", "--port", "7860"]

app.py CHANGED Viewed

@@ -1,470 +1,575 @@
 import marimo
-__generated_with = "0.9.2"
-app = marimo.App()
 @app.cell
-def __():
     import marimo as mo
-    mo.md("# Welcome to marimo! 🌊🍃")
-    return (mo,)
 @app.cell
-def __(mo):
-    slider = mo.ui.slider(1, 22)
-    return (slider,)
 @app.cell
-def __(mo, slider):
-    mo.md(
-        f"""
-        marimo is a **reactive** Python notebook.
-        This means that unlike traditional notebooks, marimo notebooks **run
-        automatically** when you modify them or
-        interact with UI elements, like this slider: {slider}.
-        {"##" + "🍃" * slider.value}
-        """
     )
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.accordion(
-        {
-            "Tip: disabling automatic execution": mo.md(
-                rf"""
-            marimo lets you disable automatic execution: just go into the
-            notebook settings and set
-            "Runtime > On Cell Change" to "lazy".
-            When the runtime is lazy, after running a cell, marimo marks its
-            descendants as stale instead of automatically running them. The
-            lazy runtime puts you in control over when cells are run, while
-            still giving guarantees about the notebook state.
-            """
-            )
-        }
-    )
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        Tip: This is a tutorial notebook. You can create your own notebooks
-        by entering `marimo edit` at the command line.
-        """
-    ).callout()
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        ## 1. Reactive execution
-        A marimo notebook is made up of small blocks of Python code called
-        cells.
-        marimo reads your cells and models the dependencies among them: whenever
-        a cell that defines a global variable  is run, marimo
-        **automatically runs** all cells that reference that variable.
-        Reactivity keeps your program state and outputs in sync with your code,
-        making for a dynamic programming environment that prevents bugs before they
-        happen.
-        """
     )
-    return
-@app.cell(hide_code=True)
-def __(changed, mo):
-    (
-        mo.md(
-            f"""
-            **✨ Nice!** The value of `changed` is now {changed}.
-            When you updated the value of the variable `changed`, marimo
-            **reacted** by running this cell automatically, because this cell
-            references the global variable `changed`.
-            Reactivity ensures that your notebook state is always
-            consistent, which is crucial for doing good science; it's also what
-            enables marimo notebooks to double as tools and  apps.
-            """
-        )
-        if changed
-        else mo.md(
-            """
-            **🌊 See it in action.** In the next cell, change the value of the
-            variable  `changed` to `True`, then click the run button.
-            """
-        )
     )
-    return
 @app.cell
-def __():
-    changed = False
-    return (changed,)
-@app.cell(hide_code=True)
-def __(mo):
-    mo.accordion(
-        {
-            "Tip: execution order": (
-                """
-                The order of cells on the page has no bearing on
-                the order in which cells are executed: marimo knows that a cell
-                reading a variable must run after the cell that  defines it. This
-                frees you to organize your code in the way that makes the most
-                sense for you.
-                """
-            )
-        }
     )
     return
-@app.cell(hide_code=True)
-def __(mo):
     mo.md(
-        """
-        **Global names must be unique.** To enable reactivity, marimo imposes a
-        constraint on how names appear in cells: no two cells may define the same
-        variable.
-        """
     )
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.accordion(
         {
-            "Tip: encapsulation": (
-                """
-                By encapsulating logic in functions, classes, or Python modules,
-                you can minimize the number of global variables in your notebook.
-                """
-            )
         }
     )
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.accordion(
-        {
-            "Tip: private variables": (
-                """
-                Variables prefixed with an underscore are "private" to a cell, so
-                they can be defined by multiple cells.
-                """
-            )
-        }
     )
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        ## 2. UI elements
-        Cells can output interactive UI elements. Interacting with a UI
-        element **automatically triggers notebook execution**: when
-        you interact with a UI element, its value is sent back to Python, and
-        every cell that references that element is re-run.
-        marimo provides a library of UI elements to choose from under
-        `marimo.ui`.
-        """
     )
     return
 @app.cell
-def __(mo):
-    mo.md("""**🌊 Some UI elements.** Try interacting with the below elements.""")
-    return
 @app.cell
-def __(mo):
-    icon = mo.ui.dropdown(["🍃", "🌊", "✨"], value="🍃")
-    return (icon,)
 @app.cell
-def __(icon, mo):
-    repetitions = mo.ui.slider(1, 16, label=f"number of {icon.value}: ")
-    return (repetitions,)
-@app.cell
-def __(icon, repetitions):
-    icon, repetitions
-    return
 @app.cell
-def __(icon, mo, repetitions):
-    mo.md("# " + icon.value * repetitions.value)
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        ## 3. marimo is just Python
-        marimo cells parse Python (and only Python), and marimo notebooks are
-        stored as pure Python files — outputs are _not_ included. There's no
-        magical syntax.
-        The Python files generated by marimo are:
-        - easily versioned with git, yielding minimal diffs
-        - legible for both humans and machines
-        - formattable using your tool of choice,
-        - usable as Python  scripts, with UI  elements taking their default
-        values, and
-        - importable by other modules (more on that in the future).
-        """
-    )
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        ## 4. Running notebooks as apps
-        marimo notebooks can double as apps. Click the app window icon in the
-        bottom-right to see this notebook in "app view."
-        Serve a notebook as an app with `marimo run` at the command-line.
-        Of course, you can use marimo just to level-up your
-        notebooking, without ever making apps.
-        """
-    )
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        ## 5. The `marimo` command-line tool
-        **Creating and editing notebooks.** Use
-        ```
-        marimo edit
-        ```
-        in a terminal to start the marimo notebook server. From here
-        you can create a new notebook or edit existing ones.
-        **Running as apps.** Use
-        ```
-        marimo run notebook.py
-        ```
-        to start a webserver that serves your notebook as an app in read-only mode,
-        with code cells hidden.
-        **Convert a Jupyter notebook.** Convert a Jupyter notebook to a marimo
-        notebook using `marimo convert`:
-        ```
-        marimo convert your_notebook.ipynb > your_app.py
-        ```
-        **Tutorials.** marimo comes packaged with tutorials:
-        - `dataflow`: more on marimo's automatic execution
-        - `ui`: how to use UI elements
-        - `markdown`: how to write markdown, with interpolated values and
-           LaTeX
-        - `plots`: how plotting works in marimo
-        - `sql`: how to use SQL
-        - `layout`: layout elements in marimo
-        - `fileformat`: how marimo's file format works
-        - `markdown-format`: for using `.md` files in marimo
-        - `for-jupyter-users`: if you are coming from Jupyter
-        Start a tutorial with `marimo tutorial`; for example,
-        ```
-        marimo tutorial dataflow
-        ```
-        In addition to tutorials, we have examples in our
-        [our GitHub repo](https://www.github.com/marimo-team/marimo/tree/main/examples).
-        """
-    )
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        ## 6. The marimo editor
-        Here are some tips to help you get started with the marimo editor.
-        """
-    )
     return
 @app.cell
-def __(mo, tips):
-    mo.accordion(tips)
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.md("""## Finally, a fun fact""")
     return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        The name "marimo" is a reference to a type of algae that, under
-        the right conditions, clumps together to form a small sphere
-        called a "marimo moss ball". Made of just strands of algae, these
-        beloved assemblages are greater than the sum of their parts.
-        """
-    )
-    return
-@app.cell(hide_code=True)
-def __():
-    tips = {
-        "Saving": (
-            """
-            **Saving**
-            - _Name_ your app using the box at the top of the screen, or
-              with `Ctrl/Cmd+s`. You can also create a named app at the
-              command line, e.g., `marimo edit app_name.py`.
-            - _Save_ by clicking the save icon on the bottom right, or by
-              inputting `Ctrl/Cmd+s`. By default marimo is configured
-              to autosave.
-            """
-        ),
-        "Running": (
-            """
-            1. _Run a cell_ by clicking the play ( ▷ ) button on the top
-            right of a cell, or by inputting `Ctrl/Cmd+Enter`.
-            2. _Run a stale cell_  by clicking the yellow run button on the
-            right of the cell, or by inputting `Ctrl/Cmd+Enter`. A cell is
-            stale when its code has been modified but not run.
-            3. _Run all stale cells_ by clicking the play ( ▷ ) button on
-            the bottom right of the screen, or input `Ctrl/Cmd+Shift+r`.
-            """
-        ),
-        "Console Output": (
-            """
-            Console output (e.g., `print()` statements) is shown below a
-            cell.
-            """
-        ),
-        "Creating, Moving, and Deleting Cells": (
-            """
-            1. _Create_ a new cell above or below a given one by clicking
-                the plus button to the left of the cell, which appears on
-                mouse hover.
-            2. _Move_ a cell up or down by dragging on the handle to the
-                right of the cell, which appears on mouse hover.
-            3. _Delete_ a cell by clicking the trash bin icon. Bring it
-                back by clicking the undo button on the bottom right of the
-                screen, or with `Ctrl/Cmd+Shift+z`.
-            """
-        ),
-        "Disabling Automatic Execution": (
-            """
-            Via the notebook settings (gear icon) or footer panel, you
-            can disable automatic execution. This is helpful when
-            working with expensive notebooks or notebooks that have
-            side-effects like database transactions.
-            """
-        ),
-        "Disabling Cells": (
-            """
-            You can disable a cell via the cell context menu.
-            marimo will never run a disabled cell or any cells that depend on it.
-            This can help prevent accidental execution of expensive computations
-            when editing a notebook.
-            """
-        ),
-        "Code Folding": (
-            """
-            You can collapse or fold the code in a cell by clicking the arrow
-            icons in the line number column to the left, or by using keyboard
-            shortcuts.
-            Use the command palette (`Ctrl/Cmd+k`) or a keyboard shortcut to
-            quickly fold or unfold all cells.
-            """
-        ),
-        "Code Formatting": (
-            """
-            If you have [ruff](https://github.com/astral-sh/ruff) installed,
-            you can format a cell with the keyboard shortcut `Ctrl/Cmd+b`.
-            """
-        ),
-        "Command Palette": (
-            """
-            Use `Ctrl/Cmd+k` to open the command palette.
-            """
-        ),
-        "Keyboard Shortcuts": (
-            """
-            Open the notebook menu (top-right) or input `Ctrl/Cmd+Shift+h` to
-            view a list of all keyboard shortcuts.
-            """
-        ),
-        "Configuration": (
-            """
-           Configure the editor by clicking the gears icon near the top-right
-           of the screen.
-           """
-        ),
-    }
-    return (tips,)
 if __name__ == "__main__":
     app.run()

+# /// script
+# [tool.marimo.runtime]
+# auto_instantiate = false
+# ///
 import marimo
+__generated_with = "0.13.0"
+app = marimo.App(width="medium")
 @app.cell
+def _():
+    # Setup cell: imports the libraries used by the notebook, loads the two
+    # spaCy pipelines once, and defines the list of selectable tokenizer models.
     import marimo as mo
+    import spacy
+    import polars as pl
+    import altair as alt
+    from transformers import AutoTokenizer
+    import math
+    import hashlib
+    # Load spaCy models for English and Japanese
+    nlp_en = spacy.load("en_core_web_md")
+    nlp_ja = spacy.load("ja_core_news_md")
+    # Model identifiers passed to AutoTokenizer.from_pretrained by a later cell
+    llm_model_choices = [
+        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+        "google/gemma-3-27b-it",
+        "deepseek-ai/DeepSeek-R1",
+        "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
+        "Qwen/Qwen2.5-72B-Instruct",
+        "google-bert/bert-large-uncased",
+        "openai-community/gpt2",
+    ]
+    # `spacy` itself is not returned; only the loaded pipelines are shared
+    return (
+        AutoTokenizer,
+        alt,
+        hashlib,
+        llm_model_choices,
+        math,
+        mo,
+        nlp_en,
+        nlp_ja,
+        pl,
+    )
 @app.cell
+def _(mo):
+    # Notebook title, rendered as a level-1 markdown heading
+    mo.md("# Tokenization for English and Japanese")
+    return
 @app.cell
+def _(mo):
+    # Central state for the text input content.
+    # mo.state("") yields a getter/setter pair, initialised to the empty string;
+    # both are returned so other cells can read and update the shared text.
+    get_text_content, set_text_content = mo.state("")
+    return get_text_content, set_text_content
+@app.cell
+def _(mo):
+    # Placeholder texts, one per supported language.
+    # .strip() removes only the outer whitespace; indentation inside the
+    # multi-line Japanese sample is kept as written.
+    en_placeholder = """
+    Mrs. Ferrars died on the night of the 16th⁠–⁠17th September⁠—a Thursday. I was sent for at eight o’clock on the morning of Friday the 17th. There was nothing to be done. She had been dead some hours.
+    """.strip()
+    ja_placeholder = """
+    吾輩は猫である。名前はまだ無い。
+    　どこで生れたかとんと見当がつかぬ。何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している。
+    """.strip()
+    # Create UI element for language selection (defaults to English)
+    language_selector = mo.ui.radio(
+        options=["English", "Japanese"], value="English", label="Language"
     )
+    # Return selector and placeholders
+    return en_placeholder, ja_placeholder, language_selector
+@app.cell
+def _(
+    en_placeholder,
+    get_text_content,
+    ja_placeholder,
+    language_selector,
+    mo,
+    set_text_content,
+):
+    # Define text_input dynamically based on language:
+    # any selector value other than "English" falls back to the Japanese sample.
+    current_placeholder = (
+        en_placeholder if language_selector.value == "English" else ja_placeholder
     )
+    text_input = mo.ui.text_area(
+        # Read value from state
+        value=get_text_content(),
+        label="Enter text",
+        placeholder=current_placeholder,
+        full_width=True,
+        # Update state on user input
+        on_change=lambda v: set_text_content(v),
+    )
+    # current_placeholder is returned as well so the button cell can apply it
+    return current_placeholder, text_input
+@app.cell
+def _(current_placeholder, mo, set_text_content):
+    # Button that copies the current language's placeholder into the shared text state
+    def apply_placeholder():
+        set_text_content(current_placeholder)
+    apply_placeholder_button = mo.ui.button(
+        # on_click receives the button's value, which is ignored here
+        label="Use Placeholder Text", on_click=lambda _: apply_placeholder()
     )
+    return (apply_placeholder_button,)
 @app.cell
+def _(apply_placeholder_button, language_selector, mo, text_input):
+    # Input area layout: text area on top, language selector and
+    # placeholder button side by side underneath.
+    mo.vstack(
+        [
+            text_input,
+            mo.hstack([language_selector, apply_placeholder_button], justify="start"),
+        ]
     )
     return
+@app.cell
+def _(get_text_content, language_selector, mo, nlp_en, nlp_ja):
+    # Analyze text using spaCy based on selected language
+    # Read text from state
+    current_text = get_text_content()
+    # Any selector value other than "English" is processed with the Japanese pipeline
+    if language_selector.value == "English":
+        doc = nlp_en(current_text)
+    else:
+        doc = nlp_ja(current_text)
+    # Tokenized version and count
+    tokenized_text = [token.text for token in doc]
+    token_count = len(tokenized_text)
+    # Cell output: the tokens joined by " | " followed by their count
     mo.md(
+        f"**Tokenized Text:** {' | '.join(tokenized_text)}\n\n**Token Count:** {token_count}"
     )
+    # The raw text and parsed doc are shared with the table and LLM tokenizer cells
+    return current_text, doc
+@app.cell
+def _(doc, mo, pl):
+    # Create a polars DataFrame with token attributes (one row per spaCy token)
+    token_data = pl.DataFrame(
         {
+            "Token": [token.text for token in doc],
+            "Lemma": [token.lemma_ for token in doc],
+            "POS": [token.pos_ for token in doc],
+            "Tag": [token.tag_ for token in doc],
+            "Morph": [
+                str(token.morph) for token in doc
+            ],  # To be more precise, this should be merged back in via .to_dict()
+            "Token Position": list(range(len(doc))),
+            # Repeats each sentence index once per token in that sentence;
+            # assumes doc.sents covers every token so the column length matches — TODO confirm
+            "Sentence Number": [i for i, sent in enumerate(doc.sents) for token in sent],
         }
     )
+    # Cell output: interactive table, 50 rows per page
+    mo.ui.dataframe(token_data, page_size=50)
+    return (token_data,)
+@app.cell
+def _(mo):
+    # Create UI element for selecting the column to visualize.
+    # The options are column names of the token table; "POS" is the default.
+    column_selector = mo.ui.dropdown(
+        options=["POS", "Tag", "Lemma", "Token", "Morph"],
+        value="POS",
+        label="Select column to visualize",
     )
+    # Bare expression: displays the dropdown as this cell's output
+    column_selector
+    return (column_selector,)
+@app.cell
+def _(alt, column_selector, mo, token_data):
+    # Bar chart of value frequencies for the selected token attribute.
+    # Halt the cell with a message while there is no token data
+    mo.stop(token_data.is_empty(), "Please set input text.")
+    selected_column = column_selector.value
+    # Calculate value counts for the selected column:
+    # most frequent first, ties ordered by the value itself (ascending)
+    counts_df = (
+        token_data[selected_column]
+        .value_counts()
+        .sort(by=["count", selected_column], descending=[True, False])
+    )
+    chart = (
+        alt.Chart(counts_df)
+        .mark_bar()
+        .encode(
+            x=alt.X("count", title="Frequency"),
+            # sort=None keeps the row order of counts_df instead of Altair's default sort
+            y=alt.Y(selected_column, title=selected_column, sort=None),
+            tooltip=[selected_column, "count"],
+        )
+        .properties(title=f"{selected_column} Distribution")
+        .interactive()
     )
+    mo.ui.altair_chart(chart)
     return
 @app.cell
+def _(llm_model_choices, mo):
+    # UI for selecting the LLM tokenizer model
+    llm_tokenizer_selector = mo.ui.dropdown(
+        options=llm_model_choices,
+        # The last list entry is the default
+        value=llm_model_choices[-1],  # Default to gpt2 for faster loading initially
+        label="Select LLM Tokenizer Model",
+    )
+    # Bare expression: displays the dropdown as this cell's output
+    llm_tokenizer_selector
+    return (llm_tokenizer_selector,)
 @app.cell
+def _(AutoTokenizer, llm_tokenizer_selector):
+    # Load the selected tokenizer
+    # Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
+    # This cell will re-run when llm_tokenizer_selector.value changes
+    # NOTE(review): nothing in this cell caches the tokenizer; from_pretrained is
+    # called again on every re-run — confirm whether an explicit cache is wanted.
+    selected_model_name = llm_tokenizer_selector.value
+    tokenizer = AutoTokenizer.from_pretrained(selected_model_name)
+    return (tokenizer,)
 @app.cell
+def _(math):
+    # Function to calculate token statistics
+    def get_token_stats(tokens: list, original_text: str) -> dict:
+        """Calculate summary statistics for a list of string tokens.
+        Args:
+            tokens: Token strings as produced by a tokenizer's ``tokenize``.
+            original_text: The text the tokens were produced from; only its
+                length is used (characters per token).
+        Returns:
+            A dict with two sub-dicts, ``basic_stats`` (counts and ratios) and
+            ``length_stats`` (token length in characters). The same keys are
+            present, all set to 0, when ``tokens`` is empty.
+        """
+        if not tokens:
+            # Keep the full structure for empty input so rendering code needs no special case
+            return {
+                "basic_stats": {
+                    "total_tokens": 0,
+                    "unique_tokens": 0,
+                    "compression_ratio": 0,
+                    "space_tokens": 0,
+                    "newline_tokens": 0,
+                    "special_tokens": 0,
+                    "punctuation_tokens": 0,
+                    "unique_percentage": 0,
+                },
+                "length_stats": {
+                    "avg_length": 0,
+                    "std_dev": 0,
+                    "min_length": 0,
+                    "max_length": 0,
+                    "median_length": 0,
+                },
+            }
+        # From here on tokens is non-empty, so the divisions below are safe
+        total_tokens = len(tokens)
+        unique_tokens = len(set(tokens))
+        lengths = [len(t) for t in tokens]
+        avg_length = sum(lengths) / total_tokens
+        compression_ratio = len(original_text) / total_tokens
+        # Token type analysis (Note: Heuristics might vary between tokenizers)
+        # "\u2581" is SentencePiece's space marker and "Ġ" the byte-level BPE one;
+        # the escape is used so the marker cannot be confused with a plain space.
+        space_markers = ("Ġ", "\u2581", " ")
+        space_tokens = sum(1 for t in tokens if t.startswith(space_markers))
+        # Check for common newline representations
+        newline_tokens = sum(
+            1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
+        )
+        # A broader definition for special tokens based on common patterns (control tokens)
+        special_tokens = sum(
+            1
+            for t in tokens
+            if (t.startswith("<") and t.endswith(">"))
+            or (t.startswith("[") and t.endswith("]"))
+        )
+        # Simple punctuation check: single non-alphanumeric characters that are
+        # not themselves whitespace or whitespace markers
+        not_punctuation = (" ", "\n", "Ġ", "Ċ", "\u2581")
+        punctuation_tokens = sum(
+            1
+            for t in tokens
+            if len(t) == 1 and not t.isalnum() and t not in not_punctuation
+        )
+        # Length distribution (population standard deviation)
+        variance = sum((x - avg_length) ** 2 for x in lengths) / total_tokens
+        std_dev = math.sqrt(variance)
+        sorted_lengths = sorted(lengths)
+        middle = total_tokens // 2
+        if total_tokens % 2:
+            median_length = sorted_lengths[middle]
+        else:
+            # Even count: the median is the mean of the two middle values
+            median_length = (sorted_lengths[middle - 1] + sorted_lengths[middle]) / 2
+        return {
+            "basic_stats": {
+                "total_tokens": total_tokens,
+                "unique_tokens": unique_tokens,
+                "compression_ratio": round(compression_ratio, 2),
+                "space_tokens": space_tokens,
+                "newline_tokens": newline_tokens,
+                "special_tokens": special_tokens,
+                "punctuation_tokens": punctuation_tokens,
+                "unique_percentage": round(unique_tokens / total_tokens * 100, 1),
+            },
+            "length_stats": {
+                "avg_length": round(avg_length, 2),
+                "std_dev": round(std_dev, 2),
+                "min_length": sorted_lengths[0],
+                "max_length": sorted_lengths[-1],
+                "median_length": median_length,
+            },
+        }
+    return (get_token_stats,)
 @app.cell
+def _(hashlib):
+    def get_varied_color(token: str) -> dict:
+        """Derive a light background color and a dark text color from a token.
+        The MD5 digest of the token picks the values, so the same token always
+        maps to the same pair of HSL color strings.
+        """
+        digest = hashlib.md5(token.encode()).hexdigest()
+        # Three separate hex fields of the digest drive hue, saturation and lightness
+        hue_field, saturation_field, lightness_field = (
+            int(digest[start:stop], 16) for start, stop in ((0, 3), (3, 5), (5, 7))
+        )
+        hue = hue_field % 360
+        saturation = 70 + saturation_field % 20  # 70-89%
+        lightness = 80 + lightness_field % 10  # 80-89%, i.e. a light background
+        background = f"hsl({hue}, {saturation}%, {lightness}%)"
+        # Text reuses hue and saturation at a fixed 20% lightness for contrast
+        text = f"hsl({hue}, {saturation}%, 20%)"
+        return {"background": background, "text": text}
+    return (get_varied_color,)
+@app.function
+def fix_token(token: str) -> str:
+    """Return *token* rewritten for display with whitespace markers made visible.
+    Space markers become a middle dot ("·") and newline markers become "↵"
+    followed by a real newline. Every occurrence is replaced, wherever it sits
+    in the token, so mixed tokens such as "ĠĊ" or "ĊĠĠ" are handled too.
+    """
+    # "\u2581" is the SentencePiece space marker, "Ġ" the BPE space marker;
+    # a literal space is treated the same way.
+    for space_marker in ("\u2581", "Ġ", " "):
+        token = token.replace(space_marker, "·")
+    # "Ċ" is the BPE newline marker, "<0x0A>" the byte representation of newline
+    for newline_marker in ("Ċ", "<0x0A>"):
+        token = token.replace(newline_marker, "↵\n")
+    return token
+@app.function
+def get_tokenizer_info(tokenizer):
+    """
+    Extract useful information from a tokenizer.
+    Returns a dictionary with tokenizer details: ``vocab_size`` (when it can be
+    determined), ``model_max_length``, ``tokenizer_type`` and ``special_tokens``
+    (a name -> token mapping, or the string "None found"). If anything raises
+    while reading the tokenizer, the details gathered so far are returned
+    together with an ``error`` message instead of propagating the exception.
+    """
+    named_token_attrs = (
+        "pad_token",
+        "eos_token",
+        "bos_token",
+        "sep_token",
+        "cls_token",
+        "unk_token",
+        "mask_token",
+    )
+    info = {}
+    try:
+        # Get vocabulary size (dictionary size)
+        if hasattr(tokenizer, "vocab_size"):
+            info["vocab_size"] = tokenizer.vocab_size
+        elif hasattr(tokenizer, "get_vocab"):
+            info["vocab_size"] = len(tokenizer.get_vocab())
+        # Get model max length if available; a missing or None value must not
+        # reach the comparison, and huge values are treated as "not specified"
+        max_length = getattr(tokenizer, "model_max_length", None)
+        if max_length is not None and max_length < 1000000:
+            info["model_max_length"] = max_length
+        else:
+            info["model_max_length"] = "Not specified or very large"
+        # Check tokenizer type
+        info["tokenizer_type"] = tokenizer.__class__.__name__
+        special_tokens = {}
+        # Record every named role, so a token shared by several roles
+        # (for example the same string as bos and eos) appears under each name
+        for attr_name in named_token_attrs:
+            token_value = getattr(tokenizer, attr_name, None)
+            if token_value and str(token_value).strip():
+                special_tokens[attr_name] = str(token_value)
+        # Add remaining special tokens under unique keys so none overwrites another
+        seen_values = set(special_tokens.values())
+        unnamed_count = 0
+        for token in getattr(tokenizer, "all_special_tokens", None) or ():
+            token_text = str(token)
+            if not token_text.strip() or token_text in seen_values:
+                continue
+            seen_values.add(token_text)
+            unnamed_count += 1
+            if unnamed_count == 1:
+                special_tokens["unknown_special_token"] = token_text
+            else:
+                special_tokens[f"unknown_special_token_{unnamed_count}"] = token_text
+        info["special_tokens"] = special_tokens if special_tokens else "None found"
+    except Exception as e:
+        # Best-effort metadata: report the failure in the result rather than
+        # breaking the notebook cell that displays it
+        info["error"] = f"Error extracting tokenizer info: {str(e)}"
+    return info
+@app.cell
+def _(mo):
+    # Toggle between showing token text (default, off) and token IDs
+    show_ids_switch = mo.ui.switch(label="Show Token IDs instead of Text", value=False)
+    return (show_ids_switch,)
+@app.cell
+def _(
+    current_text,
+    get_token_stats,
+    get_varied_color,
+    llm_tokenizer_selector,
+    mo,
+    show_ids_switch,
+    tokenizer,
+):
+    # Underscore-prefixed names are private to a cell, so this import does not
+    # add a notebook-level variable
+    import html as _html
+    # --- Tokenization and Data Preparation ---
+    # Get tokenizer metadata
+    # NOTE(review): tokenizer_info is computed but not rendered below
+    tokenizer_info = get_tokenizer_info(tokenizer)
+    # Tokenize the input text
+    # Use tokenize to get string representations for analysis and display
+    all_tokens = tokenizer.tokenize(current_text)
+    total_token_count = len(all_tokens)
+    # Limit the number of tokens for display to avoid browser slowdown
+    display_limit = 1000
+    display_tokens = all_tokens[:display_limit]
+    display_limit_reached = total_token_count > display_limit
+    # Generate data for visualization
+    llm_token_data = []
+    for idx, token in enumerate(display_tokens):
+        colors = get_varied_color(token)
+        fixed_token_display = fix_token(token)  # Apply fixes for display
+        # Handle potential errors during ID conversion (e.g., unknown tokens if not handled by tokenizer)
+        try:
+            token_id = tokenizer.convert_tokens_to_ids(token)
+        except KeyError:
+            token_id = (
+                tokenizer.unk_token_id if hasattr(tokenizer, "unk_token_id") else -1
+            )  # Use UNK id or -1
+        llm_token_data.append(
+            {
+                "original": token,
+                "display": fixed_token_display,
+                "colors": colors,
+                "is_newline": "↵"
+                in fixed_token_display,  # Check if it represents a newline
+                "token_id": token_id,
+                "token_index": idx,
+            }
+        )
+    # Calculate statistics using the full token list
+    token_stats = get_token_stats(all_tokens, current_text)
+    # Construct HTML for colored tokens
+    html_parts = []
+    for item in llm_token_data:
+        # Use pre-wrap to respect spaces and newlines within the token display
+        style = f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
+        # Tokens come from user text and may contain <, >, & or quotes
+        # (e.g. byte tokens such as "<0xE3>"), so escape them before they are
+        # placed inside the markup or the title attribute.
+        title = _html.escape(
+            f"Original: {item['original']}\nID: {item['token_id']}", quote=True
+        )
+        raw_content = str(item["token_id"]) if show_ids_switch.value else item["display"]
+        display_content = _html.escape(raw_content, quote=False)
+        html_parts.append(
+            f'<span style="{style}" title="{title}">{display_content}</span>'
+        )
+    token_viz_html = mo.Html(f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>')
+    basic_stats = token_stats['basic_stats']
+    length_stats = token_stats['length_stats']
+    basic_stats_md = "**Basic Stats:**\n\n" + "\n".join(
+        f"-   **{key.replace('_', ' ').title()}:** `{value}`"
+        for key, value in basic_stats.items()
+    )
+    length_stats_md = "**Length (Character) Stats:**\n\n" + "\n".join(
+        f"-   **{key.replace('_', ' ').title()}:** `{value}`"
+        for key, value in length_stats.items()
+    )
+    # Build the markdown from unindented sections separated by blank lines, so
+    # the layout does not depend on how indentation is stripped once the
+    # multi-line stats lists are interpolated.
+    md_sections = [
+        f"# LLM tokenizer: {llm_tokenizer_selector.value}",
+        f"{show_ids_switch}",
+        "## Tokenizer output",
+        f"{mo.as_html(token_viz_html)}",
+    ]
+    if display_limit_reached:
+        # The statistics cover all tokens, so say when the view is truncated
+        md_sections.append(
+            f"*Showing the first {display_limit} of {total_token_count} tokens.*"
+        )
+    md_sections += ["## Token Statistics", basic_stats_md, length_stats_md]
+    mo.md("\n\n".join(md_sections))
     return
 @app.cell
+def _():
+    # Empty cell: defines nothing and produces no output
     return
 if __name__ == "__main__":
+    # Run the notebook's cells when this file is executed as a script
     app.run()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,19 @@

+[project]
+name = "counting-words"
+version = "0.1.0"
+description = "Counting words in English and Japanese texts demo"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "marimo>=0.13.0",
+    "polars>=1.27.1",
+    "altair>=5.5.0",
+    "spacy>=3.8.5",
+    "en-core-web-md",
+    "ja-core-news-md",
+    "transformers>=4.51.3",
+]
+[tool.uv.sources]
+en-core-web-md = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl" }
+ja-core-news-md = { url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.8.0/ja_core_news_md-3.8.0-py3-none-any.whl" }

requirements.txt DELETED Viewed

@@ -1,5 +0,0 @@
-marimo
-# Or a specific version
-# marimo>=0.9.0
-# Add other dependencies as needed