Bor Hodošček committed on
Commit
4d52104
·
unverified ·
1 Parent(s): 75e7ffd

feat: fix dockerfile

Browse files
Files changed (4) hide show
  1. Dockerfile +1 -1
  2. app.py +488 -383
  3. pyproject.toml +19 -0
  4. requirements.txt +0 -5
Dockerfile CHANGED
@@ -13,4 +13,4 @@ RUN uv sync
13
  COPY --chown=user . /app
14
  USER user
15
 
16
- CMD ["marimo", "run", "app.py", "--include-code", "--host", "0.0.0.0", "--port", "7860"]
 
13
  COPY --chown=user . /app
14
  USER user
15
 
16
+ CMD ["uv", "run", "marimo", "run", "app.py", "--include-code", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,470 +1,575 @@
 
 
 
 
 
1
  import marimo
2
 
3
- __generated_with = "0.9.2"
4
- app = marimo.App()
5
 
6
 
7
  @app.cell
8
- def __():
9
  import marimo as mo
10
-
11
- mo.md("# Welcome to marimo! 🌊🍃")
12
- return (mo,)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
 
15
  @app.cell
16
- def __(mo):
17
- slider = mo.ui.slider(1, 22)
18
- return (slider,)
19
 
20
 
21
  @app.cell
22
- def __(mo, slider):
23
- mo.md(
24
- f"""
25
- marimo is a **reactive** Python notebook.
26
 
27
- This means that unlike traditional notebooks, marimo notebooks **run
28
- automatically** when you modify them or
29
- interact with UI elements, like this slider: {slider}.
30
 
31
- {"##" + "🍃" * slider.value}
32
- """
 
 
 
 
 
 
 
 
 
 
 
 
33
  )
34
- return
35
-
36
 
37
- @app.cell(hide_code=True)
38
- def __(mo):
39
- mo.accordion(
40
- {
41
- "Tip: disabling automatic execution": mo.md(
42
- rf"""
43
- marimo lets you disable automatic execution: just go into the
44
- notebook settings and set
45
-
46
- "Runtime > On Cell Change" to "lazy".
47
-
48
- When the runtime is lazy, after running a cell, marimo marks its
49
- descendants as stale instead of automatically running them. The
50
- lazy runtime puts you in control over when cells are run, while
51
- still giving guarantees about the notebook state.
52
- """
53
- )
54
- }
55
- )
56
- return
57
 
58
 
59
- @app.cell(hide_code=True)
60
- def __(mo):
61
- mo.md(
62
- """
63
- Tip: This is a tutorial notebook. You can create your own notebooks
64
- by entering `marimo edit` at the command line.
65
- """
66
- ).callout()
67
- return
68
-
69
-
70
- @app.cell(hide_code=True)
71
- def __(mo):
72
- mo.md(
73
- """
74
- ## 1. Reactive execution
75
-
76
- A marimo notebook is made up of small blocks of Python code called
77
- cells.
78
-
79
- marimo reads your cells and models the dependencies among them: whenever
80
- a cell that defines a global variable is run, marimo
81
- **automatically runs** all cells that reference that variable.
82
-
83
- Reactivity keeps your program state and outputs in sync with your code,
84
- making for a dynamic programming environment that prevents bugs before they
85
- happen.
86
- """
87
  )
88
- return
89
-
 
 
 
 
 
 
 
 
90
 
91
- @app.cell(hide_code=True)
92
- def __(changed, mo):
93
- (
94
- mo.md(
95
- f"""
96
- **✨ Nice!** The value of `changed` is now {changed}.
97
 
98
- When you updated the value of the variable `changed`, marimo
99
- **reacted** by running this cell automatically, because this cell
100
- references the global variable `changed`.
 
101
 
102
- Reactivity ensures that your notebook state is always
103
- consistent, which is crucial for doing good science; it's also what
104
- enables marimo notebooks to double as tools and apps.
105
- """
106
- )
107
- if changed
108
- else mo.md(
109
- """
110
- **🌊 See it in action.** In the next cell, change the value of the
111
- variable `changed` to `True`, then click the run button.
112
- """
113
- )
114
  )
115
- return
116
 
117
 
118
  @app.cell
119
- def __():
120
- changed = False
121
- return (changed,)
122
-
123
-
124
- @app.cell(hide_code=True)
125
- def __(mo):
126
- mo.accordion(
127
- {
128
- "Tip: execution order": (
129
- """
130
- The order of cells on the page has no bearing on
131
- the order in which cells are executed: marimo knows that a cell
132
- reading a variable must run after the cell that defines it. This
133
- frees you to organize your code in the way that makes the most
134
- sense for you.
135
- """
136
- )
137
- }
138
  )
139
  return
140
 
141
 
142
- @app.cell(hide_code=True)
143
- def __(mo):
 
 
 
 
 
 
 
 
 
 
 
 
144
  mo.md(
145
- """
146
- **Global names must be unique.** To enable reactivity, marimo imposes a
147
- constraint on how names appear in cells: no two cells may define the same
148
- variable.
149
- """
150
  )
151
- return
152
 
153
 
154
- @app.cell(hide_code=True)
155
- def __(mo):
156
- mo.accordion(
 
157
  {
158
- "Tip: encapsulation": (
159
- """
160
- By encapsulating logic in functions, classes, or Python modules,
161
- you can minimize the number of global variables in your notebook.
162
- """
163
- )
 
 
 
164
  }
165
  )
166
- return
167
 
 
 
168
 
169
- @app.cell(hide_code=True)
170
- def __(mo):
171
- mo.accordion(
172
- {
173
- "Tip: private variables": (
174
- """
175
- Variables prefixed with an underscore are "private" to a cell, so
176
- they can be defined by multiple cells.
177
- """
178
- )
179
- }
180
  )
181
- return
182
 
 
 
183
 
184
- @app.cell(hide_code=True)
185
- def __(mo):
186
- mo.md(
187
- """
188
- ## 2. UI elements
189
 
190
- Cells can output interactive UI elements. Interacting with a UI
191
- element **automatically triggers notebook execution**: when
192
- you interact with a UI element, its value is sent back to Python, and
193
- every cell that references that element is re-run.
 
 
 
 
 
 
 
194
 
195
- marimo provides a library of UI elements to choose from under
196
- `marimo.ui`.
197
- """
 
 
 
 
 
 
 
198
  )
 
199
  return
200
 
201
 
202
  @app.cell
203
- def __(mo):
204
- mo.md("""**🌊 Some UI elements.** Try interacting with the below elements.""")
205
- return
 
 
 
 
 
 
206
 
207
 
208
  @app.cell
209
- def __(mo):
210
- icon = mo.ui.dropdown(["🍃", "🌊", "✨"], value="🍃")
211
- return (icon,)
 
 
 
 
 
212
 
213
 
214
  @app.cell
215
- def __(icon, mo):
216
- repetitions = mo.ui.slider(1, 16, label=f"number of {icon.value}: ")
217
- return (repetitions,)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
- @app.cell
221
- def __(icon, repetitions):
222
- icon, repetitions
223
- return
224
 
225
 
226
  @app.cell
227
- def __(icon, mo, repetitions):
228
- mo.md("# " + icon.value * repetitions.value)
229
- return
230
-
231
-
232
- @app.cell(hide_code=True)
233
- def __(mo):
234
- mo.md(
235
- """
236
- ## 3. marimo is just Python
237
-
238
- marimo cells parse Python (and only Python), and marimo notebooks are
239
- stored as pure Python files — outputs are _not_ included. There's no
240
- magical syntax.
241
-
242
- The Python files generated by marimo are:
243
-
244
- - easily versioned with git, yielding minimal diffs
245
- - legible for both humans and machines
246
- - formattable using your tool of choice,
247
- - usable as Python scripts, with UI elements taking their default
248
- values, and
249
- - importable by other modules (more on that in the future).
250
- """
251
- )
252
- return
253
-
254
-
255
- @app.cell(hide_code=True)
256
- def __(mo):
257
- mo.md(
258
- """
259
- ## 4. Running notebooks as apps
260
-
261
- marimo notebooks can double as apps. Click the app window icon in the
262
- bottom-right to see this notebook in "app view."
263
-
264
- Serve a notebook as an app with `marimo run` at the command-line.
265
- Of course, you can use marimo just to level-up your
266
- notebooking, without ever making apps.
267
- """
268
- )
269
- return
270
-
271
 
272
- @app.cell(hide_code=True)
273
- def __(mo):
274
- mo.md(
275
- """
276
- ## 5. The `marimo` command-line tool
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
 
278
- **Creating and editing notebooks.** Use
279
 
280
- ```
281
- marimo edit
282
- ```
 
283
 
284
- in a terminal to start the marimo notebook server. From here
285
- you can create a new notebook or edit existing ones.
286
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
 
288
- **Running as apps.** Use
 
 
 
 
 
 
 
 
 
 
 
 
 
289
 
290
- ```
291
- marimo run notebook.py
292
- ```
293
 
294
- to start a webserver that serves your notebook as an app in read-only mode,
295
- with code cells hidden.
296
 
297
- **Convert a Jupyter notebook.** Convert a Jupyter notebook to a marimo
298
- notebook using `marimo convert`:
 
 
299
 
300
- ```
301
- marimo convert your_notebook.ipynb > your_app.py
302
- ```
 
303
 
304
- **Tutorials.** marimo comes packaged with tutorials:
305
 
306
- - `dataflow`: more on marimo's automatic execution
307
- - `ui`: how to use UI elements
308
- - `markdown`: how to write markdown, with interpolated values and
309
- LaTeX
310
- - `plots`: how plotting works in marimo
311
- - `sql`: how to use SQL
312
- - `layout`: layout elements in marimo
313
- - `fileformat`: how marimo's file format works
314
- - `markdown-format`: for using `.md` files in marimo
315
- - `for-jupyter-users`: if you are coming from Jupyter
316
 
317
- Start a tutorial with `marimo tutorial`; for example,
318
 
319
- ```
320
- marimo tutorial dataflow
321
- ```
322
 
323
- In addition to tutorials, we have examples in our
324
- [our GitHub repo](https://www.github.com/marimo-team/marimo/tree/main/examples).
325
- """
326
- )
327
- return
328
 
 
329
 
330
- @app.cell(hide_code=True)
331
- def __(mo):
332
- mo.md(
333
- """
334
- ## 6. The marimo editor
335
 
336
- Here are some tips to help you get started with the marimo editor.
337
- """
338
- )
339
  return
340
 
341
 
342
  @app.cell
343
- def __(mo, tips):
344
- mo.accordion(tips)
345
- return
346
-
347
-
348
- @app.cell(hide_code=True)
349
- def __(mo):
350
- mo.md("""## Finally, a fun fact""")
351
  return
352
 
353
 
354
- @app.cell(hide_code=True)
355
- def __(mo):
356
- mo.md(
357
- """
358
- The name "marimo" is a reference to a type of algae that, under
359
- the right conditions, clumps together to form a small sphere
360
- called a "marimo moss ball". Made of just strands of algae, these
361
- beloved assemblages are greater than the sum of their parts.
362
- """
363
- )
364
- return
365
-
366
-
367
- @app.cell(hide_code=True)
368
- def __():
369
- tips = {
370
- "Saving": (
371
- """
372
- **Saving**
373
-
374
- - _Name_ your app using the box at the top of the screen, or
375
- with `Ctrl/Cmd+s`. You can also create a named app at the
376
- command line, e.g., `marimo edit app_name.py`.
377
-
378
- - _Save_ by clicking the save icon on the bottom right, or by
379
- inputting `Ctrl/Cmd+s`. By default marimo is configured
380
- to autosave.
381
- """
382
- ),
383
- "Running": (
384
- """
385
- 1. _Run a cell_ by clicking the play ( ▷ ) button on the top
386
- right of a cell, or by inputting `Ctrl/Cmd+Enter`.
387
-
388
- 2. _Run a stale cell_ by clicking the yellow run button on the
389
- right of the cell, or by inputting `Ctrl/Cmd+Enter`. A cell is
390
- stale when its code has been modified but not run.
391
-
392
- 3. _Run all stale cells_ by clicking the play ( ▷ ) button on
393
- the bottom right of the screen, or input `Ctrl/Cmd+Shift+r`.
394
- """
395
- ),
396
- "Console Output": (
397
- """
398
- Console output (e.g., `print()` statements) is shown below a
399
- cell.
400
- """
401
- ),
402
- "Creating, Moving, and Deleting Cells": (
403
- """
404
- 1. _Create_ a new cell above or below a given one by clicking
405
- the plus button to the left of the cell, which appears on
406
- mouse hover.
407
-
408
- 2. _Move_ a cell up or down by dragging on the handle to the
409
- right of the cell, which appears on mouse hover.
410
-
411
- 3. _Delete_ a cell by clicking the trash bin icon. Bring it
412
- back by clicking the undo button on the bottom right of the
413
- screen, or with `Ctrl/Cmd+Shift+z`.
414
- """
415
- ),
416
- "Disabling Automatic Execution": (
417
- """
418
- Via the notebook settings (gear icon) or footer panel, you
419
- can disable automatic execution. This is helpful when
420
- working with expensive notebooks or notebooks that have
421
- side-effects like database transactions.
422
- """
423
- ),
424
- "Disabling Cells": (
425
- """
426
- You can disable a cell via the cell context menu.
427
- marimo will never run a disabled cell or any cells that depend on it.
428
- This can help prevent accidental execution of expensive computations
429
- when editing a notebook.
430
- """
431
- ),
432
- "Code Folding": (
433
- """
434
- You can collapse or fold the code in a cell by clicking the arrow
435
- icons in the line number column to the left, or by using keyboard
436
- shortcuts.
437
-
438
- Use the command palette (`Ctrl/Cmd+k`) or a keyboard shortcut to
439
- quickly fold or unfold all cells.
440
- """
441
- ),
442
- "Code Formatting": (
443
- """
444
- If you have [ruff](https://github.com/astral-sh/ruff) installed,
445
- you can format a cell with the keyboard shortcut `Ctrl/Cmd+b`.
446
- """
447
- ),
448
- "Command Palette": (
449
- """
450
- Use `Ctrl/Cmd+k` to open the command palette.
451
- """
452
- ),
453
- "Keyboard Shortcuts": (
454
- """
455
- Open the notebook menu (top-right) or input `Ctrl/Cmd+Shift+h` to
456
- view a list of all keyboard shortcuts.
457
- """
458
- ),
459
- "Configuration": (
460
- """
461
- Configure the editor by clicking the gears icon near the top-right
462
- of the screen.
463
- """
464
- ),
465
- }
466
- return (tips,)
467
-
468
-
469
  if __name__ == "__main__":
470
  app.run()
 
1
+ # /// script
2
+ # [tool.marimo.runtime]
3
+ # auto_instantiate = false
4
+ # ///
5
+
6
  import marimo
7
 
8
+ __generated_with = "0.13.0"
9
+ app = marimo.App(width="medium")
10
 
11
 
12
@app.cell
def _():
    # Standard-library helpers that later cells receive as parameters.
    import hashlib
    import math

    # Third-party packages: notebook runtime, NLP, dataframes, charts, tokenizers.
    import altair as alt
    import marimo as mo
    import polars as pl
    import spacy
    from transformers import AutoTokenizer

    # One spaCy pipeline per supported input language.
    nlp_en = spacy.load("en_core_web_md")
    nlp_ja = spacy.load("ja_core_news_md")

    # Hugging Face model ids offered by the tokenizer dropdown.
    llm_model_choices = [
        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "google/gemma-3-27b-it",
        "deepseek-ai/DeepSeek-R1",
        "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        "Qwen/Qwen2.5-72B-Instruct",
        "google-bert/bert-large-uncased",
        "openai-community/gpt2",
    ]

    return (
        AutoTokenizer,
        alt,
        hashlib,
        llm_model_choices,
        math,
        mo,
        nlp_en,
        nlp_ja,
        pl,
    )
48
 
49
 
50
@app.cell
def _(mo):
    # Notebook title; the trailing expression is what the cell renders.
    mo.md("# Tokenization for English and Japanese")
    return
54
 
55
 
56
@app.cell
def _(mo):
    # Central state for the text input content.
    # Both the text area and the "Use Placeholder Text" button write to it
    # through the setter, and the analysis cells read it through the getter.
    get_text_content, set_text_content = mo.state("")
    return get_text_content, set_text_content
61
 
 
 
 
62
 
63
@app.cell
def _(mo):
    # Sample passages offered as placeholder text, one per supported language.
    # .strip() drops the newlines that surround each triple-quoted literal.
    en_placeholder = """
Mrs. Ferrars died on the night of the 16th⁠–⁠17th September⁠—a Thursday. I was sent for at eight o’clock on the morning of Friday the 17th. There was nothing to be done. She had been dead some hours.
""".strip()
    ja_placeholder = """
吾輩は猫である。名前はまだ無い。
 どこで生れたかとんと見当がつかぬ。何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している。
""".strip()

    # Radio button that selects which language the input text is analysed as.
    language_selector = mo.ui.radio(
        options=["English", "Japanese"], value="English", label="Language"
    )

    # Expose the selector and both placeholders to the other cells.
    return en_placeholder, ja_placeholder, language_selector
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
 
83
@app.cell
def _(
    en_placeholder,
    get_text_content,
    ja_placeholder,
    language_selector,
    mo,
    set_text_content,
):
    # Pick the placeholder matching the selected language; anything other
    # than "English" falls back to the Japanese sample.
    if language_selector.value == "English":
        current_placeholder = en_placeholder
    else:
        current_placeholder = ja_placeholder

    # The text area mirrors the shared state: it is initialised from the
    # getter and pushes every user edit back through the setter.
    text_input = mo.ui.text_area(
        value=get_text_content(),
        label="Enter text",
        placeholder=current_placeholder,
        full_width=True,
        on_change=set_text_content,
    )
    return current_placeholder, text_input
106
 
 
 
 
 
 
 
107
 
108
@app.cell
def _(current_placeholder, mo, set_text_content):
    # Click handler: copy the current placeholder passage into the shared
    # text state. The click payload is ignored and nothing is returned.
    def _fill_with_placeholder(_click_value):
        set_text_content(current_placeholder)

    apply_placeholder_button = mo.ui.button(
        label="Use Placeholder Text",
        on_click=_fill_with_placeholder,
    )
    return (apply_placeholder_button,)
117
 
118
 
119
@app.cell
def _(apply_placeholder_button, language_selector, mo, text_input):
    # Input panel: the text area on top, with the language radio and the
    # placeholder button on one left-aligned row beneath it.
    _controls_row = mo.hstack(
        [language_selector, apply_placeholder_button], justify="start"
    )
    mo.vstack([text_input, _controls_row])
    return
128
 
129
 
130
@app.cell
def _(get_text_content, language_selector, mo, nlp_en, nlp_ja):
    # Run the spaCy pipeline for the selected language over the text held in
    # the shared state, then show the token strings and how many there are.
    current_text = get_text_content()
    _pipeline = nlp_en if language_selector.value == "English" else nlp_ja
    doc = _pipeline(current_text)

    _token_texts = [_tok.text for _tok in doc]
    _joined_tokens = " | ".join(_token_texts)

    mo.md(
        f"**Tokenized Text:** {_joined_tokens}\n\n**Token Count:** {len(_token_texts)}"
    )
    return current_text, doc
148
 
149
 
150
@app.cell
def _(doc, mo, pl):
    # Number each token with the index of the sentence it belongs to.
    _sentence_numbers = []
    for _sentence_index, _sentence in enumerate(doc.sents):
        _sentence_numbers.extend(_sentence_index for _ in _sentence)

    # One row per spaCy token, one column per attribute of interest.
    # "Morph" is the stringified morphology; to be more precise it should be
    # merged back in via .to_dict().
    _columns = {
        "Token": [_tok.text for _tok in doc],
        "Lemma": [_tok.lemma_ for _tok in doc],
        "POS": [_tok.pos_ for _tok in doc],
        "Tag": [_tok.tag_ for _tok in doc],
        "Morph": [str(_tok.morph) for _tok in doc],
        "Token Position": list(range(len(doc))),
        "Sentence Number": _sentence_numbers,
    }
    token_data = pl.DataFrame(_columns)

    mo.ui.dataframe(token_data, page_size=50)
    return (token_data,)
169
 
170
+
171
@app.cell
def _(mo):
    # Dropdown choosing which token attribute column the frequency chart
    # summarises; the option names match columns of the token dataframe.
    column_selector = mo.ui.dropdown(
        options=["POS", "Tag", "Lemma", "Token", "Morph"],
        value="POS",
        label="Select column to visualize",
    )

    # Trailing expression: render the dropdown as this cell's output.
    column_selector
    return (column_selector,)
182
 
 
 
 
 
 
183
 
184
@app.cell
def _(alt, column_selector, mo, token_data):
    # Nothing to plot until some text has been analysed.
    mo.stop(token_data.is_empty(), "Please set input text.")

    _column = column_selector.value

    # Frequency of each distinct value: most frequent first, ties broken by
    # the value itself in ascending order.
    _value_counts = token_data[_column].value_counts()
    _ranked_counts = _value_counts.sort(
        by=["count", _column], descending=[True, False]
    )

    # Horizontal bar chart; sort=None keeps the dataframe's row order.
    _encoded_bars = alt.Chart(_ranked_counts).mark_bar().encode(
        x=alt.X("count", title="Frequency"),
        y=alt.Y(_column, title=_column, sort=None),
        tooltip=[_column, "count"],
    )
    _chart = _encoded_bars.properties(title=f"{_column} Distribution").interactive()

    mo.ui.altair_chart(_chart)
    return
209
 
210
 
211
@app.cell
def _(llm_model_choices, mo):
    # UI for selecting the LLM tokenizer model.
    llm_tokenizer_selector = mo.ui.dropdown(
        options=llm_model_choices,
        # Default to the last entry (gpt2) for faster loading initially.
        value=llm_model_choices[-1],
        label="Select LLM Tokenizer Model",
    )
    # Trailing expression: render the dropdown as this cell's output.
    llm_tokenizer_selector
    return (llm_tokenizer_selector,)
221
 
222
 
223
@app.cell
def _(AutoTokenizer, llm_tokenizer_selector):
    # Load the tokenizer for the model chosen in the dropdown.
    # Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
    # marimo re-runs this cell whenever llm_tokenizer_selector.value changes.
    tokenizer = AutoTokenizer.from_pretrained(llm_tokenizer_selector.value)
    return (tokenizer,)
232
 
233
 
234
  @app.cell
235
+ def _(math):
236
+ # Function to calculate token statistics
237
def get_token_stats(tokens: list, original_text: str) -> dict:
    """Calculate summary statistics about a tokenizer's output.

    Args:
        tokens: Token strings as produced by the tokenizer.
        original_text: The text that was tokenized; only its length is used,
            to compute the characters-per-token compression ratio.

    Returns:
        A dict with two sub-dicts:

        * ``"basic_stats"``: ``total_tokens``, ``unique_tokens``,
          ``compression_ratio``, ``space_tokens``, ``newline_tokens``,
          ``special_tokens``, ``punctuation_tokens``, ``unique_percentage``.
        * ``"length_stats"``: ``avg_length``, ``std_dev`` (population),
          ``min_length``, ``max_length``, ``median_length``.

        Every value is ``0`` when ``tokens`` is empty.
    """
    if not tokens:
        # Return the full structure even for empty input so that callers can
        # render it without special-casing.
        return {
            "basic_stats": {
                "total_tokens": 0,
                "unique_tokens": 0,
                "compression_ratio": 0,
                "space_tokens": 0,
                "newline_tokens": 0,
                "special_tokens": 0,
                "punctuation_tokens": 0,
                "unique_percentage": 0,
            },
            "length_stats": {
                "avg_length": 0,
                "std_dev": 0,
                "min_length": 0,
                "max_length": 0,
                "median_length": 0,
            },
        }

    # From here on there is at least one token, so the divisions are safe.
    total_tokens = len(tokens)
    unique_tokens = len(set(tokens))
    compression_ratio = len(original_text) / total_tokens

    # Token type analysis (heuristics; conventions vary between tokenizers).
    # Leading-space markers: byte-level BPE 'Ġ', SentencePiece U+2581, and a
    # literal space.
    space_markers = ("Ġ", "\u2581", " ")
    space_tokens = sum(1 for t in tokens if t.startswith(space_markers))
    # Common newline representations: BPE 'Ċ', a raw newline, byte fallback.
    newline_tokens = sum(
        1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
    )
    # Control tokens usually look like <...> or [...].
    special_tokens = sum(
        1
        for t in tokens
        if (t.startswith("<") and t.endswith(">"))
        or (t.startswith("[") and t.endswith("]"))
    )
    # Single non-alphanumeric characters that are not whitespace markers.
    non_punctuation = (" ", "\n", "Ġ", "Ċ", "\u2581")
    punctuation_tokens = sum(
        1
        for t in tokens
        if len(t) == 1 and not t.isalnum() and t not in non_punctuation
    )

    # Length distribution (in characters).
    lengths = sorted(len(t) for t in tokens)
    mean_length = sum(lengths) / total_tokens
    variance = sum((x - mean_length) ** 2 for x in lengths) / total_tokens
    std_dev = math.sqrt(variance)

    # True median: the middle value for odd counts, the mean of the two
    # middle values for even counts.
    middle = total_tokens // 2
    if total_tokens % 2:
        median_length = lengths[middle]
    else:
        median_length = (lengths[middle - 1] + lengths[middle]) / 2

    return {
        "basic_stats": {
            "total_tokens": total_tokens,
            "unique_tokens": unique_tokens,
            "compression_ratio": round(compression_ratio, 2),
            "space_tokens": space_tokens,
            "newline_tokens": newline_tokens,
            "special_tokens": special_tokens,
            "punctuation_tokens": punctuation_tokens,
            "unique_percentage": round(unique_tokens / total_tokens * 100, 1),
        },
        "length_stats": {
            "avg_length": round(mean_length, 2),
            "std_dev": round(std_dev, 2),
            "min_length": lengths[0],
            "max_length": lengths[-1],
            "median_length": median_length,
        },
    }
341
 
342
+ return (get_token_stats,)
 
 
 
343
 
344
 
345
  @app.cell
346
+ def _(hashlib):
347
def get_varied_color(token: str) -> dict:
    """Derive a light background / dark text HSL colour pair from a token.

    The colours come from the MD5 digest of the token text, so a given token
    always maps to the same pair.

    Returns:
        ``{"background": "hsl(...)", "text": "hsl(...)"}`` — CSS colour
        strings that share hue and saturation and differ only in lightness.
    """
    digest = hashlib.md5(token.encode()).hexdigest()

    def _hex_field(start: int, stop: int) -> int:
        # Interpret a slice of the hex digest as an integer.
        return int(digest[start:stop], 16)

    hue = _hex_field(0, 3) % 360  # 0-359 degrees
    saturation = 70 + _hex_field(3, 5) % 20  # 70-89 %
    lightness = 80 + _hex_field(5, 7) % 10  # 80-89 %, a light background
    text_lightness = 20  # dark text to contrast with the light background

    palette = {}
    palette["background"] = f"hsl({hue}, {saturation}%, {lightness}%)"
    palette["text"] = f"hsl({hue}, {saturation}%, {text_lightness}%)"
    return palette
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
 
364
+ return (get_varied_color,)
365
+
366
+
367
+ @app.function
368
def fix_token(token: str) -> str:
    """Return a display-friendly version of a raw tokenizer token.

    Space markers are shown as a middle dot and newline markers as a return
    arrow followed by a real line break:

    * SentencePiece ``U+2581``, byte-level BPE ``Ġ`` and literal spaces
      become ``·``;
    * BPE ``Ċ`` and the byte-fallback token ``<0x0A>`` become ``↵`` plus a
      newline.

    Every marker is replaced wherever it occurs, so a token that mixes
    spaces and newlines (for example ``ĠĊ``) is fully converted and no
    characters are dropped.
    """
    # Space markers -> middle dot.
    for space_marker in ("\u2581", "Ġ", " "):
        token = token.replace(space_marker, "·")
    # Newline markers -> visible symbol plus an actual line break.
    for newline_marker in ("Ċ", "<0x0A>"):
        token = token.replace(newline_marker, "↵\n")
    return token
383
+
384
+
385
+ @app.function
386
def get_tokenizer_info(tokenizer):
    """Extract descriptive metadata from a tokenizer object.

    Only attributes that exist on ``tokenizer`` are read, so any object that
    exposes some of ``vocab_size`` / ``get_vocab()``, ``model_max_length``,
    ``all_special_tokens`` and the ``*_token`` attributes is accepted.

    Returns:
        A dict that may contain:

        * ``"vocab_size"``: ``tokenizer.vocab_size``, or the length of
          ``tokenizer.get_vocab()`` when that attribute is missing.
        * ``"model_max_length"``: the value when it is below 1,000,000,
          otherwise the string ``"Not specified or very large"``.
        * ``"tokenizer_type"``: the tokenizer's class name.
        * ``"special_tokens"``: a mapping of role name to token string, or
          the string ``"None found"``. Tokens that match no known role are
          keyed ``unknown_special_token``, ``unknown_special_token_2``, ...
          so that none of them overwrites another.
        * ``"error"``: present only if extraction raised; the entries
          gathered before the failure are kept.
    """
    # Attribute names of the special-token roles that are looked up.
    named_token_attrs = (
        "pad_token",
        "eos_token",
        "bos_token",
        "sep_token",
        "cls_token",
        "unk_token",
        "mask_token",
    )

    info = {}
    try:
        # Vocabulary (dictionary) size.
        if hasattr(tokenizer, "vocab_size"):
            info["vocab_size"] = tokenizer.vocab_size
        elif hasattr(tokenizer, "get_vocab"):
            info["vocab_size"] = len(tokenizer.get_vocab())

        # Maximum length, ignoring implausibly large values.
        if (
            hasattr(tokenizer, "model_max_length")
            and tokenizer.model_max_length < 1000000
        ):
            info["model_max_length"] = tokenizer.model_max_length
        else:
            info["model_max_length"] = "Not specified or very large"

        info["tokenizer_type"] = tokenizer.__class__.__name__

        special_tokens = {}
        if hasattr(tokenizer, "all_special_tokens"):
            unnamed_count = 0
            for token in tokenizer.all_special_tokens:
                # Skip empty or whitespace-only entries.
                if not (token and str(token).strip()):
                    continue
                # Find the role attribute whose value is this token.
                token_name = next(
                    (
                        attr_name
                        for attr_name in named_token_attrs
                        if getattr(tokenizer, attr_name, None) == token
                    ),
                    None,
                )
                if token_name is None:
                    # Give every unnamed token its own key instead of letting
                    # later ones overwrite earlier ones.
                    unnamed_count += 1
                    token_name = (
                        "unknown_special_token"
                        if unnamed_count == 1
                        else f"unknown_special_token_{unnamed_count}"
                    )
                special_tokens[token_name] = str(token)
        else:
            # Fallback: check the individual role attributes.
            for token_name in named_token_attrs:
                token_value = getattr(tokenizer, token_name, None)
                if token_value and str(token_value).strip():
                    special_tokens[token_name] = str(token_value)

        info["special_tokens"] = special_tokens if special_tokens else "None found"

    except Exception as e:
        # Best effort: report the problem instead of failing the notebook.
        info["error"] = f"Error extracting tokenizer info: {str(e)}"

    return info
462
 
 
463
 
464
@app.cell
def _(mo):
    # Toggle read by the tokenizer visualisation cell: when on, each token is
    # shown as its numeric id instead of its text.
    show_ids_switch = mo.ui.switch(label="Show Token IDs instead of Text", value=False)
    return (show_ids_switch,)
468
 
 
 
469
 
470
@app.cell
def _(
    current_text,
    get_token_stats,
    get_varied_color,
    llm_tokenizer_selector,
    mo,
    show_ids_switch,
    tokenizer,
):
    # Tokenize the current text with the selected LLM tokenizer, render the
    # tokens as coloured chips and summarise them with statistics.

    # Cell-private alias (leading underscore), so no new notebook global is
    # introduced. Needed because token text is inserted into raw HTML.
    from html import escape as _escape

    # --- Tokenization and data preparation ---

    # tokenize() gives the string form of each token for analysis and display.
    all_tokens = tokenizer.tokenize(current_text)
    total_token_count = len(all_tokens)

    # Limit the number of tokens displayed to avoid browser slowdown.
    display_limit = 1000
    display_tokens = all_tokens[:display_limit]
    display_limit_reached = total_token_count > display_limit

    llm_token_data = []
    for idx, token in enumerate(display_tokens):
        fixed_token_display = fix_token(token)  # Apply fixes for display
        # Fall back to the UNK id (or -1) if the token cannot be converted.
        try:
            token_id = tokenizer.convert_tokens_to_ids(token)
        except KeyError:
            token_id = getattr(tokenizer, "unk_token_id", -1)

        llm_token_data.append(
            {
                "original": token,
                "display": fixed_token_display,
                "colors": get_varied_color(token),
                # The return arrow marks a token that represents a newline.
                "is_newline": "↵" in fixed_token_display,
                "token_id": token_id,
                "token_index": idx,
            }
        )

    # Statistics use the full token list, not just the displayed prefix.
    token_stats = get_token_stats(all_tokens, current_text)

    # --- HTML for the coloured tokens ---
    html_parts = []
    for item in llm_token_data:
        # pre-wrap respects spaces and newlines within the token display.
        style = f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
        # Hover text (original token + id). It is escaped so that tokens
        # containing quotes or angle brackets cannot break the attribute.
        title = _escape(
            f"Original: {item['original']}\nID: {item['token_id']}", quote=True
        )
        # Token text is escaped too; otherwise a token such as "<s>" would be
        # parsed as markup and vanish from the page.
        display_content = (
            str(item["token_id"])
            if show_ids_switch.value
            else _escape(item["display"])
        )
        html_parts.append(
            f'<span style="{style}" title="{title}">{display_content}</span>'
        )

    token_viz_html = mo.Html(f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>')

    # Tell the reader when only a prefix of the tokens is shown.
    truncation_note = (
        f"_Showing the first {display_limit} of {total_token_count} tokens._"
        if display_limit_reached
        else ""
    )

    basic_stats = token_stats['basic_stats']
    length_stats = token_stats['length_stats']

    basic_stats_md = "**Basic Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in basic_stats.items()
    )

    length_stats_md = "**Length (Character) Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in length_stats.items()
    )

    # Joining the sections with blank lines keeps every markdown line at
    # column 0, regardless of how this cell's source is indented.
    sections = [
        f"# LLM tokenizer: {llm_tokenizer_selector.value}",
        f"{show_ids_switch}",
        "## Tokenizer output",
        f"{mo.as_html(token_viz_html)}",
        truncation_note,
        "## Token Statistics",
        basic_stats_md,
        length_stats_md,
    ]
    mo.md("\n\n".join(section for section in sections if section))
    return
567
 
568
 
569
@app.cell
def _():
    # Intentionally empty cell: it defines nothing and renders nothing.
    return
572
 
573
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
574
# Run the notebook's cells when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()
pyproject.toml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "counting-words"
3
+ version = "0.1.0"
4
+ description = "Counting words in English and Japanese texts demo"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "marimo>=0.13.0",
9
+ "polars>=1.27.1",
10
+ "altair>=5.5.0",
11
+ "spacy>=3.8.5",
12
+ "en-core-web-md",
13
+ "ja-core-news-md",
14
+ "transformers>=4.51.3",
15
+ ]
16
+
17
+ [tool.uv.sources]
18
+ en-core-web-md = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl" }
19
+ ja-core-news-md = { url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.8.0/ja_core_news_md-3.8.0-py3-none-any.whl" }
requirements.txt DELETED
@@ -1,5 +0,0 @@
1
- marimo
2
- # Or a specific version
3
- # marimo>=0.9.0
4
-
5
- # Add other dependencies as needed