Bor Hodošček committed on
fix: bpe byte display and misc display tweaks
app.py CHANGED
Original version (removed lines marked "-"; the content of some removed lines was not preserved):

@@ -13,30 +13,42 @@ app = marimo.App(width="medium")
def _():
    import hashlib
    import math

    import altair as alt
    import marimo as mo
    import polars as pl
    import spacy
-

    # Load spaCy models for English and Japanese
-    nlp_en = spacy.load("en_core_web_md")
-    nlp_ja = spacy.load("ja_core_news_md")

    # List of tokenizer models
-    llm_model_choices = [
-        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "google/gemma-3-27b-it",
-        "
-        "
-        "
        "google-bert/bert-large-uncased",
-        "openai-community/gpt2",
    ]
-
    return (
        AutoTokenizer,
        alt,
        hashlib,
        llm_model_choices,
@@ -45,18 +57,23 @@ def _():
        nlp_en,
        nlp_ja,
        pl,
    )


@app.cell
def _(mo):
-    mo.md("# Tokenization for English and Japanese")
    return


@app.cell
-def _(mo):
    # Central state for the text input content
    get_text_content, set_text_content = mo.state("")
    return get_text_content, set_text_content

@@ -73,7 +90,7 @@ def _(mo):
    """.strip()

    # Create UI element for language selection
-    language_selector = mo.ui.radio(
        options=["English", "Japanese"], value="English", label="Language"
    )

@@ -91,29 +108,30 @@ def _(
    set_text_content,
):
    # Define text_input dynamically based on language
-    current_placeholder = (
        en_placeholder if language_selector.value == "English" else ja_placeholder
    )
-    text_input = mo.ui.text_area(
-        # Read value from state
        value=get_text_content(),
        label="Enter text",
        placeholder=current_placeholder,
        full_width=True,
-        # Update state on user input
        on_change=lambda v: set_text_content(v),
    )
    return current_placeholder, text_input


@app.cell
-def _(current_placeholder, mo, set_text_content):
-
        set_text_content(current_placeholder)

-    apply_placeholder_button = mo.ui.button(
        label="Use Placeholder Text", on_click=lambda _: apply_placeholder()
    )
    return (apply_placeholder_button,)

@@ -129,37 +147,41 @@ def _(apply_placeholder_button, language_selector, mo, text_input):


@app.cell
-def _(get_text_content, language_selector, mo, nlp_en, nlp_ja):
    # Analyze text using spaCy based on selected language
-
-
    if language_selector.value == "English":
        doc = nlp_en(current_text)
    else:
        doc = nlp_ja(current_text)

-
-
-    token_count = len(tokenized_text)

    mo.md(
-        f"**Tokenized Text:** {' | '.join(tokenized_text)}\n\n**Token Count:** {token_count}"
    )
    return current_text, doc


@app.cell
def _(doc, mo, pl):
-
-    token_data = pl.DataFrame(
        {
            "Token": [token.text for token in doc],
            "Lemma": [token.lemma_ for token in doc],
            "POS": [token.pos_ for token in doc],
            "Tag": [token.tag_ for token in doc],
-            "Morph": [
-
-
            "Token Position": list(range(len(doc))),
            "Sentence Number": [
                i for i, sent in enumerate(doc.sents) for token in sent
@@ -173,9 +195,8 @@ def _(doc, mo, pl):

@app.cell
def _(mo):
-
-
-        options=["POS", "Tag", "Lemma", "Token", "Morph"],
        value="POS",
        label="Select column to visualize",
    )
@@ -185,18 +206,18 @@ def _(mo):


@app.cell
-def _(alt, column_selector, mo, token_data):
    mo.stop(token_data.is_empty(), "Please set input text.")

-    selected_column = column_selector.value
    # Calculate value counts for the selected column
-    counts_df = (
        token_data[selected_column]
        .value_counts()
        .sort(by=["count", selected_column], descending=[True, False])
    )

-    chart = (
        alt.Chart(counts_df)
        .mark_bar()
        .encode(
@@ -213,10 +234,9 @@ def _(alt, column_selector, mo, token_data):

@app.cell
def _(llm_model_choices, mo):
-
-    llm_tokenizer_selector = mo.ui.dropdown(
        options=llm_model_choices,
-        value=llm_model_choices[
        label="Select LLM Tokenizer Model",
    )
    llm_tokenizer_selector
@@ -224,101 +244,92 @@ def _(llm_model_choices, mo):


@app.cell
-def _(AutoTokenizer, llm_tokenizer_selector):
-    # Load the selected tokenizer
    # Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
-
-
-
-
    return (tokenizer,)


@app.cell
-def _(math):
-
-
        """Calculate enhanced statistics about the tokens."""
        if not tokens:
-
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
-                    "compression_ratio": 0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
-                    "unique_percentage": 0,
                },
                "length_stats": {
-                    "avg_length": 0,
-                    "std_dev": 0,
                    "min_length": 0,
                    "max_length": 0,
-                    "median_length": 0,
                },
            }

-        total_tokens = len(tokens)
-        unique_tokens = len(set(tokens))
-
-
-            sum(len(t) for t in tokens) / total_tokens if total_tokens > 0 else 0
        )
-
-
-
-        # Token type analysis (Note: Heuristics might vary between tokenizers)
-        # Using startswith(('Ġ', '▁')) covers common space markers like SentencePiece's U+2581 and BPE's 'Ġ'
-        space_tokens = sum(1 for t in tokens if t.startswith(("Ġ", "▁")))
-        # Check for common newline representations
-        newline_tokens = sum(
            1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
        )
-
-        special_tokens = sum(
            1
            for t in tokens
            if (t.startswith("<") and t.endswith(">"))
            or (t.startswith("[") and t.endswith("]"))
        )
-
-        punctuation_tokens = sum(
            1
            for t in tokens
            if len(t) == 1 and not t.isalnum() and t not in [" ", "\n", "Ġ", "Ċ"]
        )

-
-        lengths = [len(t) for t in tokens]
        if not lengths:  # Should not happen if tokens is not empty, but safe check
-            return {
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
-                    "compression_ratio": 0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
-                    "unique_percentage": 0,
                },
                "length_stats": {
-                    "avg_length": 0,
-                    "std_dev": 0,
                    "min_length": 0,
                    "max_length": 0,
-                    "median_length": 0,
                },
            }

-        mean_length = sum(lengths) / len(lengths)
-        variance = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
-        std_dev = math.sqrt(variance)
-        sorted_lengths = sorted(lengths)
-
-        median_length = sorted_lengths[len(lengths) // 2] if lengths else 0

        return {
            "basic_stats": {
@@ -331,13 +342,13 @@ def _(math):
                "punctuation_tokens": punctuation_tokens,
                "unique_percentage": round(unique_tokens / total_tokens * 100, 1)
                if total_tokens > 0
-                else 0,
            },
            "length_stats": {
-                "avg_length": round(
                "std_dev": round(std_dev, 2),
-                "min_length": min(lengths)
-                "max_length": max(lengths)
                "median_length": median_length,
            },
        }
@@ -347,17 +358,13 @@ def _(math):

@app.cell
def _(hashlib):
-    def get_varied_color(token: str) -> dict:
        """Generate vibrant colors with HSL for better visual distinction."""
-
-
-
-
-
-            int(token_hash[5:7], 16) % 10
-        )  # Lightness between 80-90% (light background)
-        # Ensure text color contrasts well with the light background
-        text_lightness = 20  # Dark text for light background

        return {
            "background": f"hsl({hue}, {saturation}%, {lightness}%)",
@@ -368,76 +375,67 @@ def _(hashlib):


@app.function
-def fix_token(
-
-
    token = token.replace("▁", "·")
    # Replace BPE space marker 'Ġ' with a middle dot
    if token.startswith("Ġ"):
        space_count = token.count("Ġ")
        return "·" * space_count + token[space_count:]
    # Replace newline markers for display
-    token = token.replace(
-
-
-
    return token


-@app.
-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Prefer all_special_tokens if available
-    if hasattr(tokenizer, "all_special_tokens"):
-        for token in tokenizer.all_special_tokens:
-            # Try to find the attribute name corresponding to the token value
-            token_name = "unknown_special_token"  # Default name
-            for attr_name in [
-                "pad_token",
-                "eos_token",
-                "bos_token",
-                "sep_token",
-                "cls_token",
-                "unk_token",
-                "mask_token",
-            ]:
-                if (
-                    hasattr(tokenizer, attr_name)
-                    and getattr(tokenizer, attr_name) == token
-                ):
-                    token_name = attr_name
-                    break
-            if token and str(token).strip():
-                special_tokens[token_name] = str(token)
-    else:
-        # Fallback to checking individual attributes
-        for token_name in [
                "pad_token",
                "eos_token",
                "bos_token",
@@ -445,129 +443,259 @@ def get_tokenizer_info(tokenizer):
                "cls_token",
                "unk_token",
                "mask_token",
-            ]
-
-
-
-
                    token_value = getattr(tokenizer, token_name)
-                    if
                        special_tokens[token_name] = str(token_value)

-

-
-

-


@app.cell
def _(mo):
-    show_ids_switch = mo.ui.switch(
    return (show_ids_switch,)


@app.cell
def _(
    current_text,
    get_token_stats,
    get_varied_color,
    llm_tokenizer_selector,
    mo,
    show_ids_switch,
    tokenizer,
):
-    #

    # Get tokenizer metadata
-    tokenizer_info = get_tokenizer_info(tokenizer)

-    #
-    # Use tokenize to get string representations for analysis and display
-    all_tokens = tokenizer.tokenize(current_text)
-    total_token_count = len(all_tokens)

-    # Limit the number of tokens for display
-    display_limit = 1000
-
-

    # Generate data for visualization
-
-
-
-
-
-
-
-
-
-
-

        llm_token_data.append(
            {
-                "original":
-                "display": fixed_token_display,
                "colors": colors,
-                "is_newline": "↵"
-                in fixed_token_display,  # Check if it represents a newline
                "token_id": token_id,
                "token_index": idx,
            }
        )

-    # Calculate statistics using the
-
-
-
-
-
-
-
-
-
-
-
-
-
-

-    token_viz_html = mo.Html(
        f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>'
    )

-
-

-
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in basic_stats.items()
    )

-    length_stats_md = "**Length (Character) Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in length_stats.items()
    )

    mo.md(f"""# LLM tokenizer: {llm_tokenizer_selector.value}

{show_ids_switch}

## Tokenizer output
-
{mo.as_html(token_viz_html)}

## Token Statistics

{basic_stats_md}

{length_stats_md}

""")
-


@app.cell
Updated version (added lines marked "+"):

@@ -13,30 +13,42 @@ app = marimo.App(width="medium")
def _():
    import hashlib
    import math
+    import re
+    from typing import Any, Callable, Optional, Union

    import altair as alt
    import marimo as mo
    import polars as pl
    import spacy
+    import spacy.language
+    from transformers import (
+        AutoTokenizer,
+        PreTrainedTokenizerBase,
+    )

    # Load spaCy models for English and Japanese
+    nlp_en: spacy.language.Language = spacy.load("en_core_web_md")
+    nlp_ja: spacy.language.Language = spacy.load("ja_core_news_md")

    # List of tokenizer models
+    llm_model_choices: list[str] = [
+        # "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "google/gemma-3-27b-it",
+        "ibm-granite/granite-3.3-8b-instruct",
+        "shisa-ai/shisa-v2-qwen2.5-7b",
+        # "deepseek-ai/DeepSeek-R1",
+        # "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
+        # "Qwen/Qwen2.5-72B-Instruct",
+        # "openai-community/gpt2",
        "google-bert/bert-large-uncased",
    ]
    return (
+        Any,
        AutoTokenizer,
+        Callable,
+        Optional,
+        PreTrainedTokenizerBase,
+        Union,
        alt,
        hashlib,
        llm_model_choices,
@@ -45,18 +57,23 @@ def _():
        nlp_en,
        nlp_ja,
        pl,
+        re,
+        spacy,
    )


@app.cell
def _(mo):
+    mo.md("""# Tokenization for English and Japanese""")
    return


@app.cell
+def _(Callable, mo):
    # Central state for the text input content
+    # Type the getter and setter
+    get_text_content: Callable[[], str]
+    set_text_content: Callable[[str], None]
    get_text_content, set_text_content = mo.state("")
    return get_text_content, set_text_content

@@ -73,7 +90,7 @@ def _(mo):
    """.strip()

    # Create UI element for language selection
+    language_selector: mo.ui.radio = mo.ui.radio(
        options=["English", "Japanese"], value="English", label="Language"
    )

@@ -91,29 +108,30 @@ def _(
    set_text_content,
):
    # Define text_input dynamically based on language
+    current_placeholder: str = (
        en_placeholder if language_selector.value == "English" else ja_placeholder
    )
+    text_input: mo.ui.text_area = mo.ui.text_area(
        value=get_text_content(),
        label="Enter text",
        placeholder=current_placeholder,
        full_width=True,
        on_change=lambda v: set_text_content(v),
    )
+    # Type the return tuple
    return current_placeholder, text_input


@app.cell
+def _(Callable, current_placeholder, mo, set_text_content):
+    # Type the inner function
+    def apply_placeholder() -> None:
        set_text_content(current_placeholder)

+    apply_placeholder_button: mo.ui.button = mo.ui.button(
        label="Use Placeholder Text", on_click=lambda _: apply_placeholder()
    )
+    # Type the return tuple
    return (apply_placeholder_button,)

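The two cells above wire a text area and a button to one shared piece of state. As a rough standalone sketch of that pattern (assuming only that marimo is installed; the placeholder string below is made up for illustration):

import marimo as mo

get_text, set_text = mo.state("")  # shared state: a getter and a setter

text_area = mo.ui.text_area(
    value=get_text(),                 # read the current state
    on_change=lambda v: set_text(v),  # write the state on user input
)
button = mo.ui.button(
    label="Use Placeholder Text",
    on_click=lambda _: set_text("Example placeholder text"),  # hypothetical value
)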
@@ -129,37 +147,41 @@ def _(apply_placeholder_button, language_selector, mo, text_input):


@app.cell
+def _(get_text_content, language_selector, mo, nlp_en, nlp_ja, spacy):
    # Analyze text using spaCy based on selected language
+    current_text: str = get_text_content()
+    doc: spacy.tokens.Doc
    if language_selector.value == "English":
        doc = nlp_en(current_text)
    else:
        doc = nlp_ja(current_text)
+    model_name: str = (
+        nlp_en.meta["name"]
+        if language_selector.value == "English"
+        else nlp_ja.meta["name"]
+    )

+    tokenized_text: list[str] = [token.text for token in doc]
+    token_count: int = len(tokenized_text)

    mo.md(
+        f"**Tokenized Text using spaCy {'en_' if language_selector.value == 'English' else 'ja_'}{model_name}:** {' | '.join(tokenized_text)}\n\n**Token Count:** {token_count}"
    )
    return current_text, doc


@app.cell
def _(doc, mo, pl):
+    token_data: pl.DataFrame = pl.DataFrame(
        {
            "Token": [token.text for token in doc],
            "Lemma": [token.lemma_ for token in doc],
            "POS": [token.pos_ for token in doc],
            "Tag": [token.tag_ for token in doc],
+            "Morph": [str(token.morph) for token in doc],
+            "OOV": [
+                token.is_oov for token in doc
+            ],  # FIXME: How to get .is_oov() from sudachi directly? This only works for English now...
            "Token Position": list(range(len(doc))),
            "Sentence Number": [
                i for i, sent in enumerate(doc.sents) for token in sent
@@ -173,9 +195,8 @@ def _(doc, mo, pl):

@app.cell
def _(mo):
+    column_selector: mo.ui.dropdown = mo.ui.dropdown(
+        options=["POS", "Tag", "Lemma", "Token", "Morph", "OOV"],
        value="POS",
        label="Select column to visualize",
    )
@@ -185,18 +206,18 @@ def _(mo):


@app.cell
+def _(alt, column_selector, mo, pl, token_data):
    mo.stop(token_data.is_empty(), "Please set input text.")

+    selected_column: str = column_selector.value
    # Calculate value counts for the selected column
+    counts_df: pl.DataFrame = (
        token_data[selected_column]
        .value_counts()
        .sort(by=["count", selected_column], descending=[True, False])
    )

+    chart: alt.Chart = (
        alt.Chart(counts_df)
        .mark_bar()
        .encode(
@@ -213,10 +234,9 @@ def _(alt, column_selector, mo, token_data):

@app.cell
def _(llm_model_choices, mo):
+    llm_tokenizer_selector: mo.ui.dropdown = mo.ui.dropdown(
        options=llm_model_choices,
+        value=llm_model_choices[0],
        label="Select LLM Tokenizer Model",
    )
    llm_tokenizer_selector
@@ -224,101 +244,92 @@ def _(llm_model_choices, mo):


@app.cell
+def _(AutoTokenizer, PreTrainedTokenizerBase, llm_tokenizer_selector):
    # Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
+    selected_model_name: str = llm_tokenizer_selector.value
+    tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(
+        selected_model_name
+    )
    return (tokenizer,)

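For reference, a minimal sketch of what this cell does with the standard transformers API, runnable outside the notebook (the tokenizer files are downloaded on first use; the model name is one of the entries in llm_model_choices):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("google-bert/bert-large-uncased")
ids = tok.encode("Tokenization for English and Japanese", add_special_tokens=False)
print(tok.convert_ids_to_tokens(ids))  # subword strings, e.g. ['token', '##ization', ...]
print(tok.decode(ids))                 # decodes the IDs back to text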
@app.cell
+def _(Union, math):
+    TokenStatsDict = dict[str, dict[str, Union[int, float]]]
+
+    def get_token_stats(tokens: list[str], original_text: str) -> TokenStatsDict:
        """Calculate enhanced statistics about the tokens."""
        if not tokens:
+            # Return default structure matching TokenStatsDict
+            return {
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
+                    "compression_ratio": 0.0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
+                    "unique_percentage": 0.0,
                },
                "length_stats": {
+                    "avg_length": 0.0,
+                    "std_dev": 0.0,
                    "min_length": 0,
                    "max_length": 0,
+                    "median_length": 0.0,
                },
            }

+        total_tokens: int = len(tokens)
+        unique_tokens: int = len(set(tokens))
+        compression_ratio: float = (
+            len(original_text) / total_tokens if total_tokens > 0 else 0.0
        )
+
+        space_tokens: int = sum(1 for t in tokens if t.startswith(("Ġ", "▁")))
+        newline_tokens: int = sum(
            1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
        )
+        special_tokens: int = sum(
            1
            for t in tokens
            if (t.startswith("<") and t.endswith(">"))
            or (t.startswith("[") and t.endswith("]"))
        )
+        punctuation_tokens: int = sum(
            1
            for t in tokens
            if len(t) == 1 and not t.isalnum() and t not in [" ", "\n", "Ġ", "Ċ"]
        )

+        lengths: list[int] = [len(t) for t in tokens]
        if not lengths:  # Should not happen if tokens is not empty, but safe check
+            return {  # Return default structure matching TokenStatsDict
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
+                    "compression_ratio": 0.0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
+                    "unique_percentage": 0.0,
                },
                "length_stats": {
+                    "avg_length": 0.0,
+                    "std_dev": 0.0,
                    "min_length": 0,
                    "max_length": 0,
+                    "median_length": 0.0,
                },
            }

+        mean_length: float = sum(lengths) / len(lengths)
+        variance: float = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
+        std_dev: float = math.sqrt(variance)
+        sorted_lengths: list[int] = sorted(lengths)
+        median_length: float = float(sorted_lengths[len(lengths) // 2])

        return {
            "basic_stats": {
@@ -331,13 +342,13 @@ def _(math):
                "punctuation_tokens": punctuation_tokens,
                "unique_percentage": round(unique_tokens / total_tokens * 100, 1)
                if total_tokens > 0
+                else 0.0,
            },
            "length_stats": {
+                "avg_length": round(mean_length, 2),
                "std_dev": round(std_dev, 2),
+                "min_length": min(lengths),
+                "max_length": max(lengths),
                "median_length": median_length,
            },
        }
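A small usage sketch for get_token_stats as defined above (assuming the function is in scope; the token list is hand-written in GPT-2 style, where 'Ġ' marks a leading space):

tokens = ["Hello", ",", "Ġworld", "Ċ", "Ġ!"]
stats = get_token_stats(tokens, "Hello, world\n !")
print(stats["basic_stats"]["total_tokens"])    # 5
print(stats["basic_stats"]["space_tokens"])    # 2 ('Ġworld' and 'Ġ!')
print(stats["basic_stats"]["newline_tokens"])  # 1 ('Ċ')
print(stats["length_stats"]["avg_length"])     # 3.0 (15 characters over 5 tokens)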
@@ -347,17 +358,13 @@ def _(math):

@app.cell
def _(hashlib):
+    def get_varied_color(token: str) -> dict[str, str]:
        """Generate vibrant colors with HSL for better visual distinction."""
+        token_hash: str = hashlib.md5(token.encode()).hexdigest()
+        hue: int = int(token_hash[:3], 16) % 360
+        saturation: int = 70 + (int(token_hash[3:5], 16) % 20)
+        lightness: int = 80 + (int(token_hash[5:7], 16) % 10)
+        text_lightness: int = 20

        return {
            "background": f"hsl({hue}, {saturation}%, {lightness}%)",
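The cell above derives the colour from an MD5 digest of the token, so the mapping is deterministic: the same token always gets the same hue, and different tokens spread over the colour wheel. A stripped-down sketch of the same idea (not the app's exact palette):

import hashlib

def token_hue(token: str) -> int:
    digest = hashlib.md5(token.encode()).hexdigest()
    return int(digest[:3], 16) % 360  # first 12 bits of the digest -> hue 0..359

print(token_hue("Ġworld"))  # same value on every run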
@@ -368,76 +375,67 @@ def _(hashlib):


@app.function
+def fix_token(
+    token: str, re
+) -> (
+    str
+):  # re module type is complex, leave as Any implicitly or import types.ModuleType
+    """Fix token for display, handling byte fallbacks and spaces."""
+    # Check for byte fallback pattern <0xHH> using a full match
+    byte_match = re.fullmatch(r"<0x([0-9A-Fa-f]{2})>", token)
+    if byte_match:
+        hex_value = byte_match.group(1).upper()
+        # Return a clear representation indicating it's a byte
+        return f"<0x{hex_value}>"
+
+    # Replace SentencePiece space marker U+2581 ('▁') with a middle dot
    token = token.replace("▁", "·")
+
    # Replace BPE space marker 'Ġ' with a middle dot
    if token.startswith("Ġ"):
        space_count = token.count("Ġ")
+        # Ensure we only replace the leading 'Ġ' markers
        return "·" * space_count + token[space_count:]
+
    # Replace newline markers for display
+    token = token.replace("Ċ", "↵\n")
+    # Handle byte representation of newline AFTER general byte check
+    # This specific check might become redundant if <0x0A> is caught by the byte_match above
+    # Keep it for now as a fallback.
+    token = token.replace("<0x0A>", "↵\n")
+
    return token


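A few example calls to fix_token as defined above (assuming it is in scope; note that the app passes the re module in as the second argument):

import re

print(fix_token("<0x0A>", re))   # byte-fallback token: matched by the <0xHH> pattern
print(fix_token("ĠHello", re))   # BPE space marker -> '·Hello'
print(fix_token("▁world", re))   # SentencePiece space marker -> '·world'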
+@app.cell
+def _(Any, PreTrainedTokenizerBase):
+    def get_tokenizer_info(
+        tokenizer: PreTrainedTokenizerBase,
+    ) -> dict[str, Any]:
+        """
+        Extract useful information from a tokenizer.
+        Returns a dictionary with tokenizer details.
+        """
+        info: dict[str, Any] = {}
+        try:
+            if hasattr(tokenizer, "vocab_size"):
+                info["vocab_size"] = tokenizer.vocab_size
+            elif hasattr(tokenizer, "get_vocab"):
+                info["vocab_size"] = len(tokenizer.get_vocab())
+
+            if (
+                hasattr(tokenizer, "model_max_length")
+                and isinstance(tokenizer.model_max_length, int)
+                and tokenizer.model_max_length < 1000000
+            ):
+                info["model_max_length"] = tokenizer.model_max_length
+            else:
+                info["model_max_length"] = "Not specified or very large"
+
+            info["tokenizer_type"] = tokenizer.__class__.__name__
+
+            special_tokens: dict[str, str] = {}
+            special_token_attributes: list[str] = [
                "pad_token",
                "eos_token",
                "bos_token",
|
443 |
"cls_token",
|
444 |
"unk_token",
|
445 |
"mask_token",
|
446 |
+
]
|
447 |
+
|
448 |
+
processed_tokens: set[str] = (
|
449 |
+
set()
|
450 |
+
) # Keep track of processed tokens to avoid duplicates
|
451 |
+
|
452 |
+
# Prefer all_special_tokens if available
|
453 |
+
if hasattr(tokenizer, "all_special_tokens"):
|
454 |
+
for token_value in tokenizer.all_special_tokens:
|
455 |
+
if (
|
456 |
+
not token_value
|
457 |
+
or not str(token_value).strip()
|
458 |
+
or str(token_value) in processed_tokens
|
459 |
+
):
|
460 |
+
continue
|
461 |
+
|
462 |
+
token_name = "special_token" # Default name
|
463 |
+
# Find the attribute name corresponding to the token value
|
464 |
+
for attr_name in special_token_attributes:
|
465 |
+
if (
|
466 |
+
hasattr(tokenizer, attr_name)
|
467 |
+
and getattr(tokenizer, attr_name) == token_value
|
468 |
+
):
|
469 |
+
token_name = attr_name
|
470 |
+
break
|
471 |
+
special_tokens[token_name] = str(token_value)
|
472 |
+
processed_tokens.add(str(token_value))
|
473 |
+
|
474 |
+
# Fallback/Augment with individual attributes if not covered by all_special_tokens
|
475 |
+
for token_name in special_token_attributes:
|
476 |
+
if hasattr(tokenizer, token_name):
|
477 |
token_value = getattr(tokenizer, token_name)
|
478 |
+
if (
|
479 |
+
token_value
|
480 |
+
and str(token_value).strip()
|
481 |
+
and str(token_value) not in processed_tokens
|
482 |
+
):
|
483 |
special_tokens[token_name] = str(token_value)
|
484 |
+
processed_tokens.add(str(token_value))
|
485 |
|
486 |
+
info["special_tokens"] = special_tokens if special_tokens else "None found"
|
487 |
|
488 |
+
except Exception as e:
|
489 |
+
info["error"] = f"Error extracting tokenizer info: {str(e)}"
|
490 |
|
491 |
+
return info
|
492 |
+
|
493 |
+
return (get_tokenizer_info,)
|
494 |
|
495 |
|
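A rough usage sketch for get_tokenizer_info from the cell above (assuming it is in scope and that the tokenizer files can be downloaded; gpt2 is used here only as a small, well-known example):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
info = get_tokenizer_info(tok)
print(info["tokenizer_type"])   # e.g. 'GPT2TokenizerFast'
print(info["vocab_size"])       # 50257 for GPT-2
print(info["special_tokens"])   # e.g. {'eos_token': '<|endoftext|>'}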
@app.cell
def _(mo):
+    show_ids_switch: mo.ui.switch = mo.ui.switch(
+        label="Show token IDs instead of text", value=False
+    )
    return (show_ids_switch,)

@app.cell
def _(
+    Any,
+    Optional,
+    Union,
    current_text,
+    fix_token,
    get_token_stats,
+    get_tokenizer_info,
    get_varied_color,
    llm_tokenizer_selector,
    mo,
+    re,
    show_ids_switch,
    tokenizer,
):
+    # Define the Unicode replacement character
+    REPLACEMENT_CHARACTER = "\ufffd"

    # Get tokenizer metadata
+    tokenizer_info: dict[str, Any] = get_tokenizer_info(tokenizer)
+
+    # 1. Encode text to get token IDs first.
+    token_ids: list[int] = tokenizer.encode(current_text, add_special_tokens=False)
+    # 2. Decode each token ID individually.
+    # We will check for REPLACEMENT_CHARACTER later.
+    all_decoded_tokens: list[str] = [
+        tokenizer.decode(
+            [token_id], skip_special_tokens=False, clean_up_tokenization_spaces=False
+        )
+        for token_id in token_ids
+    ]

+    total_token_count: int = len(token_ids)  # Count based on IDs

+    # Limit the number of tokens for display
+    display_limit: int = 1000
+    # Limit consistently using token IDs and the decoded tokens
+    display_token_ids: list[int] = token_ids[:display_limit]
+    display_decoded_tokens: list[str] = all_decoded_tokens[:display_limit]
+    display_limit_reached: bool = total_token_count > display_limit

    # Generate data for visualization
+    TokenVisData = dict[str, Union[str, int, bool, dict[str, str]]]
+    llm_token_data: list[TokenVisData] = []
+
+    # Use zip for parallel iteration
+    for idx, (token_id, token_str) in enumerate(
+        zip(display_token_ids, display_decoded_tokens)
+    ):
+        colors: dict[str, str] = get_varied_color(
+            token_str
+            if REPLACEMENT_CHARACTER not in token_str
+            else f"invalid_{token_id}"
+        )  # Color based on string or ID if invalid
+
+        is_invalid_utf8 = REPLACEMENT_CHARACTER in token_str
+        fixed_token_display: str
+        original_for_title: str = (
+            token_str  # Store the potentially problematic string for title
+        )
+
+        if is_invalid_utf8:
+            # If decode failed, show a representation with the hex ID
+            fixed_token_display = f"<0x{token_id:X}>"
+        else:
+            # If decode succeeded, apply standard fixes
+            fixed_token_display = fix_token(token_str, re)

        llm_token_data.append(
            {
+                "original": original_for_title,  # Store the raw decoded string (might contain �)
+                "display": fixed_token_display,  # Store the cleaned/invalid representation
                "colors": colors,
+                "is_newline": "↵" in fixed_token_display,  # Check the display version
                "token_id": token_id,
                "token_index": idx,
+                "is_invalid": is_invalid_utf8,  # Add flag for potential styling/title changes
            }
        )

+    # Calculate statistics using the list of *successfully* decoded token strings
+    # We might want to reconsider what `all_tokens` means for stats if many are invalid.
+    # For now, let's use the potentially problematic strings, as stats are mostly length/count based.
+    token_stats: dict[str, dict[str, Union[int, float]]] = get_token_stats(
+        all_decoded_tokens,
+        current_text,  # Pass the full list from decode()
+    )
+
+    # Construct HTML for colored tokens using list comprehension (functional style)
+    html_parts: list[str] = [
+        (
+            lambda item: (
+                style
+                := f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
+                # Add specific style for invalid tokens if needed
+                + (" border: 1px solid red;" if item.get("is_invalid") else ""),
+                # Modify title based on validity
+                title := (
+                    f"Original: {item['original']}\nID: {item['token_id']}"
+                    + ("\n(Invalid UTF-8)" if item.get("is_invalid") else "")
+                    + ("\n(Byte Token)" if item["display"].startswith("byte[") else "")
+                ),
+                display_content := str(item["token_id"])
+                if show_ids_switch.value
+                else item["display"],
+                f'<span style="{style}" title="{title}">{display_content}</span>',
+            )[-1]  # Get the last element (the formatted string) from the lambda's tuple
+        )(item)
+        for item in llm_token_data
+    ]

+    token_viz_html: mo.Html = mo.Html(
        f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>'
    )

+    # Optional: Add a warning if the display limit was reached
+    limit_warning: Optional[mo.md] = None  # Use Optional type
+    if display_limit_reached:
+        limit_warning = mo.md(f"""**Warning:** Displaying only the first {display_limit:,} tokens out of {total_token_count:,}.
+    Statistics are calculated on the full text.""").callout(kind="warn")

+    # Use dict access safely with .get() for stats
+    basic_stats: dict[str, Union[int, float]] = token_stats.get("basic_stats", {})
+    length_stats: dict[str, Union[int, float]] = token_stats.get("length_stats", {})
+
+    # Use list comprehensions for markdown generation (functional style)
+    basic_stats_md: str = "**Basic Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in basic_stats.items()
    )

+    length_stats_md: str = "**Length (Character) Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in length_stats.items()
    )

+    # Build tokenizer info markdown parts
+    tokenizer_info_md_parts: list[str] = [
+        f"**Tokenizer Type:** `{tokenizer_info.get('tokenizer_type', 'N/A')}`"
+    ]
+    if vocab_size := tokenizer_info.get("vocab_size"):
+        tokenizer_info_md_parts.append(f"**Vocab Size:** `{vocab_size:,}`")
+    if max_len := tokenizer_info.get("model_max_length"):
+        tokenizer_info_md_parts.append(f"**Model Max Length:** `{max_len}`")
+
+    special_tokens_info = tokenizer_info.get("special_tokens")
+    if isinstance(special_tokens_info, dict) and special_tokens_info:
+        tokenizer_info_md_parts.append("**Special Tokens:**")
+        tokenizer_info_md_parts.extend(
+            f"  - `{name}`: `{str(val)}`" for name, val in special_tokens_info.items()
+        )
+    elif isinstance(special_tokens_info, str):  # Handle "None found" case
+        tokenizer_info_md_parts.append(f"**Special Tokens:** `{special_tokens_info}`")
+
+    if error_info := tokenizer_info.get("error"):
+        tokenizer_info_md_parts.append(f"**Info Error:** `{error_info}`")
+
+    tokenizer_info_md: str = "\n\n".join(tokenizer_info_md_parts)
+
+    # Display the final markdown output
    mo.md(f"""# LLM tokenizer: {llm_tokenizer_selector.value}

+## Tokenizer Info
+{tokenizer_info_md}
+
{show_ids_switch}

## Tokenizer output
+{limit_warning if limit_warning else ""}
{mo.as_html(token_viz_html)}

## Token Statistics
+(Calculated on full text if truncated above)

{basic_stats_md}

{length_stats_md}

""")
+
+    return (
+        all_decoded_tokens,
+        token_ids,
+        basic_stats_md,
+        display_limit_reached,
+        length_stats_md,
+        limit_warning,
+        llm_token_data,
+        token_stats,
+        token_viz_html,
+        tokenizer_info,
+        tokenizer_info_md,
+        total_token_count,
+    )

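The byte-display fix in this cell hinges on decoding one token ID at a time: a single byte-level BPE token can cover only part of a multi-byte UTF-8 character, in which case decode() yields the Unicode replacement character U+FFFD and the cell shows an <0x..> placeholder built from the token ID instead. A minimal sketch of that check, using gpt2 as an assumed stand-in tokenizer:

from transformers import AutoTokenizer

REPLACEMENT_CHARACTER = "\ufffd"
tok = AutoTokenizer.from_pretrained("openai-community/gpt2")

for token_id in tok.encode("日本語", add_special_tokens=False):
    piece = tok.decode([token_id], skip_special_tokens=False, clean_up_tokenization_spaces=False)
    print(token_id, repr(piece), REPLACEMENT_CHARACTER in piece)  # True marks a partial UTF-8 sequence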
@app.cell