Spaces:
Running
Running
Bor Hodošček
committed on
feat: remove debug
Browse files
app.py
CHANGED
@@ -160,7 +160,9 @@ def _(doc, mo, pl):
|
|
160 |
str(token.morph) for token in doc
|
161 |
], # To be more precise, this should be merged back in via .to_dict()
|
162 |
"Token Position": list(range(len(doc))),
|
163 |
-
"Sentence Number": [
|
|
|
|
|
164 |
}
|
165 |
)
|
166 |
|
@@ -367,7 +369,6 @@ def _(hashlib):
|
|
367 |
@app.function
|
368 |
def fix_token(token: str) -> str:
|
369 |
"""Fix token for display with improved space visualization."""
|
370 |
-
print(token)
|
371 |
# Replace SentencePiece space marker U+2581 with a middle dot
|
372 |
token = token.replace(" ", "·")
|
373 |
# Replace BPE space marker 'Ġ' with a middle dot
|
@@ -388,7 +389,6 @@ def get_tokenizer_info(tokenizer):
|
|
388 |
Extract useful information from a tokenizer.
|
389 |
Returns a dictionary with tokenizer details.
|
390 |
"""
|
391 |
-
print(tokenizer)
|
392 |
|
393 |
info = {}
|
394 |
try:
|
@@ -485,7 +485,6 @@ def _(
|
|
485 |
# Tokenize the input text
|
486 |
# Use tokenize to get string representations for analysis and display
|
487 |
all_tokens = tokenizer.tokenize(current_text)
|
488 |
-
print(all_tokens)
|
489 |
total_token_count = len(all_tokens)
|
490 |
|
491 |
# Limit the number of tokens for display to avoid browser slowdown
|
@@ -528,15 +527,19 @@ def _(
|
|
528 |
style = f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
|
529 |
# Add title attribute for hover info (original token + ID)
|
530 |
title = f"Original: {item['original']}\nID: {item['token_id']}"
|
531 |
-
display_content =
|
|
|
|
|
532 |
html_parts.append(
|
533 |
f'<span style="{style}" title="{title}">{display_content}</span>'
|
534 |
)
|
535 |
|
536 |
-
token_viz_html = mo.Html(
|
|
|
|
|
537 |
|
538 |
-
basic_stats = token_stats[
|
539 |
-
length_stats = token_stats[
|
540 |
|
541 |
basic_stats_md = "**Basic Stats:**\n\n" + "\n".join(
|
542 |
f"- **{key.replace('_', ' ').title()}:** `{value}`"
|
|
|
160 |
str(token.morph) for token in doc
|
161 |
], # To be more precise, this should be merged back in via .to_dict()
|
162 |
"Token Position": list(range(len(doc))),
|
163 |
+
"Sentence Number": [
|
164 |
+
i for i, sent in enumerate(doc.sents) for token in sent
|
165 |
+
],
|
166 |
}
|
167 |
)
|
168 |
|
|
|
369 |
@app.function
|
370 |
def fix_token(token: str) -> str:
|
371 |
"""Fix token for display with improved space visualization."""
|
|
|
372 |
# Replace SentencePiece space marker U+2581 with a middle dot
|
373 |
token = token.replace(" ", "·")
|
374 |
# Replace BPE space marker 'Ġ' with a middle dot
|
|
|
389 |
Extract useful information from a tokenizer.
|
390 |
Returns a dictionary with tokenizer details.
|
391 |
"""
|
|
|
392 |
|
393 |
info = {}
|
394 |
try:
|
|
|
485 |
# Tokenize the input text
|
486 |
# Use tokenize to get string representations for analysis and display
|
487 |
all_tokens = tokenizer.tokenize(current_text)
|
|
|
488 |
total_token_count = len(all_tokens)
|
489 |
|
490 |
# Limit the number of tokens for display to avoid browser slowdown
|
|
|
527 |
style = f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
|
528 |
# Add title attribute for hover info (original token + ID)
|
529 |
title = f"Original: {item['original']}\nID: {item['token_id']}"
|
530 |
+
display_content = (
|
531 |
+
str(item["token_id"]) if show_ids_switch.value else item["display"]
|
532 |
+
)
|
533 |
html_parts.append(
|
534 |
f'<span style="{style}" title="{title}">{display_content}</span>'
|
535 |
)
|
536 |
|
537 |
+
token_viz_html = mo.Html(
|
538 |
+
f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>'
|
539 |
+
)
|
540 |
|
541 |
+
basic_stats = token_stats["basic_stats"]
|
542 |
+
length_stats = token_stats["length_stats"]
|
543 |
|
544 |
basic_stats_md = "**Basic Stats:**\n\n" + "\n".join(
|
545 |
f"- **{key.replace('_', ' ').title()}:** `{value}`"
|