Spaces:

bor
/

counting_words

Running

App Files Files Community

Bor Hodošček committed 7 days ago

Commit

d84cd62

unverified ·

1 Parent(s): 4d52104

feat: remove debug

Browse files

Files changed (1) hide show

app.py +11 -8

app.py CHANGED Viewed

@@ -160,7 +160,9 @@ def _(doc, mo, pl):
                 str(token.morph) for token in doc
             ],  # To be more precise, this should be merged back in via .to_dict()
             "Token Position": list(range(len(doc))),
-            "Sentence Number": [i for i, sent in enumerate(doc.sents) for token in sent],
         }
     )
@@ -367,7 +369,6 @@ def _(hashlib):
 @app.function
 def fix_token(token: str) -> str:
     """Fix token for display with improved space visualization."""
-    print(token)
     # Replace SentencePiece space marker U+2581 with a middle dot
     token = token.replace(" ", "·")
     # Replace BPE space marker 'Ġ' with a middle dot
@@ -388,7 +389,6 @@ def get_tokenizer_info(tokenizer):
     Extract useful information from a tokenizer.
     Returns a dictionary with tokenizer details.
     """
-    print(tokenizer)
     info = {}
     try:
@@ -485,7 +485,6 @@ def _(
     # Tokenize the input text
     # Use tokenize to get string representations for analysis and display
     all_tokens = tokenizer.tokenize(current_text)
-    print(all_tokens)
     total_token_count = len(all_tokens)
     # Limit the number of tokens for display to avoid browser slowdown
@@ -528,15 +527,19 @@ def _(
         style = f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
         # Add title attribute for hover info (original token + ID)
         title = f"Original: {item['original']}\nID: {item['token_id']}"
-        display_content = str(item["token_id"]) if show_ids_switch.value else item["display"]
         html_parts.append(
             f'<span style="{style}" title="{title}">{display_content}</span>'
         )
-    token_viz_html = mo.Html(f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>')
-    basic_stats = token_stats['basic_stats']
-    length_stats = token_stats['length_stats']
     basic_stats_md = "**Basic Stats:**\n\n" + "\n".join(
         f"-   **{key.replace('_', ' ').title()}:** `{value}`"

                 str(token.morph) for token in doc
             ],  # To be more precise, this should be merged back in via .to_dict()
             "Token Position": list(range(len(doc))),
+            "Sentence Number": [
+                i for i, sent in enumerate(doc.sents) for token in sent
+            ],
         }
     )
 @app.function
 def fix_token(token: str) -> str:
     """Fix token for display with improved space visualization."""
     # Replace SentencePiece space marker U+2581 with a middle dot
     token = token.replace(" ", "·")
     # Replace BPE space marker 'Ġ' with a middle dot
     Extract useful information from a tokenizer.
     Returns a dictionary with tokenizer details.
     """
     info = {}
     try:
     # Tokenize the input text
     # Use tokenize to get string representations for analysis and display
     all_tokens = tokenizer.tokenize(current_text)
     total_token_count = len(all_tokens)
     # Limit the number of tokens for display to avoid browser slowdown
         style = f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
         # Add title attribute for hover info (original token + ID)
         title = f"Original: {item['original']}\nID: {item['token_id']}"
+        display_content = (
+            str(item["token_id"]) if show_ids_switch.value else item["display"]
+        )
         html_parts.append(
             f'<span style="{style}" title="{title}">{display_content}</span>'
         )
+    token_viz_html = mo.Html(
+        f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>'
+    )
+    basic_stats = token_stats["basic_stats"]
+    length_stats = token_stats["length_stats"]
     basic_stats_md = "**Basic Stats:**\n\n" + "\n".join(
         f"-   **{key.replace('_', ' ').title()}:** `{value}`"