Spaces:

kishkath
/

bpe-tokenizer

Sleeping

App Files Community

kishkath committed on Jan 15

Commit

b115eae

verified ·

1 Parent(s): f53ddeb

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -13

app.py CHANGED Viewed

@@ -39,7 +39,7 @@ def encode_text(text, tokenizer):
     if not text.strip():
         return ("Please enter some Telugu text",
                 "No statistics available",
-                None)
     try:
         # Encode the text
@@ -63,32 +63,30 @@ def encode_text(text, tokenizer):
         # Create visualization data
         tokens = []
-        colors = []
         # Generate colors based on token frequencies
         unique_tokens = set(encoded)
-        color_map = {token: np.random.rand(3).tolist() for token in unique_tokens}
         for token_id in encoded:
             token_bytes = tokenizer.vocab[token_id]
             token_text = token_bytes.decode('utf-8', errors='replace')
-            tokens.append(token_text)
-            colors.append(color_map[token_id])
-        # Create visualization list for HighlightedText
-        visualization = [(token, color) for token, color in zip(tokens, colors)]
         return (
-            str(encoded),  # encoded_ids for the first textbox
-            stats,        # stats for the second textbox
-            visualization # for the HighlightedText component
         )
     except Exception as e:
         return (
             f"Error: {str(e)}",
             "Error occurred during encoding",
-            None
         )
 def decode_ids(encoded_ids_str, tokenizer):
@@ -183,7 +181,9 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
             gr.Markdown("### Token Visualization")
             token_viz = gr.HighlightedText(
                 label="Token Segmentation",
-                show_legend=True
             )
     with gr.Tab("Decoder"):

     if not text.strip():
         return ("Please enter some Telugu text",
                 "No statistics available",
+                [])  # Empty list for visualization
     try:
         # Encode the text
         # Create visualization data
         tokens = []
         # Generate colors based on token frequencies
         unique_tokens = set(encoded)
+        # Create color map with string hex colors instead of RGB lists
+        color_map = {token: f"#{hash(str(token)) % 0xFFFFFF:06x}" for token in unique_tokens}
+        # Create visualization list with proper format
+        visualization = []
         for token_id in encoded:
             token_bytes = tokenizer.vocab[token_id]
             token_text = token_bytes.decode('utf-8', errors='replace')
+            visualization.append((token_text, color_map[token_id]))
         return (
+            str(encoded),    # encoded_ids for the first textbox
+            stats,          # stats for the second textbox
+            visualization   # for the HighlightedText component
         )
     except Exception as e:
         return (
             f"Error: {str(e)}",
             "Error occurred during encoding",
+            []  # Empty list for visualization on error
         )
 def decode_ids(encoded_ids_str, tokenizer):
             gr.Markdown("### Token Visualization")
             token_viz = gr.HighlightedText(
                 label="Token Segmentation",
+                show_legend=True,
+                combine_adjacent=True,
+                color_map={}  # Let Gradio handle the color mapping
             )
     with gr.Tab("Decoder"):