Spaces:

kishkath
/

bpe-tokenizer

Sleeping

App Files Files Community

kishkath committed on Jan 15

Commit

f53ddeb

verified ·

1 Parent(s): 5740893

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -19

app.py CHANGED Viewed

@@ -37,11 +37,9 @@ def load_tokenizer(model_path, vocab_path):
 def encode_text(text, tokenizer):
     """Encode text and return statistics"""
     if not text.strip():
-        return {
-            "encoded_ids": "Please enter some Telugu text",
-            "stats": "No statistics available",
-            "visualization": None
-        }
     try:
         # Encode the text
@@ -63,20 +61,35 @@ def encode_text(text, tokenizer):
         • Space saved: {(1 - encoded_size/original_size) * 100:.1f}%
         """
-        # Create token visualization
-        viz_data = visualize_encoding(text, encoded, tokenizer)
-        return {
-            "encoded_ids": str(encoded),
-            "stats": stats,
-            "visualization": viz_data
-        }
     except Exception as e:
-        return {
-            "encoded_ids": f"Error: {str(e)}",
-            "stats": "Error occurred during encoding",
-            "visualization": None
-        }
 def decode_ids(encoded_ids_str, tokenizer):
     """Decode the encoded IDs back to text"""
@@ -192,8 +205,8 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
     # Set up event handlers
     encode_btn.click(
-        fn=lambda text: encode_text(text, tokenizer),
-        inputs=input_text,
         outputs=[encoded_output, stats_output, token_viz]
     )

 def encode_text(text, tokenizer):
     """Encode text and return statistics"""
     if not text.strip():
+        return ("Please enter some Telugu text",
+                "No statistics available",
+                None)
     try:
         # Encode the text
         • Space saved: {(1 - encoded_size/original_size) * 100:.1f}%
         """
+        # Create visualization data
+        tokens = []
+        colors = []
+        # Generate colors based on token frequencies
+        unique_tokens = set(encoded)
+        color_map = {token: np.random.rand(3).tolist() for token in unique_tokens}
+        for token_id in encoded:
+            token_bytes = tokenizer.vocab[token_id]
+            token_text = token_bytes.decode('utf-8', errors='replace')
+            tokens.append(token_text)
+            colors.append(color_map[token_id])
+        # Create visualization list for HighlightedText
+        visualization = [(token, color) for token, color in zip(tokens, colors)]
+        return (
+            str(encoded),  # encoded_ids for the first textbox
+            stats,        # stats for the second textbox
+            visualization # for the HighlightedText component
+        )
     except Exception as e:
+        return (
+            f"Error: {str(e)}",
+            "Error occurred during encoding",
+            None
+        )
 def decode_ids(encoded_ids_str, tokenizer):
     """Decode the encoded IDs back to text"""
     # Set up event handlers
     encode_btn.click(
+        fn=encode_text,  # Now using the function directly
+        inputs=[input_text, gr.State(tokenizer)],  # Pass tokenizer as state
         outputs=[encoded_output, stats_output, token_viz]
     )