Spaces:

kishkath
/

bpe-tokenizer

Sleeping

App Files Community

kishkath committed on Jan 18

Commit

d6e7dc9

verified ·

1 Parent(s): dcb5a2c

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -39

app.py CHANGED Viewed

@@ -39,7 +39,7 @@ def encode_text(text, tokenizer):
     if not text.strip():
         return ("Please enter some Telugu text",
                 "No statistics available",
-                [])  # Empty list for visualization
     try:
         # Encode the text
@@ -66,7 +66,7 @@ def encode_text(text, tokenizer):
         # Generate colors based on token frequencies
         unique_tokens = set(encoded)
-        # Create color map with string hex colors instead of RGB lists
         color_map = {token: f"#{hash(str(token)) % 0xFFFFFF:06x}" for token in unique_tokens}
         # Create visualization list with proper format
@@ -77,19 +77,19 @@ def encode_text(text, tokenizer):
             visualization.append((token_text, color_map[token_id]))
         return (
-            str(encoded),    # encoded_ids for the first textbox
-            stats,          # stats for the second textbox
-            visualization   # for the HighlightedText component
         )
     except Exception as e:
         return (
             f"Error: {str(e)}",
             "Error occurred during encoding",
-            []  # Empty list for visualization on error
         )
-def decode_ids(encoded_ids_str, tokenizer):
     """Decode the encoded IDs back to text"""
     if not encoded_ids_str.strip():
         return "Please enter encoded IDs"
@@ -106,27 +106,7 @@ def decode_ids(encoded_ids_str, tokenizer):
     except Exception as e:
         return f"Error during decoding: {str(e)}"
-def visualize_encoding(text, encoded_ids, tokenizer):
-    """Create a visual representation of the encoding"""
-    tokens = []
-    colors = []
-    # Generate colors based on token frequencies
-    unique_tokens = set(encoded_ids)
-    color_map = {token: np.random.rand(3).tolist() for token in unique_tokens}
-    for token_id in encoded_ids:
-        token_bytes = tokenizer.vocab[token_id]
-        token_text = token_bytes.decode('utf-8', errors='replace')
-        tokens.append(token_text)
-        colors.append(color_map[token_id])
-    return {
-        "tokens": tokens,
-        "colors": colors
-    }
-# Load the tokenizer with proper path handling
 try:
     model_path = os.path.join(current_dir, "models", "version_2", "checkpoints", "telugu_basic.model")
     vocab_path = os.path.join(current_dir, "models", "version_2", "vocabulary", "vocabulary.json")
@@ -140,7 +120,7 @@ except Exception as e:
     print(f"Error loading tokenizer: {str(e)}")
     raise
-# Add example inputs for the encoder
 encoder_examples = [
     ["తెలుగు భాష చాలా అందమైనది", "Basic sentence example"],
     ["నేను తెలుగు నేర్చుకుంటున్నాను", "Learning Telugu example"],
@@ -149,7 +129,6 @@ encoder_examples = [
     ["తెలుగు సాహిత్యం చాలా సమృద్ధిగా ఉంది", "Literature example"]
 ]
-# Add example inputs for the decoder
 decoder_examples = [
     ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260]", "Basic sentence decoding"],
     ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260, 287, 2206]", "Multiple tokens decoding"],
@@ -176,7 +155,8 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
                 input_text = gr.Textbox(
                     label="Enter Telugu Text",
                     placeholder="Type or paste Telugu text here...",
-                    lines=5
                 )
                 encode_btn = gr.Button("🔄 Encode", variant="primary")
@@ -193,15 +173,21 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
                 )
         with gr.Row():
-            gr.Markdown("### Token Visualization")
             token_viz = gr.HighlightedText(
                 label="Token Segmentation",
                 show_legend=True,
                 combine_adjacent=True,
                 color_map={}
             )
-        # Add examples for encoder
         gr.Examples(
             examples=encoder_examples,
             inputs=input_text,
@@ -217,7 +203,8 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
                 encoded_input = gr.Textbox(
                     label="Enter Encoded Token IDs",
                     placeholder="Paste the encoded token IDs here...",
-                    lines=5
                 )
                 decode_btn = gr.Button("🔄 Decode", variant="primary")
@@ -228,17 +215,23 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
                     interactive=False
                 )
-        # Add examples for decoder
         gr.Examples(
             examples=decoder_examples,
             inputs=encoded_input,
             outputs=decoded_output,
-            fn=lambda x: decode_ids(x, tokenizer),
             cache_examples=True,
             label="Token ID Examples"
         )
-    # Add more detailed instructions with examples
     gr.Markdown("""
     ### 📝 Instructions:
     1. **Encoding**:
@@ -269,7 +262,6 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
     - Typical compression ratios range from 3x to 4x
     """)
-    # Add a footer with version info
     gr.Markdown("""
     ---
     ### 📌 Version Information
@@ -278,7 +270,7 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
     - Last Updated: 2024
     """)
-# Launch the app with additional configurations
 if __name__ == "__main__":
     demo.launch(
         share=True,

     if not text.strip():
         return ("Please enter some Telugu text",
                 "No statistics available",
+                [])
     try:
         # Encode the text
         # Generate colors based on token frequencies
         unique_tokens = set(encoded)
+        # Create color map with string hex colors
         color_map = {token: f"#{hash(str(token)) % 0xFFFFFF:06x}" for token in unique_tokens}
         # Create visualization list with proper format
             visualization.append((token_text, color_map[token_id]))
         return (
+            str(encoded),
+            stats,
+            visualization
         )
     except Exception as e:
         return (
             f"Error: {str(e)}",
             "Error occurred during encoding",
+            []
         )
+def decode_ids(encoded_ids_str):
     """Decode the encoded IDs back to text"""
     if not encoded_ids_str.strip():
         return "Please enter encoded IDs"
     except Exception as e:
         return f"Error during decoding: {str(e)}"
+# Load the tokenizer
 try:
     model_path = os.path.join(current_dir, "models", "version_2", "checkpoints", "telugu_basic.model")
     vocab_path = os.path.join(current_dir, "models", "version_2", "vocabulary", "vocabulary.json")
     print(f"Error loading tokenizer: {str(e)}")
     raise
+# Example inputs
 encoder_examples = [
     ["తెలుగు భాష చాలా అందమైనది", "Basic sentence example"],
     ["నేను తెలుగు నేర్చుకుంటున్నాను", "Learning Telugu example"],
     ["తెలుగు సాహిత్యం చాలా సమృద్ధిగా ఉంది", "Literature example"]
 ]
 decoder_examples = [
     ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260]", "Basic sentence decoding"],
     ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260, 287, 2206]", "Multiple tokens decoding"],
                 input_text = gr.Textbox(
                     label="Enter Telugu Text",
                     placeholder="Type or paste Telugu text here...",
+                    lines=5,
+                    interactive=True
                 )
                 encode_btn = gr.Button("🔄 Encode", variant="primary")
                 )
         with gr.Row():
             token_viz = gr.HighlightedText(
                 label="Token Segmentation",
                 show_legend=True,
                 combine_adjacent=True,
                 color_map={}
             )
+        # Encoder button click event
+        encode_btn.click(
+            fn=lambda text: encode_text(text, tokenizer),
+            inputs=[input_text],
+            outputs=[encoded_output, stats_output, token_viz]
+        )
+        # Examples for encoder
         gr.Examples(
             examples=encoder_examples,
             inputs=input_text,
                 encoded_input = gr.Textbox(
                     label="Enter Encoded Token IDs",
                     placeholder="Paste the encoded token IDs here...",
+                    lines=5,
+                    interactive=True
                 )
                 decode_btn = gr.Button("🔄 Decode", variant="primary")
                     interactive=False
                 )
+        # Decoder button click event
+        decode_btn.click(
+            fn=decode_ids,
+            inputs=[encoded_input],
+            outputs=[decoded_output]
+        )
+        # Examples for decoder
         gr.Examples(
             examples=decoder_examples,
             inputs=encoded_input,
             outputs=decoded_output,
+            fn=decode_ids,
             cache_examples=True,
             label="Token ID Examples"
         )
     gr.Markdown("""
     ### 📝 Instructions:
     1. **Encoding**:
     - Typical compression ratios range from 3x to 4x
     """)
     gr.Markdown("""
     ---
     ### 📌 Version Information
     - Last Updated: 2024
     """)
+# Launch the app
 if __name__ == "__main__":
     demo.launch(
         share=True,