# bpe-tokenizer / app.py — Gradio Space demo for a Telugu BPE tokenizer
# (Hugging Face Space by kishkath; commit bf87fbf)
import ast
import json
import os

import gradio as gr
import numpy as np

from tokenizers.basic import BasicTokenizer
def load_tokenizer(model_path, vocab_path):
    """Load a trained BasicTokenizer together with its saved vocabulary.

    Args:
        model_path: Path to the ``.model`` file produced by training.
        vocab_path: Path to the JSON file holding ``token_to_id``,
            ``id_to_token`` and ``merges`` mappings.

    Returns:
        A fully initialized ``BasicTokenizer``.

    Raises:
        RuntimeError: If either file is missing or malformed (the original
            exception is chained so the root cause stays visible).
    """
    tokenizer = BasicTokenizer()
    try:
        # Load the trained model (learned merge rules).
        tokenizer.load(model_path)
        # Load vocabulary. JSON serializes every key as a string, so the
        # integer ids must be converted back on both sides of the maps.
        with open(vocab_path, 'r', encoding='utf-8') as f:
            vocab_data = json.load(f)
        tokenizer.token_to_id = {k: int(v) for k, v in vocab_data['token_to_id'].items()}
        tokenizer.id_to_token = {int(k): v for k, v in vocab_data['id_to_token'].items()}
        # Merge keys were flattened to "a,b" strings; rebuild (int, int) tuples.
        tokenizer.merges = {tuple(map(int, k.split(','))): int(v)
                            for k, v in vocab_data['merges'].items()}
        return tokenizer
    except Exception as e:
        # Raise a concrete exception type and chain the cause instead of a
        # bare Exception that hides where the failure came from.
        raise RuntimeError(f"Error loading tokenizer: {e}") from e
def encode_text(text, tokenizer):
    """Encode *text* and build the three values shown in the Encoder tab.

    Args:
        text: The Telugu input string (may be empty/whitespace).
        tokenizer: Object exposing ``encode(text) -> list[int]``.

    Returns:
        A 3-tuple ``(encoded_ids_str, stats_str, visualization)`` matching
        the three Gradio output components. The original implementation
        returned a dict keyed by plain strings, which Gradio cannot map to
        an ``outputs`` list of components.
    """
    if not text.strip():
        # Placeholder values for all three outputs when there is no input.
        return ("Please enter some Telugu text", "No statistics available", None)
    try:
        # Encode the text
        encoded = tokenizer.encode(text)
        # Calculate compression ratio against the UTF-8 byte length.
        original_size = len(text.encode('utf-8'))
        # NOTE(review): assumes each token id occupies 2 bytes — confirm
        # against the actual serialized token width.
        encoded_size = len(encoded) * 2
        # Guard the degenerate case of an empty encoding (avoids ZeroDivisionError).
        compression_ratio = original_size / encoded_size if encoded_size else 0.0
        space_saved = (1 - encoded_size / original_size) * 100
        # Prepare statistics
        stats = f"""
πŸ“Š Encoding Statistics:
β€’ Original text length: {len(text)} characters
β€’ Encoded length: {len(encoded)} tokens
β€’ Compression ratio: {compression_ratio:.2f}X
β€’ Original size: {original_size} bytes
β€’ Encoded size: {encoded_size} bytes
β€’ Space saved: {space_saved:.1f}%
"""
        # Create token visualization
        viz_data = visualize_encoding(text, encoded, tokenizer)
        return (str(encoded), stats, viz_data)
    except Exception as e:
        return (f"Error: {str(e)}", "Error occurred during encoding", None)
def decode_ids(encoded_ids_str, tokenizer):
    """Decode a pasted string like ``"[12, 34]"`` back to text.

    Args:
        encoded_ids_str: User-supplied textual representation of a Python
            list of integer token ids.
        tokenizer: Object exposing ``decode(list[int]) -> str``.

    Returns:
        The decoded text, or a human-readable error string on bad input.
    """
    if not encoded_ids_str.strip():
        return "Please enter encoded IDs"
    try:
        # Parse the pasted literal safely. The original used eval(), which
        # executes arbitrary code from untrusted user input; literal_eval
        # only accepts Python literals and raises on anything else.
        encoded_ids = ast.literal_eval(encoded_ids_str)
        if not isinstance(encoded_ids, list):
            return "Invalid input: Please enter a list of integers"
        # Decode the IDs
        decoded_text = tokenizer.decode(encoded_ids)
        return decoded_text
    except Exception as e:
        return f"Error during decoding: {str(e)}"
def visualize_encoding(text, encoded_ids, tokenizer):
    """Build a token-segmentation view for ``gr.HighlightedText``.

    Args:
        text: Original input (unused; kept for interface compatibility).
        encoded_ids: Token ids produced by the tokenizer.
        tokenizer: Must expose ``vocab`` mapping id -> raw bytes.

    Returns:
        A list of ``(token_text, label)`` pairs — the value format the
        ``HighlightedText`` component renders, with one label (and hence
        one legend colour) per distinct token id. The previous
        ``{"tokens", "colors"}`` dict with random colours was not a format
        the component could display, and was nondeterministic.
    """
    segments = []
    for token_id in encoded_ids:
        token_bytes = tokenizer.vocab[token_id]
        # A merged token can end mid UTF-8 sequence; replace invalid runs
        # instead of raising.
        token_text = token_bytes.decode('utf-8', errors='replace')
        segments.append((token_text, str(token_id)))
    return segments
# Load the tokenizer once at import time so every request reuses it.
# Paths point at artifacts bundled with the Space's repository.
model_path = "models/version_2/checkpoints/telugu_basic.model"
vocab_path = "models/version_2/vocabulary/vocabulary.json"
tokenizer = load_tokenizer(model_path, vocab_path)
# Create the Gradio interface: an Encoder tab, a Decoder tab, and usage notes.
with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
    # Header and feature overview shown above both tabs.
    gr.Markdown("""
    # πŸ”€ Telugu Text Tokenizer
    This tool helps you encode Telugu text into tokens and decode them back.
    It uses a trained BPE (Byte Pair Encoding) tokenizer optimized for Telugu language.
    ## Features:
    - πŸ”„ Encode Telugu text to token IDs
    - πŸ“Š View compression statistics
    - 🎨 Visualize token segmentation
    - ⚑ Fast and efficient encoding/decoding
    """)
    with gr.Tab("Encoder"):
        with gr.Row():
            with gr.Column():
                # Left column: raw Telugu input plus the trigger button.
                input_text = gr.Textbox(
                    label="Enter Telugu Text",
                    placeholder="Type or paste Telugu text here...",
                    lines=5
                )
                encode_btn = gr.Button("πŸ”„ Encode", variant="primary")
            with gr.Column():
                # Right column: read-only results.
                encoded_output = gr.Textbox(
                    label="Encoded Token IDs",
                    lines=5,
                    interactive=False
                )
                stats_output = gr.Textbox(
                    label="Statistics",
                    lines=8,
                    interactive=False
                )
        with gr.Row():
            gr.Markdown("### Token Visualization")
            token_viz = gr.HighlightedText(
                label="Token Segmentation",
                show_legend=True
            )
    with gr.Tab("Decoder"):
        with gr.Row():
            with gr.Column():
                encoded_input = gr.Textbox(
                    label="Enter Encoded Token IDs",
                    placeholder="Paste the encoded token IDs here...",
                    lines=5
                )
                decode_btn = gr.Button("πŸ”„ Decode", variant="primary")
            with gr.Column():
                decoded_output = gr.Textbox(
                    label="Decoded Telugu Text",
                    lines=5,
                    interactive=False
                )
    # Set up event handlers. The lambdas capture the module-level tokenizer.
    # NOTE(review): verify that the shape of encode_text's return value maps
    # onto this three-component outputs list — Gradio expects either a tuple
    # in component order or a dict keyed by the component objects themselves.
    encode_btn.click(
        fn=lambda text: encode_text(text, tokenizer),
        inputs=input_text,
        outputs=[encoded_output, stats_output, token_viz]
    )
    decode_btn.click(
        fn=lambda ids: decode_ids(ids, tokenizer),
        inputs=encoded_input,
        outputs=decoded_output
    )
    # Footer with usage instructions.
    gr.Markdown("""
    ### πŸ“ Instructions:
    1. **Encoding**: Enter Telugu text in the encoder tab and click "Encode"
    2. **Decoding**: Copy the encoded IDs and paste them in the decoder tab
    3. **Visualization**: View token segmentation with color coding
    ### ℹ️ Notes:
    - The tokenizer uses BPE (Byte Pair Encoding) algorithm
    - Compression ratio shows how efficiently the text is encoded
    - Different colors in visualization represent different tokens
    """)

# Launch the app (module-level `demo` also lets HF Spaces auto-launch it).
if __name__ == "__main__":
    demo.launch()