# bpe-tokenizer / app.py
import gradio as gr
import json
import os
import sys
import ast
import hashlib
# Add the current directory to Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
from tokenizers.basic import BasicTokenizer
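
# For reference, a minimal sketch of what BPE encoding does (illustrative
# only; the real implementation lives in tokenizers.basic.BasicTokenizer):
# start from raw UTF-8 bytes and repeatedly replace the adjacent pair with
# the earliest-learned merge until no learned pair remains.
def _bpe_encode_sketch(text, merges):
    """Hypothetical helper, not used by the app; `merges` maps
    (left_id, right_id) -> merged_id, as loaded in load_tokenizer() below."""
    ids = list(text.encode("utf-8"))
    while len(ids) >= 2:
        pairs = set(zip(ids, ids[1:]))
        # Pick the earliest-learned pair present in the sequence
        pair = min((p for p in pairs if p in merges), key=merges.get, default=None)
        if pair is None:
            break
        merged, out, i = merges[pair], [], 0
        while i < len(ids):
            if i + 1 < len(ids) and (ids[i], ids[i + 1]) == pair:
                out.append(merged)
                i += 2
            else:
                out.append(ids[i])
                i += 1
        ids = out
    return ids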
def load_tokenizer(model_path, vocab_path):
"""Load the trained tokenizer"""
tokenizer = BasicTokenizer()
try:
# Check if paths exist
if not os.path.exists(model_path):
raise FileNotFoundError(f"Model file not found at: {model_path}")
if not os.path.exists(vocab_path):
raise FileNotFoundError(f"Vocabulary file not found at: {vocab_path}")
# Load the trained model
tokenizer.load(model_path)
# Load vocabulary
with open(vocab_path, 'r', encoding='utf-8') as f:
vocab_data = json.load(f)
tokenizer.token_to_id = {k: int(v) for k, v in vocab_data['token_to_id'].items()}
tokenizer.id_to_token = {int(k): v for k, v in vocab_data['id_to_token'].items()}
tokenizer.merges = {tuple(map(int, k.split(','))): int(v)
for k, v in vocab_data['merges'].items()}
return tokenizer
    except Exception as e:
        raise RuntimeError(f"Error loading tokenizer: {e}") from e
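
# Illustrative sketch of the vocabulary.json layout that load_tokenizer()
# expects, inferred from the parsing above (the example values are
# assumptions, not a published schema):
#
#   {
#     "token_to_id": {"<token>": 287, ...},
#     "id_to_token": {"287": "<token>", ...},
#     "merges": {"260,261": 287, ...}    # "left,right" pair -> merged id
#   }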
def encode_text(text, tokenizer):
"""Encode text and return statistics"""
if not text.strip():
return ("Please enter some Telugu text",
"No statistics available",
[])
try:
# Encode the text
encoded = tokenizer.encode(text)
        # Calculate compression ratio: raw UTF-8 bytes vs. encoded size,
        # assuming each token id is stored as 2 bytes (uint16 comfortably
        # covers the 4800-token vocabulary)
        original_size = len(text.encode('utf-8'))
        encoded_size = len(encoded) * 2
        compression_ratio = original_size / encoded_size
# Prepare statistics
stats = f"""
📊 Encoding Statistics:
• Original text length: {len(text)} characters
• Encoded length: {len(encoded)} tokens
• Compression ratio: {compression_ratio:.2f}X
• Original size: {original_size} bytes
• Encoded size: {encoded_size} bytes
• Space saved: {(1 - encoded_size/original_size) * 100:.1f}%
"""
# Create visualization data
tokens = []
        # Assign each distinct token id a stable color; hashlib is used rather
        # than the built-in hash(), which is salted per process and would give
        # different colors on every run
        unique_tokens = set(encoded)
        color_map = {token: f"#{hashlib.md5(str(token).encode()).hexdigest()[:6]}"
                     for token in unique_tokens}
# Create visualization list with proper format
visualization = []
        for token_id in encoded:
            # tokenizer.vocab (id -> bytes) is populated by tokenizer.load()
            token_bytes = tokenizer.vocab[token_id]
            token_text = token_bytes.decode('utf-8', errors='replace')
            visualization.append((token_text, color_map[token_id]))
return (
str(encoded),
stats,
visualization
)
except Exception as e:
return (
f"Error: {str(e)}",
"Error occurred during encoding",
[]
)
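
# Hypothetical call, assuming the tokenizer loaded below; the third return
# value is a list of (token_text, color) tuples for gr.HighlightedText:
#
#   ids_str, stats, viz = encode_text("తెలుగు", tokenizer)
#   # viz -> [("తె", "#1a2b3c"), ("లుగు", "#4d5e6f")]   (illustrative colors)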
def decode_ids(encoded_ids_str):
"""Decode the encoded IDs back to text"""
if not encoded_ids_str.strip():
return "Please enter encoded IDs"
    try:
        # Parse the input safely: ast.literal_eval accepts literals such as
        # "[287, 2206, 1165]" but, unlike eval(), cannot execute code
        encoded_ids = ast.literal_eval(encoded_ids_str)
        if not isinstance(encoded_ids, list) or not all(isinstance(i, int) for i in encoded_ids):
            return "Invalid input: Please enter a list of integers"
# Decode the IDs
decoded_text = tokenizer.decode(encoded_ids)
return decoded_text
except Exception as e:
return f"Error during decoding: {str(e)}"
# Load the tokenizer
try:
model_path = os.path.join(current_dir, "models", "version_2", "checkpoints", "telugu_basic.model")
vocab_path = os.path.join(current_dir, "models", "version_2", "vocabulary", "vocabulary.json")
print(f"Loading model from: {model_path}")
print(f"Loading vocabulary from: {vocab_path}")
tokenizer = load_tokenizer(model_path, vocab_path)
print("Tokenizer loaded successfully")
except Exception as e:
print(f"Error loading tokenizer: {str(e)}")
raise
# Example inputs
encoder_examples = [
["తెలుగు భాష చాలా అందమైనది", "Basic sentence example"],
["నేను తెలుగు నేర్చుకుంటున్నాను", "Learning Telugu example"],
["ప్రతి ఒక్కరూ సంతోషంగా ఉండాలి", "Happiness wish example"],
["అరణ్యంలో రాముడు అనేక రాక్షసులను సంహరిస్తాడు", "Complex sentence example"],
["తెలుగు సాహిత్యం చాలా సమృద్ధిగా ఉంది", "Literature example"]
]
decoder_examples = [
["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260]", "Basic sentence decoding"],
["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260, 287, 2206]", "Multiple tokens decoding"],
]
# Create the Gradio interface
with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
gr.Markdown("""
# 🔤 Telugu Text Tokenizer
This tool helps you encode Telugu text into tokens and decode them back.
It uses a trained BPE (Byte Pair Encoding) tokenizer optimized for Telugu language.
## Features:
- 🔄 Encode Telugu text to token IDs
- 📊 View compression statistics
- 🎨 Visualize token segmentation
- ⚡ Fast and efficient encoding/decoding
""")
with gr.Tab("Encoder"):
with gr.Row():
with gr.Column():
input_text = gr.Textbox(
label="Enter Telugu Text",
placeholder="Type or paste Telugu text here...",
lines=5,
interactive=True
)
encode_btn = gr.Button("🔄 Encode", variant="primary")
with gr.Column():
with gr.Row():
encoded_output = gr.Textbox(
label="Encoded Token IDs",
lines=5,
interactive=False,
show_copy_button=True
)
stats_output = gr.Textbox(
label="Statistics",
lines=8,
interactive=False
)
with gr.Row():
token_viz = gr.HighlightedText(
label="Token Segmentation",
show_legend=True,
combine_adjacent=True,
color_map={}
)
# Encoder button click event
encode_btn.click(
fn=lambda text: encode_text(text, tokenizer),
inputs=[input_text],
outputs=[encoded_output, stats_output, token_viz]
)
# Examples for encoder
gr.Examples(
examples=encoder_examples,
inputs=input_text,
outputs=[encoded_output, stats_output, token_viz],
fn=lambda x: encode_text(x, tokenizer),
cache_examples=True,
label="Telugu Text Examples"
)
with gr.Tab("Decoder"):
with gr.Row():
with gr.Column():
encoded_input = gr.Textbox(
label="Enter Encoded Token IDs",
placeholder="Paste the encoded token IDs here...",
lines=5,
interactive=True
)
decode_btn = gr.Button("🔄 Decode", variant="primary")
with gr.Column():
decoded_output = gr.Textbox(
label="Decoded Telugu Text",
lines=5,
interactive=False
)
# Decoder button click event
decode_btn.click(
fn=decode_ids,
inputs=[encoded_input],
outputs=[decoded_output]
)
# Examples for decoder
gr.Examples(
examples=decoder_examples,
inputs=encoded_input,
outputs=decoded_output,
fn=decode_ids,
cache_examples=True,
label="Token ID Examples"
)
gr.Markdown("""
### 📝 Instructions:
1. **Encoding**:
- Enter Telugu text in the encoder tab
- Click "Encode" to get token IDs and statistics
- Try the examples below to see how different texts are encoded
2. **Decoding**:
- Copy the encoded IDs from the encoder output
- Paste them in the decoder tab
- Click "Decode" to get back the original text
- Try the example token IDs to see how decoding works
3. **Visualization**:
- Each token is highlighted with a unique color
- Same tokens will have the same color
    - Hover over a token to see its highlight label (one label per distinct token)
### 🎯 Example Usage:
- Try encoding "తెలుగు" to see how basic words are tokenized
- Use longer sentences to see compression in action
- Copy encoded IDs and decode them back to verify accuracy
### ℹ️ Notes:
- The tokenizer uses BPE (Byte Pair Encoding) algorithm
- Compression ratio shows how efficiently the text is encoded
- Different colors in visualization represent different tokens
- Typical compression ratios range from 3x to 4x
""")
gr.Markdown("""
---
### 📌 Version Information
- Model Version: 2.0
- Vocabulary Size: 4800 tokens
- Last Updated: 2024
""")
# Launch the app
if __name__ == "__main__":
demo.launch(
share=True,
debug=True,
server_name="0.0.0.0",
server_port=7860,
show_error=True
)