import ast
import json
import os
import sys

import gradio as gr

# Add the current directory to the Python path so the local tokenizers package resolves
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)

from tokenizers.basic import BasicTokenizer

def load_tokenizer(model_path, vocab_path):
    """Load the trained tokenizer model and its vocabulary from disk."""
    tokenizer = BasicTokenizer()
    try:
        # Check that both files exist before attempting to load
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model file not found at: {model_path}")
        if not os.path.exists(vocab_path):
            raise FileNotFoundError(f"Vocabulary file not found at: {vocab_path}")

        # Load the trained model
        tokenizer.load(model_path)

        # Load the vocabulary and restore the lookup tables
        with open(vocab_path, 'r', encoding='utf-8') as f:
            vocab_data = json.load(f)
        tokenizer.token_to_id = {k: int(v) for k, v in vocab_data['token_to_id'].items()}
        tokenizer.id_to_token = {int(k): v for k, v in vocab_data['id_to_token'].items()}
        tokenizer.merges = {tuple(map(int, k.split(','))): int(v)
                            for k, v in vocab_data['merges'].items()}
        return tokenizer
    except Exception as e:
        raise RuntimeError(f"Error loading tokenizer: {e}") from e

def encode_text(text, tokenizer):
    """Encode text and return the token IDs, statistics, and visualization data."""
    if not text.strip():
        return ("Please enter some Telugu text",
                "No statistics available",
                [])
    try:
        # Encode the text
        encoded = tokenizer.encode(text)

        # Compression ratio: UTF-8 bytes in vs. bytes out, assuming each
        # token ID is stored in 2 bytes (valid while the vocabulary stays
        # below 65,536 entries).
        original_size = len(text.encode('utf-8'))
        encoded_size = len(encoded) * 2
        compression_ratio = original_size / encoded_size

        # Prepare statistics
        stats = f"""
📊 Encoding Statistics:
• Original text length: {len(text)} characters
• Encoded length: {len(encoded)} tokens
• Compression ratio: {compression_ratio:.2f}X
• Original size: {original_size} bytes
• Encoded size: {encoded_size} bytes
• Space saved: {(1 - encoded_size / original_size) * 100:.1f}%
"""

        # Assign each unique token a deterministic color. A multiplicative
        # hash of the token ID is used instead of Python's hash(), which is
        # salted per process and would change colors on every run.
        unique_tokens = set(encoded)
        color_map = {token: f"#{(token * 2654435761) % 0xFFFFFF:06x}"
                     for token in unique_tokens}

        # Build (token_text, color) pairs for the HighlightedText component
        visualization = []
        for token_id in encoded:
            token_bytes = tokenizer.vocab[token_id]
            token_text = token_bytes.decode('utf-8', errors='replace')
            visualization.append((token_text, color_map[token_id]))

        return (
            str(encoded),
            stats,
            visualization
        )
    except Exception as e:
        return (
            f"Error: {str(e)}",
            "Error occurred during encoding",
            []
        )
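
# Worked example of the ratio above (illustrative numbers): a Telugu sentence
# of 60 UTF-8 bytes that encodes to 9 tokens occupies 9 * 2 = 18 bytes,
# giving a compression ratio of 60 / 18 ≈ 3.33x and ~70% space saved.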

def decode_ids(encoded_ids_str):
    """Decode a string of encoded token IDs back to text."""
    if not encoded_ids_str.strip():
        return "Please enter encoded IDs"
    try:
        # Parse the string representation safely; ast.literal_eval only
        # accepts Python literals, unlike eval(), which would execute
        # arbitrary code pasted into the textbox.
        encoded_ids = ast.literal_eval(encoded_ids_str)
        if not isinstance(encoded_ids, list):
            return "Invalid input: Please enter a list of integers"
        # Decode the IDs
        decoded_text = tokenizer.decode(encoded_ids)
        return decoded_text
    except Exception as e:
        return f"Error during decoding: {str(e)}"

# Load the tokenizer at module level so both tabs share one instance
try:
    model_path = os.path.join(current_dir, "models", "version_2", "checkpoints", "telugu_basic.model")
    vocab_path = os.path.join(current_dir, "models", "version_2", "vocabulary", "vocabulary.json")
    print(f"Loading model from: {model_path}")
    print(f"Loading vocabulary from: {vocab_path}")
    tokenizer = load_tokenizer(model_path, vocab_path)
    print("Tokenizer loaded successfully")
except Exception as e:
    print(f"Error loading tokenizer: {str(e)}")
    raise
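
# Optional round-trip sanity check, left commented out so a vocabulary
# mismatch cannot crash the Space at startup; a lossless BPE tokenizer
# should reproduce its input exactly after encode -> decode.
# _sample = "తెలుగు"
# assert tokenizer.decode(tokenizer.encode(_sample)) == _sample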

# Example inputs. Each gr.Examples row must match the number of input
# components, so each row holds a single value; the descriptions live in
# comments instead.
encoder_examples = [
    ["తెలుగు భాష చాలా అందమైనది"],                      # basic sentence
    ["నేను తెలుగు నేర్చుకుంటున్నాను"],                  # learning Telugu
    ["ప్రతి ఒక్కరూ సంతోషంగా ఉండాలి"],                  # happiness wish
    ["అరణ్యంలో రాముడు అనేక రాక్షసులను సంహరిస్తాడు"],   # complex sentence
    ["తెలుగు సాహిత్యం చాలా సమృద్ధిగా ఉంది"],           # literature
]

decoder_examples = [
    ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260]"],              # basic sentence
    ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260, 287, 2206]"],   # multiple tokens
]

# Create the Gradio interface
with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔤 Telugu Text Tokenizer

    This tool helps you encode Telugu text into tokens and decode them back.
    It uses a trained BPE (Byte Pair Encoding) tokenizer optimized for the Telugu language.

    ## Features:
    - 🔄 Encode Telugu text to token IDs
    - 📊 View compression statistics
    - 🎨 Visualize token segmentation
    - ⚡ Fast and efficient encoding/decoding
    """)
with gr.Tab("Encoder"):
with gr.Row():
with gr.Column():
input_text = gr.Textbox(
label="Enter Telugu Text",
placeholder="Type or paste Telugu text here...",
lines=5,
interactive=True
)
encode_btn = gr.Button("🔄 Encode", variant="primary")
with gr.Column():
with gr.Row():
encoded_output = gr.Textbox(
label="Encoded Token IDs",
lines=5,
interactive=False,
show_copy_button=True
)
stats_output = gr.Textbox(
label="Statistics",
lines=8,
interactive=False
)
with gr.Row():
token_viz = gr.HighlightedText(
label="Token Segmentation",
show_legend=True,
combine_adjacent=True,
color_map={}
)
# Encoder button click event
encode_btn.click(
fn=lambda text: encode_text(text, tokenizer),
inputs=[input_text],
outputs=[encoded_output, stats_output, token_viz]
)
# Examples for encoder
gr.Examples(
examples=encoder_examples,
inputs=input_text,
outputs=[encoded_output, stats_output, token_viz],
fn=lambda x: encode_text(x, tokenizer),
cache_examples=True,
label="Telugu Text Examples"
)
with gr.Tab("Decoder"):
with gr.Row():
with gr.Column():
encoded_input = gr.Textbox(
label="Enter Encoded Token IDs",
placeholder="Paste the encoded token IDs here...",
lines=5,
interactive=True
)
decode_btn = gr.Button("🔄 Decode", variant="primary")
with gr.Column():
decoded_output = gr.Textbox(
label="Decoded Telugu Text",
lines=5,
interactive=False
)
# Decoder button click event
decode_btn.click(
fn=decode_ids,
inputs=[encoded_input],
outputs=[decoded_output]
)
# Examples for decoder
gr.Examples(
examples=decoder_examples,
inputs=encoded_input,
outputs=decoded_output,
fn=decode_ids,
cache_examples=True,
label="Token ID Examples"
)
gr.Markdown("""
### 📝 Instructions:
1. **Encoding**:
- Enter Telugu text in the encoder tab
- Click "Encode" to get token IDs and statistics
- Try the examples below to see how different texts are encoded
2. **Decoding**:
- Copy the encoded IDs from the encoder output
- Paste them in the decoder tab
- Click "Decode" to get back the original text
- Try the example token IDs to see how decoding works
3. **Visualization**:
- Each token is highlighted with a unique color
- Same tokens will have the same color
- Hover over tokens to see their IDs
### 🎯 Example Usage:
- Try encoding "తెలుగు" to see how basic words are tokenized
- Use longer sentences to see compression in action
- Copy encoded IDs and decode them back to verify accuracy
### ℹ️ Notes:
- The tokenizer uses BPE (Byte Pair Encoding) algorithm
- Compression ratio shows how efficiently the text is encoded
- Different colors in visualization represent different tokens
- Typical compression ratios range from 3x to 4x
""")
gr.Markdown("""
---
### 📌 Version Information
- Model Version: 2.0
- Vocabulary Size: 4800 tokens
- Last Updated: 2024
""")

# Launch the app
if __name__ == "__main__":
    demo.launch(
        share=True,
        debug=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )