import ast
import json
import os
import sys

import numpy as np  # noqa: F401 -- kept from original file; may be used indirectly
import gradio as gr

# Add the current directory to the Python path so the local `tokenizers`
# package resolves regardless of the working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)

from tokenizers.basic import BasicTokenizer


def load_tokenizer(model_path, vocab_path):
    """Load the trained BPE tokenizer and its vocabulary from disk.

    Args:
        model_path: Path to the trained ``.model`` file.
        vocab_path: Path to the ``vocabulary.json`` file holding
            ``token_to_id``, ``id_to_token`` and ``merges`` mappings.

    Returns:
        A ``BasicTokenizer`` with model and vocabulary loaded.

    Raises:
        Exception: If either file is missing or loading fails; the
            original error is chained as the cause.
    """
    tokenizer = BasicTokenizer()
    try:
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model file not found at: {model_path}")
        if not os.path.exists(vocab_path):
            raise FileNotFoundError(f"Vocabulary file not found at: {vocab_path}")

        # Load the trained merge rules.
        tokenizer.load(model_path)

        # Load vocabulary; JSON keys are strings, so coerce ids back to int.
        with open(vocab_path, 'r', encoding='utf-8') as f:
            vocab_data = json.load(f)

        tokenizer.token_to_id = {
            k: int(v) for k, v in vocab_data['token_to_id'].items()
        }
        tokenizer.id_to_token = {
            int(k): v for k, v in vocab_data['id_to_token'].items()
        }
        # Merge keys are serialized as "a,b" strings; restore (int, int) tuples.
        tokenizer.merges = {
            tuple(map(int, k.split(','))): int(v)
            for k, v in vocab_data['merges'].items()
        }
        return tokenizer
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Error loading tokenizer: {str(e)}") from e


def _token_color(token_id):
    """Return a deterministic hex color string for a token id.

    The original used ``hash(str(token))``, but str hashing is salted per
    process (PYTHONHASHSEED), so colors changed on every run. A Knuth
    multiplicative hash gives stable, well-spread colors.
    """
    return f"#{(token_id * 2654435761) % 0xFFFFFF:06x}"


def encode_text(text, tokenizer):
    """Encode Telugu text and return display artifacts for the UI.

    Args:
        text: The input Telugu text.
        tokenizer: A loaded ``BasicTokenizer``.

    Returns:
        Tuple of (token-id list as a string, statistics text,
        list of (token_text, color) pairs for ``gr.HighlightedText``).
        On error, returns placeholder strings and an empty list instead
        of raising, so the UI stays responsive.
    """
    if not text.strip():
        return ("Please enter some Telugu text", "No statistics available", [])
    try:
        encoded = tokenizer.encode(text)

        # Compression statistics; token ids are assumed to cost 2 bytes each.
        original_size = len(text.encode('utf-8'))
        encoded_size = len(encoded) * 2
        # Guard against an empty encoding to avoid ZeroDivisionError.
        compression_ratio = original_size / encoded_size if encoded_size else 0.0
        space_saved = (
            (1 - encoded_size / original_size) * 100 if original_size else 0.0
        )

        stats = f"""
        📊 Encoding Statistics:
        • Original text length: {len(text)} characters
        • Encoded length: {len(encoded)} tokens
        • Compression ratio: {compression_ratio:.2f}X
        • Original size: {original_size} bytes
        • Encoded size: {encoded_size} bytes
        • Space saved: {space_saved:.1f}%
        """

        # One stable color per distinct token id.
        color_map = {token: _token_color(token) for token in set(encoded)}

        # Build (token_text, color) pairs for the highlighted-text widget.
        visualization = []
        for token_id in encoded:
            token_bytes = tokenizer.vocab[token_id]
            token_text = token_bytes.decode('utf-8', errors='replace')
            visualization.append((token_text, color_map[token_id]))

        return (str(encoded), stats, visualization)
    except Exception as e:
        return (f"Error: {str(e)}", "Error occurred during encoding", [])


def decode_ids(encoded_ids_str):
    """Decode a string like ``"[1, 2, 3]"`` back into Telugu text.

    Uses ``ast.literal_eval`` instead of ``eval``: this string comes
    straight from the UI, and ``eval`` would execute arbitrary code.
    """
    if not encoded_ids_str.strip():
        return "Please enter encoded IDs"
    try:
        # Safely parse the literal list; rejects anything non-literal.
        encoded_ids = ast.literal_eval(encoded_ids_str)
        if not isinstance(encoded_ids, list) or not all(
            isinstance(i, int) for i in encoded_ids
        ):
            return "Invalid input: Please enter a list of integers"

        # Relies on the module-level `tokenizer` loaded below.
        decoded_text = tokenizer.decode(encoded_ids)
        return decoded_text
    except Exception as e:
        return f"Error during decoding: {str(e)}"


# Load the tokenizer once at import time; fail fast if the model is missing.
try:
    model_path = os.path.join(
        current_dir, "models", "version_2", "checkpoints", "telugu_basic.model"
    )
    vocab_path = os.path.join(
        current_dir, "models", "version_2", "vocabulary", "vocabulary.json"
    )
    print(f"Loading model from: {model_path}")
    print(f"Loading vocabulary from: {vocab_path}")
    tokenizer = load_tokenizer(model_path, vocab_path)
    print("Tokenizer loaded successfully")
except Exception as e:
    print(f"Error loading tokenizer: {str(e)}")
    raise

# Example inputs
encoder_examples = [
    ["తెలుగు భాష చాలా అందమైనది", "Basic sentence example"],
    ["నేను తెలుగు నేర్చుకుంటున్నాను", "Learning Telugu example"],
    ["ప్రతి ఒక్కరూ సంతోషంగా ఉండాలి", "Happiness wish example"],
    ["అరణ్యంలో రాముడు అనేక రాక్షసులను సంహరిస్తాడు", "Complex sentence example"],
    ["తెలుగు సాహిత్యం చాలా సమృద్ధిగా ఉంది", "Literature example"],
]

decoder_examples = [
    ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260]",
     "Basic sentence decoding"],
    ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260, 287, 2206]",
     "Multiple tokens decoding"],
]

# Create the Gradio interface
with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔤 Telugu Text Tokenizer

    This tool helps you encode Telugu text into tokens and decode them back.
    It uses a trained BPE (Byte Pair Encoding) tokenizer optimized for Telugu language.

    ## Features:
    - 🔄 Encode Telugu text to token IDs
    - 📊 View compression statistics
    - 🎨 Visualize token segmentation
    - ⚡ Fast and efficient encoding/decoding
    """)

    with gr.Tab("Encoder"):
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(
                    label="Enter Telugu Text",
                    placeholder="Type or paste Telugu text here...",
                    lines=5,
                    interactive=True,
                )
                encode_btn = gr.Button("🔄 Encode", variant="primary")
            with gr.Column():
                with gr.Row():
                    encoded_output = gr.Textbox(
                        label="Encoded Token IDs",
                        lines=5,
                        interactive=False,
                        show_copy_button=True,
                    )
                    stats_output = gr.Textbox(
                        label="Statistics",
                        lines=8,
                        interactive=False,
                    )
                with gr.Row():
                    token_viz = gr.HighlightedText(
                        label="Token Segmentation",
                        show_legend=True,
                        combine_adjacent=True,
                        color_map={},
                    )

        # Encoder button click event
        encode_btn.click(
            fn=lambda text: encode_text(text, tokenizer),
            inputs=[input_text],
            outputs=[encoded_output, stats_output, token_viz],
        )

        # Examples for encoder
        gr.Examples(
            examples=encoder_examples,
            inputs=input_text,
            outputs=[encoded_output, stats_output, token_viz],
            fn=lambda x: encode_text(x, tokenizer),
            cache_examples=True,
            label="Telugu Text Examples",
        )

    with gr.Tab("Decoder"):
        with gr.Row():
            with gr.Column():
                encoded_input = gr.Textbox(
                    label="Enter Encoded Token IDs",
                    placeholder="Paste the encoded token IDs here...",
                    lines=5,
                    interactive=True,
                )
                decode_btn = gr.Button("🔄 Decode", variant="primary")
            with gr.Column():
                decoded_output = gr.Textbox(
                    label="Decoded Telugu Text",
                    lines=5,
                    interactive=False,
                )

        # Decoder button click event
        decode_btn.click(
            fn=decode_ids,
            inputs=[encoded_input],
            outputs=[decoded_output],
        )

        # Examples for decoder
        gr.Examples(
            examples=decoder_examples,
            inputs=encoded_input,
            outputs=decoded_output,
            fn=decode_ids,
            cache_examples=True,
            label="Token ID Examples",
        )

    gr.Markdown("""
    ### 📝 Instructions:
    1. **Encoding**:
       - Enter Telugu text in the encoder tab
       - Click "Encode" to get token IDs and statistics
       - Try the examples below to see how different texts are encoded

    2. **Decoding**:
       - Copy the encoded IDs from the encoder output
       - Paste them in the decoder tab
       - Click "Decode" to get back the original text
       - Try the example token IDs to see how decoding works

    3. **Visualization**:
       - Each token is highlighted with a unique color
       - Same tokens will have the same color
       - Hover over tokens to see their IDs

    ### 🎯 Example Usage:
    - Try encoding "తెలుగు" to see how basic words are tokenized
    - Use longer sentences to see compression in action
    - Copy encoded IDs and decode them back to verify accuracy

    ### ℹ️ Notes:
    - The tokenizer uses BPE (Byte Pair Encoding) algorithm
    - Compression ratio shows how efficiently the text is encoded
    - Different colors in visualization represent different tokens
    - Typical compression ratios range from 3x to 4x
    """)

    gr.Markdown("""
    ---
    ### 📌 Version Information
    - Model Version: 2.0
    - Vocabulary Size: 4800 tokens
    - Last Updated: 2024
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch(
        share=True,
        debug=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )