import ast
import json
import os
import sys

import numpy as np  # noqa: F401 -- kept from original file; may be used indirectly
import gradio as gr

# Add the current directory to the Python path so the local `tokenizers`
# package resolves regardless of the working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)

from tokenizers.basic import BasicTokenizer


def load_tokenizer(model_path, vocab_path):
    """Load the trained BPE tokenizer and its vocabulary from disk.

    Args:
        model_path: Path to the trained ``.model`` file.
        vocab_path: Path to the ``vocabulary.json`` file holding
            ``token_to_id``, ``id_to_token`` and ``merges`` mappings.

    Returns:
        A ``BasicTokenizer`` with model and vocabulary loaded.

    Raises:
        Exception: If either file is missing or loading fails; the
            original error is chained as the cause.
    """
    tokenizer = BasicTokenizer()
    try:
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model file not found at: {model_path}")
        if not os.path.exists(vocab_path):
            raise FileNotFoundError(f"Vocabulary file not found at: {vocab_path}")

        # Load the trained merge rules.
        tokenizer.load(model_path)

        # Load vocabulary; JSON keys are strings, so coerce ids back to int.
        with open(vocab_path, 'r', encoding='utf-8') as f:
            vocab_data = json.load(f)

        tokenizer.token_to_id = {
            k: int(v) for k, v in vocab_data['token_to_id'].items()
        }
        tokenizer.id_to_token = {
            int(k): v for k, v in vocab_data['id_to_token'].items()
        }
        # Merge keys are serialized as "a,b" strings; restore (int, int) tuples.
        tokenizer.merges = {
            tuple(map(int, k.split(','))): int(v)
            for k, v in vocab_data['merges'].items()
        }
        return tokenizer
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Error loading tokenizer: {str(e)}") from e


def _token_color(token_id):
    """Return a deterministic hex color string for a token id.

    The original used ``hash(str(token))``, but str hashing is salted per
    process (PYTHONHASHSEED), so colors changed on every run. A Knuth
    multiplicative hash gives stable, well-spread colors.
    """
    return f"#{(token_id * 2654435761) % 0xFFFFFF:06x}"


def encode_text(text, tokenizer):
    """Encode Telugu text and return display artifacts for the UI.

    Args:
        text: The input Telugu text.
        tokenizer: A loaded ``BasicTokenizer``.

    Returns:
        Tuple of (token-id list as a string, statistics text,
        list of (token_text, color) pairs for ``gr.HighlightedText``).
        On error, returns placeholder strings and an empty list instead
        of raising, so the UI stays responsive.
    """
    if not text.strip():
        return ("Please enter some Telugu text", "No statistics available", [])
    try:
        encoded = tokenizer.encode(text)

        # Compression statistics; token ids are assumed to cost 2 bytes each.
        original_size = len(text.encode('utf-8'))
        encoded_size = len(encoded) * 2
        # Guard against an empty encoding to avoid ZeroDivisionError.
        compression_ratio = original_size / encoded_size if encoded_size else 0.0
        space_saved = (
            (1 - encoded_size / original_size) * 100 if original_size else 0.0
        )

        stats = f"""
        📊 Encoding Statistics:
        • Original text length: {len(text)} characters
        • Encoded length: {len(encoded)} tokens
        • Compression ratio: {compression_ratio:.2f}X
        • Original size: {original_size} bytes
        • Encoded size: {encoded_size} bytes
        • Space saved: {space_saved:.1f}%
        """

        # One stable color per distinct token id.
        color_map = {token: _token_color(token) for token in set(encoded)}

        # Build (token_text, color) pairs for the highlighted-text widget.
        visualization = []
        for token_id in encoded:
            token_bytes = tokenizer.vocab[token_id]
            token_text = token_bytes.decode('utf-8', errors='replace')
            visualization.append((token_text, color_map[token_id]))

        return (str(encoded), stats, visualization)
    except Exception as e:
        return (f"Error: {str(e)}", "Error occurred during encoding", [])


def decode_ids(encoded_ids_str):
    """Decode a string like ``"[1, 2, 3]"`` back into Telugu text.

    Uses ``ast.literal_eval`` instead of ``eval``: this string comes
    straight from the UI, and ``eval`` would execute arbitrary code.
    """
    if not encoded_ids_str.strip():
        return "Please enter encoded IDs"
    try:
        # Safely parse the literal list; rejects anything non-literal.
        encoded_ids = ast.literal_eval(encoded_ids_str)
        if not isinstance(encoded_ids, list) or not all(
            isinstance(i, int) for i in encoded_ids
        ):
            return "Invalid input: Please enter a list of integers"

        # Relies on the module-level `tokenizer` loaded below.
        decoded_text = tokenizer.decode(encoded_ids)
        return decoded_text
    except Exception as e:
        return f"Error during decoding: {str(e)}"


# Load the tokenizer once at import time; fail fast if the model is missing.
try:
    model_path = os.path.join(
        current_dir, "models", "version_2", "checkpoints", "telugu_basic.model"
    )
    vocab_path = os.path.join(
        current_dir, "models", "version_2", "vocabulary", "vocabulary.json"
    )
    print(f"Loading model from: {model_path}")
    print(f"Loading vocabulary from: {vocab_path}")
    tokenizer = load_tokenizer(model_path, vocab_path)
    print("Tokenizer loaded successfully")
except Exception as e:
    print(f"Error loading tokenizer: {str(e)}")
    raise

# Example inputs
encoder_examples = [
    ["తెలుగు భాష చాలా అందమైనది", "Basic sentence example"],
    ["నేను తెలుగు నేర్చుకుంటున్నాను", "Learning Telugu example"],
    ["ప్రతి ఒక్కరూ సంతోషంగా ఉండాలి", "Happiness wish example"],
    ["అరణ్యంలో రాముడు అనేక రాక్షసులను సంహరిస్తాడు", "Complex sentence example"],
    ["తెలుగు సాహిత్యం చాలా సమృద్ధిగా ఉంది", "Literature example"],
]

decoder_examples = [
    ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260]",
     "Basic sentence decoding"],
    ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260, 287, 2206]",
     "Multiple tokens decoding"],
]

# Create the Gradio interface
with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔤 Telugu Text Tokenizer

    This tool helps you encode Telugu text into tokens and decode them back.
    It uses a trained BPE (Byte Pair Encoding) tokenizer optimized for Telugu language.

    ## Features:
    - 🔄 Encode Telugu text to token IDs
    - 📊 View compression statistics
    - 🎨 Visualize token segmentation
    - ⚡ Fast and efficient encoding/decoding
    """)

    with gr.Tab("Encoder"):
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(
                    label="Enter Telugu Text",
                    placeholder="Type or paste Telugu text here...",
                    lines=5,
                    interactive=True,
                )
                encode_btn = gr.Button("🔄 Encode", variant="primary")
            with gr.Column():
                with gr.Row():
                    encoded_output = gr.Textbox(
                        label="Encoded Token IDs",
                        lines=5,
                        interactive=False,
                        show_copy_button=True,
                    )
                    stats_output = gr.Textbox(
                        label="Statistics",
                        lines=8,
                        interactive=False,
                    )
                with gr.Row():
                    token_viz = gr.HighlightedText(
                        label="Token Segmentation",
                        show_legend=True,
                        combine_adjacent=True,
                        color_map={},
                    )

        # Encoder button click event
        encode_btn.click(
            fn=lambda text: encode_text(text, tokenizer),
            inputs=[input_text],
            outputs=[encoded_output, stats_output, token_viz],
        )

        # Examples for encoder
        gr.Examples(
            examples=encoder_examples,
            inputs=input_text,
            outputs=[encoded_output, stats_output, token_viz],
            fn=lambda x: encode_text(x, tokenizer),
            cache_examples=True,
            label="Telugu Text Examples",
        )

    with gr.Tab("Decoder"):
        with gr.Row():
            with gr.Column():
                encoded_input = gr.Textbox(
                    label="Enter Encoded Token IDs",
                    placeholder="Paste the encoded token IDs here...",
                    lines=5,
                    interactive=True,
                )
                decode_btn = gr.Button("🔄 Decode", variant="primary")
            with gr.Column():
                decoded_output = gr.Textbox(
                    label="Decoded Telugu Text",
                    lines=5,
                    interactive=False,
                )

        # Decoder button click event
        decode_btn.click(
            fn=decode_ids,
            inputs=[encoded_input],
            outputs=[decoded_output],
        )

        # Examples for decoder
        gr.Examples(
            examples=decoder_examples,
            inputs=encoded_input,
            outputs=decoded_output,
            fn=decode_ids,
            cache_examples=True,
            label="Token ID Examples",
        )

    gr.Markdown("""
    ### 📝 Instructions:
    1. **Encoding**:
       - Enter Telugu text in the encoder tab
       - Click "Encode" to get token IDs and statistics
       - Try the examples below to see how different texts are encoded

    2. **Decoding**:
       - Copy the encoded IDs from the encoder output
       - Paste them in the decoder tab
       - Click "Decode" to get back the original text
       - Try the example token IDs to see how decoding works

    3. **Visualization**:
       - Each token is highlighted with a unique color
       - Same tokens will have the same color
       - Hover over tokens to see their IDs

    ### 🎯 Example Usage:
    - Try encoding "తెలుగు" to see how basic words are tokenized
    - Use longer sentences to see compression in action
    - Copy encoded IDs and decode them back to verify accuracy

    ### ℹ️ Notes:
    - The tokenizer uses BPE (Byte Pair Encoding) algorithm
    - Compression ratio shows how efficiently the text is encoded
    - Different colors in visualization represent different tokens
    - Typical compression ratios range from 3x to 4x
    """)

    gr.Markdown("""
    ---
    ### 📌 Version Information
    - Model Version: 2.0
    - Vocabulary Size: 4800 tokens
    - Last Updated: 2024
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch(
        share=True,
        debug=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )