Spaces:

kishkath
/

bpe-tokenizer

Sleeping

App Files Community

kishkath committed on Jan 15

Commit

2eb6bed

verified ·

1 Parent(s): cfb25fd

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -8

app.py CHANGED Viewed

@@ -1,13 +1,25 @@
 import gradio as gr
 import json
 import os
-from tokenizers.basic import BasicTokenizer
 import numpy as np
 def load_tokenizer(model_path, vocab_path):
     """Load the trained tokenizer"""
     tokenizer = BasicTokenizer()
     try:
         # Load the trained model
         tokenizer.load(model_path)
@@ -20,7 +32,7 @@ def load_tokenizer(model_path, vocab_path):
                               for k, v in vocab_data['merges'].items()}
         return tokenizer
     except Exception as e:
-        raise Exception(f"Error loading tokenizer: {e}")
 def encode_text(text, tokenizer):
     """Encode text and return statistics"""
@@ -103,10 +115,19 @@ def visualize_encoding(text, encoded_ids, tokenizer):
         "colors": colors
     }
-# Load the tokenizer
-model_path = "models/version_2/checkpoints/telugu_basic.model"
-vocab_path = "models/version_2/vocabulary/vocabulary.json"
-tokenizer = load_tokenizer(model_path, vocab_path)
 # Create the Gradio interface
 with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
@@ -194,6 +215,12 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
     - Different colors in visualization represent different tokens
     """)
-# Launch the app
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 import json
 import os
+import sys
 import numpy as np
+# Add the current directory to Python path
+current_dir = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(current_dir)
+from tokenizers.basic import BasicTokenizer
 def load_tokenizer(model_path, vocab_path):
     """Load the trained tokenizer"""
     tokenizer = BasicTokenizer()
     try:
+        # Check if paths exist
+        if not os.path.exists(model_path):
+            raise FileNotFoundError(f"Model file not found at: {model_path}")
+        if not os.path.exists(vocab_path):
+            raise FileNotFoundError(f"Vocabulary file not found at: {vocab_path}")
         # Load the trained model
         tokenizer.load(model_path)
                               for k, v in vocab_data['merges'].items()}
         return tokenizer
     except Exception as e:
+        raise Exception(f"Error loading tokenizer: {str(e)}")
 def encode_text(text, tokenizer):
     """Encode text and return statistics"""
         "colors": colors
     }
+# Load the tokenizer with proper path handling
+try:
+    model_path = os.path.join(current_dir, "models", "version_2", "checkpoints", "telugu_basic.model")
+    vocab_path = os.path.join(current_dir, "models", "version_2", "vocabulary", "vocabulary.json")
+    print(f"Loading model from: {model_path}")
+    print(f"Loading vocabulary from: {vocab_path}")
+    tokenizer = load_tokenizer(model_path, vocab_path)
+    print("Tokenizer loaded successfully")
+except Exception as e:
+    print(f"Error loading tokenizer: {str(e)}")
+    raise
 # Create the Gradio interface
 with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
     - Different colors in visualization represent different tokens
     """)
+# Launch the app with additional configurations
 if __name__ == "__main__":
+    demo.launch(
+        share=True,
+        debug=True,
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True
+    )