Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,13 +1,25 @@
|
|
1 |
import gradio as gr
|
2 |
import json
|
3 |
import os
|
4 |
-
|
5 |
import numpy as np
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
def load_tokenizer(model_path, vocab_path):
|
8 |
"""Load the trained tokenizer"""
|
9 |
tokenizer = BasicTokenizer()
|
10 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
# Load the trained model
|
12 |
tokenizer.load(model_path)
|
13 |
|
@@ -20,7 +32,7 @@ def load_tokenizer(model_path, vocab_path):
|
|
20 |
for k, v in vocab_data['merges'].items()}
|
21 |
return tokenizer
|
22 |
except Exception as e:
|
23 |
-
raise Exception(f"Error loading tokenizer: {e}")
|
24 |
|
25 |
def encode_text(text, tokenizer):
|
26 |
"""Encode text and return statistics"""
|
@@ -103,10 +115,19 @@ def visualize_encoding(text, encoded_ids, tokenizer):
|
|
103 |
"colors": colors
|
104 |
}
|
105 |
|
106 |
-
# Load the tokenizer
|
107 |
-
|
108 |
-
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
# Create the Gradio interface
|
112 |
with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
|
@@ -194,6 +215,12 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
|
|
194 |
- Different colors in visualization represent different tokens
|
195 |
""")
|
196 |
|
197 |
-
# Launch the app
|
198 |
if __name__ == "__main__":
|
199 |
-
demo.launch(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import json
|
3 |
import os
|
4 |
+
import sys
|
5 |
import numpy as np
|
6 |
|
7 |
+
# Add the current directory to Python path
|
8 |
+
current_dir = os.path.dirname(os.path.abspath(__file__))
|
9 |
+
sys.path.append(current_dir)
|
10 |
+
|
11 |
+
from tokenizers.basic import BasicTokenizer
|
12 |
+
|
13 |
def load_tokenizer(model_path, vocab_path):
|
14 |
"""Load the trained tokenizer"""
|
15 |
tokenizer = BasicTokenizer()
|
16 |
try:
|
17 |
+
# Check if paths exist
|
18 |
+
if not os.path.exists(model_path):
|
19 |
+
raise FileNotFoundError(f"Model file not found at: {model_path}")
|
20 |
+
if not os.path.exists(vocab_path):
|
21 |
+
raise FileNotFoundError(f"Vocabulary file not found at: {vocab_path}")
|
22 |
+
|
23 |
# Load the trained model
|
24 |
tokenizer.load(model_path)
|
25 |
|
|
|
32 |
for k, v in vocab_data['merges'].items()}
|
33 |
return tokenizer
|
34 |
except Exception as e:
|
35 |
+
raise Exception(f"Error loading tokenizer: {str(e)}")
|
36 |
|
37 |
def encode_text(text, tokenizer):
|
38 |
"""Encode text and return statistics"""
|
|
|
115 |
"colors": colors
|
116 |
}
|
117 |
|
118 |
+
# Resolve the bundled model/vocabulary files relative to this script and
# load the tokenizer once at import time. Any failure is reported on stdout
# and then re-raised so the app does not start without a tokenizer.
try:
    # Both artifacts live under the same versioned models directory.
    _version_root = os.path.join(current_dir, "models", "version_2")
    model_path = os.path.join(_version_root, "checkpoints", "telugu_basic.model")
    vocab_path = os.path.join(_version_root, "vocabulary", "vocabulary.json")

    # Echo the resolved locations to make path problems easy to diagnose.
    print(f"Loading model from: {model_path}")
    print(f"Loading vocabulary from: {vocab_path}")

    tokenizer = load_tokenizer(model_path, vocab_path)
    print("Tokenizer loaded successfully")
except Exception as e:
    # Log the reason, then propagate the original exception unchanged.
    print(f"Error loading tokenizer: {str(e)}")
    raise
|
131 |
|
132 |
# Create the Gradio interface
|
133 |
with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
|
|
|
215 |
- Different colors in visualization represent different tokens
|
216 |
""")
|
217 |
|
218 |
+
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    # Launch settings collected in one place so they are easy to audit.
    # NOTE(review): share=True requests a public share link and
    # server_name="0.0.0.0" listens on all interfaces -- confirm both are
    # intended for the target hosting environment.
    launch_options = {
        "share": True,
        "debug": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)
|