kishkath committed on
Commit
2eb6bed
·
verified ·
1 Parent(s): cfb25fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -8
app.py CHANGED
@@ -1,13 +1,25 @@
1
  import gradio as gr
2
  import json
3
  import os
4
- from tokenizers.basic import BasicTokenizer
5
  import numpy as np
6
 
 
 
 
 
 
 
7
  def load_tokenizer(model_path, vocab_path):
8
  """Load the trained tokenizer"""
9
  tokenizer = BasicTokenizer()
10
  try:
 
 
 
 
 
 
11
  # Load the trained model
12
  tokenizer.load(model_path)
13
 
@@ -20,7 +32,7 @@ def load_tokenizer(model_path, vocab_path):
20
  for k, v in vocab_data['merges'].items()}
21
  return tokenizer
22
  except Exception as e:
23
- raise Exception(f"Error loading tokenizer: {e}")
24
 
25
  def encode_text(text, tokenizer):
26
  """Encode text and return statistics"""
@@ -103,10 +115,19 @@ def visualize_encoding(text, encoded_ids, tokenizer):
103
  "colors": colors
104
  }
105
 
106
- # Load the tokenizer
107
- model_path = "models/version_2/checkpoints/telugu_basic.model"
108
- vocab_path = "models/version_2/vocabulary/vocabulary.json"
109
- tokenizer = load_tokenizer(model_path, vocab_path)
 
 
 
 
 
 
 
 
 
110
 
111
  # Create the Gradio interface
112
  with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
@@ -194,6 +215,12 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
194
  - Different colors in visualization represent different tokens
195
  """)
196
 
197
- # Launch the app
198
  if __name__ == "__main__":
199
- demo.launch()
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import json
3
  import os
4
+ import sys
5
  import numpy as np
6
 
7
+ # Add the current directory to Python path
8
+ current_dir = os.path.dirname(os.path.abspath(__file__))
9
+ sys.path.append(current_dir)
10
+
11
+ from tokenizers.basic import BasicTokenizer
12
+
13
  def load_tokenizer(model_path, vocab_path):
14
  """Load the trained tokenizer"""
15
  tokenizer = BasicTokenizer()
16
  try:
17
+ # Check if paths exist
18
+ if not os.path.exists(model_path):
19
+ raise FileNotFoundError(f"Model file not found at: {model_path}")
20
+ if not os.path.exists(vocab_path):
21
+ raise FileNotFoundError(f"Vocabulary file not found at: {vocab_path}")
22
+
23
  # Load the trained model
24
  tokenizer.load(model_path)
25
 
 
32
  for k, v in vocab_data['merges'].items()}
33
  return tokenizer
34
  except Exception as e:
35
+ raise Exception(f"Error loading tokenizer: {str(e)}")
36
 
37
  def encode_text(text, tokenizer):
38
  """Encode text and return statistics"""
 
115
  "colors": colors
116
  }
117
 
118
+ # Load the tokenizer with proper path handling
119
+ try:
120
+ model_path = os.path.join(current_dir, "models", "version_2", "checkpoints", "telugu_basic.model")
121
+ vocab_path = os.path.join(current_dir, "models", "version_2", "vocabulary", "vocabulary.json")
122
+
123
+ print(f"Loading model from: {model_path}")
124
+ print(f"Loading vocabulary from: {vocab_path}")
125
+
126
+ tokenizer = load_tokenizer(model_path, vocab_path)
127
+ print("Tokenizer loaded successfully")
128
+ except Exception as e:
129
+ print(f"Error loading tokenizer: {str(e)}")
130
+ raise
131
 
132
  # Create the Gradio interface
133
  with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
 
215
  - Different colors in visualization represent different tokens
216
  """)
217
 
218
+ # Launch the app with additional configurations
219
  if __name__ == "__main__":
220
+ demo.launch(
221
+ share=True,
222
+ debug=True,
223
+ server_name="0.0.0.0",
224
+ server_port=7860,
225
+ show_error=True
226
+ )