kishkath committed on
Commit
d6e7dc9
·
verified ·
1 Parent(s): dcb5a2c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -39
app.py CHANGED
@@ -39,7 +39,7 @@ def encode_text(text, tokenizer):
39
  if not text.strip():
40
  return ("Please enter some Telugu text",
41
  "No statistics available",
42
- []) # Empty list for visualization
43
 
44
  try:
45
  # Encode the text
@@ -66,7 +66,7 @@ def encode_text(text, tokenizer):
66
 
67
  # Assign each unique token ID a hex color derived from its hash
68
  unique_tokens = set(encoded)
69
- # Create color map with string hex colors instead of RGB lists
70
  color_map = {token: f"#{hash(str(token)) % 0xFFFFFF:06x}" for token in unique_tokens}
71
 
72
  # Create visualization list with proper format
@@ -77,19 +77,19 @@ def encode_text(text, tokenizer):
77
  visualization.append((token_text, color_map[token_id]))
78
 
79
  return (
80
- str(encoded), # encoded_ids for the first textbox
81
- stats, # stats for the second textbox
82
- visualization # for the HighlightedText component
83
  )
84
 
85
  except Exception as e:
86
  return (
87
  f"Error: {str(e)}",
88
  "Error occurred during encoding",
89
- [] # Empty list for visualization on error
90
  )
91
 
92
- def decode_ids(encoded_ids_str, tokenizer):
93
  """Decode the encoded IDs back to text"""
94
  if not encoded_ids_str.strip():
95
  return "Please enter encoded IDs"
@@ -106,27 +106,7 @@ def decode_ids(encoded_ids_str, tokenizer):
106
  except Exception as e:
107
  return f"Error during decoding: {str(e)}"
108
 
109
- def visualize_encoding(text, encoded_ids, tokenizer):
110
- """Create a visual representation of the encoding"""
111
- tokens = []
112
- colors = []
113
-
114
- # Generate colors based on token frequencies
115
- unique_tokens = set(encoded_ids)
116
- color_map = {token: np.random.rand(3).tolist() for token in unique_tokens}
117
-
118
- for token_id in encoded_ids:
119
- token_bytes = tokenizer.vocab[token_id]
120
- token_text = token_bytes.decode('utf-8', errors='replace')
121
- tokens.append(token_text)
122
- colors.append(color_map[token_id])
123
-
124
- return {
125
- "tokens": tokens,
126
- "colors": colors
127
- }
128
-
129
- # Load the tokenizer with proper path handling
130
  try:
131
  model_path = os.path.join(current_dir, "models", "version_2", "checkpoints", "telugu_basic.model")
132
  vocab_path = os.path.join(current_dir, "models", "version_2", "vocabulary", "vocabulary.json")
@@ -140,7 +120,7 @@ except Exception as e:
140
  print(f"Error loading tokenizer: {str(e)}")
141
  raise
142
 
143
- # Add example inputs for the encoder
144
  encoder_examples = [
145
  ["తెలుగు భాష చాలా అందమైనది", "Basic sentence example"],
146
  ["నేను తెలుగు నేర్చుకుంటున్నాను", "Learning Telugu example"],
@@ -149,7 +129,6 @@ encoder_examples = [
149
  ["తెలుగు సాహిత్యం చాలా సమృద్ధిగా ఉంది", "Literature example"]
150
  ]
151
 
152
- # Add example inputs for the decoder
153
  decoder_examples = [
154
  ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260]", "Basic sentence decoding"],
155
  ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260, 287, 2206]", "Multiple tokens decoding"],
@@ -176,7 +155,8 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
176
  input_text = gr.Textbox(
177
  label="Enter Telugu Text",
178
  placeholder="Type or paste Telugu text here...",
179
- lines=5
 
180
  )
181
  encode_btn = gr.Button("🔄 Encode", variant="primary")
182
 
@@ -193,15 +173,21 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
193
  )
194
 
195
  with gr.Row():
196
- gr.Markdown("### Token Visualization")
197
  token_viz = gr.HighlightedText(
198
  label="Token Segmentation",
199
  show_legend=True,
200
  combine_adjacent=True,
201
  color_map={}
202
  )
 
 
 
 
 
 
 
203
 
204
- # Add examples for encoder
205
  gr.Examples(
206
  examples=encoder_examples,
207
  inputs=input_text,
@@ -217,7 +203,8 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
217
  encoded_input = gr.Textbox(
218
  label="Enter Encoded Token IDs",
219
  placeholder="Paste the encoded token IDs here...",
220
- lines=5
 
221
  )
222
  decode_btn = gr.Button("🔄 Decode", variant="primary")
223
 
@@ -228,17 +215,23 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
228
  interactive=False
229
  )
230
 
231
- # Add examples for decoder
 
 
 
 
 
 
 
232
  gr.Examples(
233
  examples=decoder_examples,
234
  inputs=encoded_input,
235
  outputs=decoded_output,
236
- fn=lambda x: decode_ids(x, tokenizer),
237
  cache_examples=True,
238
  label="Token ID Examples"
239
  )
240
 
241
- # Add more detailed instructions with examples
242
  gr.Markdown("""
243
  ### 📝 Instructions:
244
  1. **Encoding**:
@@ -269,7 +262,6 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
269
  - Typical compression ratios range from 3x to 4x
270
  """)
271
 
272
- # Add a footer with version info
273
  gr.Markdown("""
274
  ---
275
  ### 📌 Version Information
@@ -278,7 +270,7 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
278
  - Last Updated: 2024
279
  """)
280
 
281
- # Launch the app with additional configurations
282
  if __name__ == "__main__":
283
  demo.launch(
284
  share=True,
 
39
  if not text.strip():
40
  return ("Please enter some Telugu text",
41
  "No statistics available",
42
+ [])
43
 
44
  try:
45
  # Encode the text
 
66
 
67
  # Assign each unique token ID a hex color derived from its hash
68
  unique_tokens = set(encoded)
69
+ # Create color map with string hex colors
70
  color_map = {token: f"#{hash(str(token)) % 0xFFFFFF:06x}" for token in unique_tokens}
71
 
72
  # Create visualization list with proper format
 
77
  visualization.append((token_text, color_map[token_id]))
78
 
79
  return (
80
+ str(encoded),
81
+ stats,
82
+ visualization
83
  )
84
 
85
  except Exception as e:
86
  return (
87
  f"Error: {str(e)}",
88
  "Error occurred during encoding",
89
+ []
90
  )
91
 
92
+ def decode_ids(encoded_ids_str):
93
  """Decode the encoded IDs back to text"""
94
  if not encoded_ids_str.strip():
95
  return "Please enter encoded IDs"
 
106
  except Exception as e:
107
  return f"Error during decoding: {str(e)}"
108
 
109
+ # Load the tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  try:
111
  model_path = os.path.join(current_dir, "models", "version_2", "checkpoints", "telugu_basic.model")
112
  vocab_path = os.path.join(current_dir, "models", "version_2", "vocabulary", "vocabulary.json")
 
120
  print(f"Error loading tokenizer: {str(e)}")
121
  raise
122
 
123
+ # Example inputs
124
  encoder_examples = [
125
  ["తెలుగు భాష చాలా అందమైనది", "Basic sentence example"],
126
  ["నేను తెలుగు నేర్చుకుంటున్నాను", "Learning Telugu example"],
 
129
  ["తెలుగు సాహిత్యం చాలా సమృద్ధిగా ఉంది", "Literature example"]
130
  ]
131
 
 
132
  decoder_examples = [
133
  ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260]", "Basic sentence decoding"],
134
  ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260, 287, 2206]", "Multiple tokens decoding"],
 
155
  input_text = gr.Textbox(
156
  label="Enter Telugu Text",
157
  placeholder="Type or paste Telugu text here...",
158
+ lines=5,
159
+ interactive=True
160
  )
161
  encode_btn = gr.Button("🔄 Encode", variant="primary")
162
 
 
173
  )
174
 
175
  with gr.Row():
 
176
  token_viz = gr.HighlightedText(
177
  label="Token Segmentation",
178
  show_legend=True,
179
  combine_adjacent=True,
180
  color_map={}
181
  )
182
+
183
+ # Encoder button click event
184
+ encode_btn.click(
185
+ fn=lambda text: encode_text(text, tokenizer),
186
+ inputs=[input_text],
187
+ outputs=[encoded_output, stats_output, token_viz]
188
+ )
189
 
190
+ # Examples for encoder
191
  gr.Examples(
192
  examples=encoder_examples,
193
  inputs=input_text,
 
203
  encoded_input = gr.Textbox(
204
  label="Enter Encoded Token IDs",
205
  placeholder="Paste the encoded token IDs here...",
206
+ lines=5,
207
+ interactive=True
208
  )
209
  decode_btn = gr.Button("🔄 Decode", variant="primary")
210
 
 
215
  interactive=False
216
  )
217
 
218
+ # Decoder button click event
219
+ decode_btn.click(
220
+ fn=decode_ids,
221
+ inputs=[encoded_input],
222
+ outputs=[decoded_output]
223
+ )
224
+
225
+ # Examples for decoder
226
  gr.Examples(
227
  examples=decoder_examples,
228
  inputs=encoded_input,
229
  outputs=decoded_output,
230
+ fn=decode_ids,
231
  cache_examples=True,
232
  label="Token ID Examples"
233
  )
234
 
 
235
  gr.Markdown("""
236
  ### 📝 Instructions:
237
  1. **Encoding**:
 
262
  - Typical compression ratios range from 3x to 4x
263
  """)
264
 
 
265
  gr.Markdown("""
266
  ---
267
  ### 📌 Version Information
 
270
  - Last Updated: 2024
271
  """)
272
 
273
+ # Launch the app
274
  if __name__ == "__main__":
275
  demo.launch(
276
  share=True,