kishkath committed on
Commit
f53ddeb
·
verified ·
1 Parent(s): 5740893

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -19
app.py CHANGED
@@ -37,11 +37,9 @@ def load_tokenizer(model_path, vocab_path):
37
  def encode_text(text, tokenizer):
38
  """Encode text and return statistics"""
39
  if not text.strip():
40
- return {
41
- "encoded_ids": "Please enter some Telugu text",
42
- "stats": "No statistics available",
43
- "visualization": None
44
- }
45
 
46
  try:
47
  # Encode the text
@@ -63,20 +61,35 @@ def encode_text(text, tokenizer):
63
  • Space saved: {(1 - encoded_size/original_size) * 100:.1f}%
64
  """
65
 
66
- # Create token visualization
67
- viz_data = visualize_encoding(text, encoded, tokenizer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
- return {
70
- "encoded_ids": str(encoded),
71
- "stats": stats,
72
- "visualization": viz_data
73
- }
74
  except Exception as e:
75
- return {
76
- "encoded_ids": f"Error: {str(e)}",
77
- "stats": "Error occurred during encoding",
78
- "visualization": None
79
- }
80
 
81
  def decode_ids(encoded_ids_str, tokenizer):
82
  """Decode the encoded IDs back to text"""
@@ -192,8 +205,8 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
192
 
193
  # Set up event handlers
194
  encode_btn.click(
195
- fn=lambda text: encode_text(text, tokenizer),
196
- inputs=input_text,
197
  outputs=[encoded_output, stats_output, token_viz]
198
  )
199
 
 
37
  def encode_text(text, tokenizer):
38
  """Encode text and return statistics"""
39
  if not text.strip():
40
+ return ("Please enter some Telugu text",
41
+ "No statistics available",
42
+ None)
 
 
43
 
44
  try:
45
  # Encode the text
 
61
  • Space saved: {(1 - encoded_size/original_size) * 100:.1f}%
62
  """
63
 
64
+ # Create visualization data
65
+ tokens = []
66
+ colors = []
67
+
68
+ # Assign a random RGB color to each unique token ID
69
+ unique_tokens = set(encoded)
70
+ color_map = {token: np.random.rand(3).tolist() for token in unique_tokens}
71
+
72
+ for token_id in encoded:
73
+ token_bytes = tokenizer.vocab[token_id]
74
+ token_text = token_bytes.decode('utf-8', errors='replace')
75
+ tokens.append(token_text)
76
+ colors.append(color_map[token_id])
77
+
78
+ # Create visualization list for HighlightedText
79
+ visualization = [(token, color) for token, color in zip(tokens, colors)]
80
+
81
+ return (
82
+ str(encoded), # encoded_ids for the first textbox
83
+ stats, # stats for the second textbox
84
+ visualization # for the HighlightedText component
85
+ )
86
 
 
 
 
 
 
87
  except Exception as e:
88
+ return (
89
+ f"Error: {str(e)}",
90
+ "Error occurred during encoding",
91
+ None
92
+ )
93
 
94
  def decode_ids(encoded_ids_str, tokenizer):
95
  """Decode the encoded IDs back to text"""
 
205
 
206
  # Set up event handlers
207
  encode_btn.click(
208
+ fn=encode_text, # Now using the function directly
209
+ inputs=[input_text, gr.State(tokenizer)], # Pass tokenizer as state
210
  outputs=[encoded_output, stats_output, token_viz]
211
  )
212