kishkath committed on
Commit
b115eae
·
verified ·
1 Parent(s): f53ddeb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -13
app.py CHANGED
@@ -39,7 +39,7 @@ def encode_text(text, tokenizer):
39
  if not text.strip():
40
  return ("Please enter some Telugu text",
41
  "No statistics available",
42
- None)
43
 
44
  try:
45
  # Encode the text
@@ -63,32 +63,30 @@ def encode_text(text, tokenizer):
63
 
64
  # Create visualization data
65
  tokens = []
66
- colors = []
67
 
68
  # Generate colors based on token frequencies
69
  unique_tokens = set(encoded)
70
- color_map = {token: np.random.rand(3).tolist() for token in unique_tokens}
 
71
 
 
 
72
  for token_id in encoded:
73
  token_bytes = tokenizer.vocab[token_id]
74
  token_text = token_bytes.decode('utf-8', errors='replace')
75
- tokens.append(token_text)
76
- colors.append(color_map[token_id])
77
-
78
- # Create visualization list for HighlightedText
79
- visualization = [(token, color) for token, color in zip(tokens, colors)]
80
 
81
  return (
82
- str(encoded), # encoded_ids for the first textbox
83
- stats, # stats for the second textbox
84
- visualization # for the HighlightedText component
85
  )
86
 
87
  except Exception as e:
88
  return (
89
  f"Error: {str(e)}",
90
  "Error occurred during encoding",
91
- None
92
  )
93
 
94
  def decode_ids(encoded_ids_str, tokenizer):
@@ -183,7 +181,9 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
183
  gr.Markdown("### Token Visualization")
184
  token_viz = gr.HighlightedText(
185
  label="Token Segmentation",
186
- show_legend=True
 
 
187
  )
188
 
189
  with gr.Tab("Decoder"):
 
39
  if not text.strip():
40
  return ("Please enter some Telugu text",
41
  "No statistics available",
42
+ []) # Empty list for visualization
43
 
44
  try:
45
  # Encode the text
 
63
 
64
  # Create visualization data
65
  tokens = []
 
66
 
67
  # Generate colors based on token frequencies
68
  unique_tokens = set(encoded)
69
+ # Create color map with string hex colors instead of RGB lists
70
+ color_map = {token: f"#{hash(str(token)) % 0xFFFFFF:06x}" for token in unique_tokens}
71
 
72
+ # Create visualization list with proper format
73
+ visualization = []
74
  for token_id in encoded:
75
  token_bytes = tokenizer.vocab[token_id]
76
  token_text = token_bytes.decode('utf-8', errors='replace')
77
+ visualization.append((token_text, color_map[token_id]))
 
 
 
 
78
 
79
  return (
80
+ str(encoded), # encoded_ids for the first textbox
81
+ stats, # stats for the second textbox
82
+ visualization # for the HighlightedText component
83
  )
84
 
85
  except Exception as e:
86
  return (
87
  f"Error: {str(e)}",
88
  "Error occurred during encoding",
89
+ [] # Empty list for visualization on error
90
  )
91
 
92
  def decode_ids(encoded_ids_str, tokenizer):
 
181
  gr.Markdown("### Token Visualization")
182
  token_viz = gr.HighlightedText(
183
  label="Token Segmentation",
184
+ show_legend=True,
185
+ combine_adjacent=True,
186
+ color_map={} # Let Gradio handle the color mapping
187
  )
188
 
189
  with gr.Tab("Decoder"):