Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -39,7 +39,7 @@ def encode_text(text, tokenizer):
|
|
39 |
if not text.strip():
|
40 |
return ("Please enter some Telugu text",
|
41 |
"No statistics available",
|
42 |
-
[])
|
43 |
|
44 |
try:
|
45 |
# Encode the text
|
@@ -66,7 +66,7 @@ def encode_text(text, tokenizer):
|
|
66 |
|
67 |
# Assign one color per unique token ID (derived from a hash of the ID, not from token frequency)
|
68 |
unique_tokens = set(encoded)
|
69 |
-
# Create color map with string hex colors
|
70 |
color_map = {token: f"#{hash(str(token)) % 0xFFFFFF:06x}" for token in unique_tokens}
|
71 |
|
72 |
# Create visualization list with proper format
|
@@ -77,19 +77,19 @@ def encode_text(text, tokenizer):
|
|
77 |
visualization.append((token_text, color_map[token_id]))
|
78 |
|
79 |
return (
|
80 |
-
str(encoded),
|
81 |
-
stats,
|
82 |
-
visualization
|
83 |
)
|
84 |
|
85 |
except Exception as e:
|
86 |
return (
|
87 |
f"Error: {str(e)}",
|
88 |
"Error occurred during encoding",
|
89 |
-
[]
|
90 |
)
|
91 |
|
92 |
-
def decode_ids(encoded_ids_str, tokenizer):
|
93 |
"""Decode the encoded IDs back to text"""
|
94 |
if not encoded_ids_str.strip():
|
95 |
return "Please enter encoded IDs"
|
@@ -106,27 +106,7 @@ def decode_ids(encoded_ids_str, tokenizer):
|
|
106 |
except Exception as e:
|
107 |
return f"Error during decoding: {str(e)}"
|
108 |
|
109 |
-
|
110 |
-
"""Create a visual representation of the encoding"""
|
111 |
-
tokens = []
|
112 |
-
colors = []
|
113 |
-
|
114 |
-
# Generate colors based on token frequencies
|
115 |
-
unique_tokens = set(encoded_ids)
|
116 |
-
color_map = {token: np.random.rand(3).tolist() for token in unique_tokens}
|
117 |
-
|
118 |
-
for token_id in encoded_ids:
|
119 |
-
token_bytes = tokenizer.vocab[token_id]
|
120 |
-
token_text = token_bytes.decode('utf-8', errors='replace')
|
121 |
-
tokens.append(token_text)
|
122 |
-
colors.append(color_map[token_id])
|
123 |
-
|
124 |
-
return {
|
125 |
-
"tokens": tokens,
|
126 |
-
"colors": colors
|
127 |
-
}
|
128 |
-
|
129 |
-
# Load the tokenizer with proper path handling
|
130 |
try:
|
131 |
model_path = os.path.join(current_dir, "models", "version_2", "checkpoints", "telugu_basic.model")
|
132 |
vocab_path = os.path.join(current_dir, "models", "version_2", "vocabulary", "vocabulary.json")
|
@@ -140,7 +120,7 @@ except Exception as e:
|
|
140 |
print(f"Error loading tokenizer: {str(e)}")
|
141 |
raise
|
142 |
|
143 |
-
#
|
144 |
encoder_examples = [
|
145 |
["తెలుగు భాష చాలా అందమైనది", "Basic sentence example"],
|
146 |
["నేను తెలుగు నేర్చుకుంటున్నాను", "Learning Telugu example"],
|
@@ -149,7 +129,6 @@ encoder_examples = [
|
|
149 |
["తెలుగు సాహిత్యం చాలా సమృద్ధిగా ఉంది", "Literature example"]
|
150 |
]
|
151 |
|
152 |
-
# Add example inputs for the decoder
|
153 |
decoder_examples = [
|
154 |
["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260]", "Basic sentence decoding"],
|
155 |
["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260, 287, 2206]", "Multiple tokens decoding"],
|
@@ -176,7 +155,8 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
|
|
176 |
input_text = gr.Textbox(
|
177 |
label="Enter Telugu Text",
|
178 |
placeholder="Type or paste Telugu text here...",
|
179 |
-
lines=5
|
|
|
180 |
)
|
181 |
encode_btn = gr.Button("🔄 Encode", variant="primary")
|
182 |
|
@@ -193,15 +173,21 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
|
|
193 |
)
|
194 |
|
195 |
with gr.Row():
|
196 |
-
gr.Markdown("### Token Visualization")
|
197 |
token_viz = gr.HighlightedText(
|
198 |
label="Token Segmentation",
|
199 |
show_legend=True,
|
200 |
combine_adjacent=True,
|
201 |
color_map={}
|
202 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
|
204 |
-
#
|
205 |
gr.Examples(
|
206 |
examples=encoder_examples,
|
207 |
inputs=input_text,
|
@@ -217,7 +203,8 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
|
|
217 |
encoded_input = gr.Textbox(
|
218 |
label="Enter Encoded Token IDs",
|
219 |
placeholder="Paste the encoded token IDs here...",
|
220 |
-
lines=5
|
|
|
221 |
)
|
222 |
decode_btn = gr.Button("🔄 Decode", variant="primary")
|
223 |
|
@@ -228,17 +215,23 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
|
|
228 |
interactive=False
|
229 |
)
|
230 |
|
231 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
gr.Examples(
|
233 |
examples=decoder_examples,
|
234 |
inputs=encoded_input,
|
235 |
outputs=decoded_output,
|
236 |
-
fn=
|
237 |
cache_examples=True,
|
238 |
label="Token ID Examples"
|
239 |
)
|
240 |
|
241 |
-
# Add more detailed instructions with examples
|
242 |
gr.Markdown("""
|
243 |
### 📝 Instructions:
|
244 |
1. **Encoding**:
|
@@ -269,7 +262,6 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
|
|
269 |
- Typical compression ratios range from 3x to 4x
|
270 |
""")
|
271 |
|
272 |
-
# Add a footer with version info
|
273 |
gr.Markdown("""
|
274 |
---
|
275 |
### 📌 Version Information
|
@@ -278,7 +270,7 @@ with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
|
|
278 |
- Last Updated: 2024
|
279 |
""")
|
280 |
|
281 |
-
# Launch the app
|
282 |
if __name__ == "__main__":
|
283 |
demo.launch(
|
284 |
share=True,
|
|
|
39 |
if not text.strip():
|
40 |
return ("Please enter some Telugu text",
|
41 |
"No statistics available",
|
42 |
+
[])
|
43 |
|
44 |
try:
|
45 |
# Encode the text
|
|
|
66 |
|
67 |
# Assign one color per unique token ID (derived from a hash of the ID, not from token frequency)
|
68 |
unique_tokens = set(encoded)
|
69 |
+
# Create color map with string hex colors
|
70 |
color_map = {token: f"#{hash(str(token)) % 0xFFFFFF:06x}" for token in unique_tokens}
|
71 |
|
72 |
# Create visualization list with proper format
|
|
|
77 |
visualization.append((token_text, color_map[token_id]))
|
78 |
|
79 |
return (
|
80 |
+
str(encoded),
|
81 |
+
stats,
|
82 |
+
visualization
|
83 |
)
|
84 |
|
85 |
except Exception as e:
|
86 |
return (
|
87 |
f"Error: {str(e)}",
|
88 |
"Error occurred during encoding",
|
89 |
+
[]
|
90 |
)
|
91 |
|
92 |
+
def decode_ids(encoded_ids_str):
|
93 |
"""Decode the encoded IDs back to text"""
|
94 |
if not encoded_ids_str.strip():
|
95 |
return "Please enter encoded IDs"
|
|
|
106 |
except Exception as e:
|
107 |
return f"Error during decoding: {str(e)}"
|
108 |
|
109 |
+
# Load the tokenizer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
try:
|
111 |
model_path = os.path.join(current_dir, "models", "version_2", "checkpoints", "telugu_basic.model")
|
112 |
vocab_path = os.path.join(current_dir, "models", "version_2", "vocabulary", "vocabulary.json")
|
|
|
120 |
print(f"Error loading tokenizer: {str(e)}")
|
121 |
raise
|
122 |
|
123 |
+
# Example inputs
|
124 |
encoder_examples = [
|
125 |
["తెలుగు భాష చాలా అందమైనది", "Basic sentence example"],
|
126 |
["నేను తెలుగు నేర్చుకుంటున్నాను", "Learning Telugu example"],
|
|
|
129 |
["తెలుగు సాహిత్యం చాలా సమృద్ధిగా ఉంది", "Literature example"]
|
130 |
]
|
131 |
|
|
|
132 |
decoder_examples = [
|
133 |
["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260]", "Basic sentence decoding"],
|
134 |
["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260, 287, 2206]", "Multiple tokens decoding"],
|
|
|
155 |
input_text = gr.Textbox(
|
156 |
label="Enter Telugu Text",
|
157 |
placeholder="Type or paste Telugu text here...",
|
158 |
+
lines=5,
|
159 |
+
interactive=True
|
160 |
)
|
161 |
encode_btn = gr.Button("🔄 Encode", variant="primary")
|
162 |
|
|
|
173 |
)
|
174 |
|
175 |
with gr.Row():
|
|
|
176 |
token_viz = gr.HighlightedText(
|
177 |
label="Token Segmentation",
|
178 |
show_legend=True,
|
179 |
combine_adjacent=True,
|
180 |
color_map={}
|
181 |
)
|
182 |
+
|
183 |
+
# Encoder button click event
|
184 |
+
encode_btn.click(
|
185 |
+
fn=lambda text: encode_text(text, tokenizer),
|
186 |
+
inputs=[input_text],
|
187 |
+
outputs=[encoded_output, stats_output, token_viz]
|
188 |
+
)
|
189 |
|
190 |
+
# Examples for encoder
|
191 |
gr.Examples(
|
192 |
examples=encoder_examples,
|
193 |
inputs=input_text,
|
|
|
203 |
encoded_input = gr.Textbox(
|
204 |
label="Enter Encoded Token IDs",
|
205 |
placeholder="Paste the encoded token IDs here...",
|
206 |
+
lines=5,
|
207 |
+
interactive=True
|
208 |
)
|
209 |
decode_btn = gr.Button("🔄 Decode", variant="primary")
|
210 |
|
|
|
215 |
interactive=False
|
216 |
)
|
217 |
|
218 |
+
# Decoder button click event
|
219 |
+
decode_btn.click(
|
220 |
+
fn=decode_ids,
|
221 |
+
inputs=[encoded_input],
|
222 |
+
outputs=[decoded_output]
|
223 |
+
)
|
224 |
+
|
225 |
+
# Examples for decoder
|
226 |
gr.Examples(
|
227 |
examples=decoder_examples,
|
228 |
inputs=encoded_input,
|
229 |
outputs=decoded_output,
|
230 |
+
fn=decode_ids,
|
231 |
cache_examples=True,
|
232 |
label="Token ID Examples"
|
233 |
)
|
234 |
|
|
|
235 |
gr.Markdown("""
|
236 |
### 📝 Instructions:
|
237 |
1. **Encoding**:
|
|
|
262 |
- Typical compression ratios range from 3x to 4x
|
263 |
""")
|
264 |
|
|
|
265 |
gr.Markdown("""
|
266 |
---
|
267 |
### 📌 Version Information
|
|
|
270 |
- Last Updated: 2024
|
271 |
""")
|
272 |
|
273 |
+
# Launch the app
|
274 |
if __name__ == "__main__":
|
275 |
demo.launch(
|
276 |
share=True,
|