dnzblgn committed on
Commit
8502463
·
verified ·
1 Parent(s): f0beaa4

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -0
app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import html

import gradio as gr
from transformers import AutoTokenizer, GPT2TokenizerFast
3
+
4
# Tokenizers offered in the UI, keyed by the label shown in the dropdown.
# Every tokenizer is loaded once, when this module is first executed.
tokenizers = {
    "GPT-2 Tokenizer": AutoTokenizer.from_pretrained("gpt2"),
    "RoBERTa Tokenizer": AutoTokenizer.from_pretrained("roberta-base"),
    "DistilGPT-2 Tokenizer": AutoTokenizer.from_pretrained("distilgpt2"),
    "bert-base-german-cased Tokenizer": AutoTokenizer.from_pretrained("bert-base-german-cased"),
    # Loaded through GPT2TokenizerFast rather than AutoTokenizer, as in the
    # original code.
    "GPT-4 Tokenizer": GPT2TokenizerFast.from_pretrained("Xenova/gpt-4"),
}
11
+
12
# Fancy token visualization function
def visualize_tokens(text, tokenizer_name, show_token_ids):
    """Tokenize *text* and render the tokens as HTML "chips".

    Args:
        text: The raw text to tokenize.
        tokenizer_name: A key of the module-level ``tokenizers`` mapping.
        show_token_ids: When true, append the list of token IDs below the
            rendered tokens.

    Returns:
        A ``(html, summary)`` tuple: the HTML markup for the tokens, and a
        short "Token Count" string.

    Raises:
        gr.Error: If ``tokenizer_name`` is not a known tokenizer (for example
            ``None`` because nothing was selected in the UI).
    """
    if tokenizer_name not in tokenizers:
        # Surface a readable message in the UI instead of a bare KeyError.
        raise gr.Error("Please choose a tokenizer first.")

    tokenizer = tokenizers[tokenizer_name]
    # Special tokens are excluded so only the tokens of the text itself show.
    encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
    token_ids = encoded["input_ids"]
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    # Display each token inside a colored span. Token text originates from
    # user input, so it is HTML-escaped: tokens such as "<", ">" or "&" must
    # be shown literally, not interpreted as markup.
    span_style = (
        "display:inline-block; margin:2px; padding:4px; "
        "background-color:#eee; border-radius:6px;"
    )
    html_output = " ".join(
        f"<span style='{span_style}'>{html.escape(token)}</span>"
        for token in tokens
    )

    if show_token_ids:
        html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)

    return html_output, f"🔢 Token Count: {len(tokens)}"
31
+
32
# Gradio app
# Dropdown labels, in the insertion order of the ``tokenizers`` mapping.
_tokenizer_names = list(tokenizers)

with gr.Blocks() as app:
    gr.Markdown("# 🚀 Tokenizer Playground (Tiktokenizer-Style)")

    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column():
            text_input = gr.Textbox(lines=4, label="Enter your text here", placeholder="Type or paste text...")
            # Preselect the first tokenizer so that clicking "Tokenize!" right
            # away never hands ``None`` to the callback.
            tokenizer_choice = gr.Dropdown(
                _tokenizer_names,
                value=_tokenizer_names[0],
                label="Choose Tokenizer",
            )
            show_ids = gr.Checkbox(label="Show Token IDs", value=False)
            tokenize_btn = gr.Button("Tokenize!")
        # Right column: rendered tokens and the token count.
        with gr.Column():
            html_output = gr.HTML(label="Tokens Visualized")
            token_count = gr.Label(label="Token Count")

    # The callback returns (html, count string), matching the two outputs.
    tokenize_btn.click(
        visualize_tokens,
        inputs=[text_input, tokenizer_choice, show_ids],
        outputs=[html_output, token_count],
    )

# Launch
app.launch()