dnzblgn committed on
Commit
a5deb8b
·
verified ·
1 Parent(s): a915aa6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -6
app.py CHANGED
@@ -1,27 +1,46 @@
 
1
  import gradio as gr
2
  from transformers import AutoTokenizer
3
 
 
4
  tokenizers = {
5
  "GPT-2 Tokenizer": AutoTokenizer.from_pretrained("gpt2"),
6
  "RoBERTa Tokenizer": AutoTokenizer.from_pretrained("roberta-base"),
7
  "DistilGPT-2 Tokenizer": AutoTokenizer.from_pretrained("distilgpt2"),
8
- "bert-base-german-cased Tokenizer": AutoTokenizer.from_pretrained("bert-base-german-cased"),
9
  }
10
 
11
- # Fancy token visualization function
12
  def visualize_tokens(text, tokenizer_name, show_token_ids):
13
  tokenizer = tokenizers[tokenizer_name]
14
  encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
15
  token_ids = encoded["input_ids"]
16
  tokens = tokenizer.convert_ids_to_tokens(token_ids)
17
 
18
- # Display each token inside a colored span
 
 
 
 
 
19
  html_tokens = []
20
  for idx, token in enumerate(tokens):
21
- html_token = f"<span style='display:inline-block; margin:2px; padding:4px; background-color:#eee; border-radius:6px;'>{token}</span>"
 
 
 
 
 
 
 
 
 
 
 
 
22
  html_tokens.append(html_token)
23
 
24
- html_output = " ".join(html_tokens)
25
 
26
  if show_token_ids:
27
  html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
@@ -31,7 +50,7 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
31
  # Gradio app
32
  with gr.Blocks() as app:
33
  gr.Markdown("# 🚀 Tokenizer Playground (Tiktokenizer-Style)")
34
-
35
  with gr.Row():
36
  with gr.Column():
37
  text_input = gr.Textbox(lines=4, label="Enter your text here", placeholder="Type or paste text...")
 
1
+ import random
2
  import gradio as gr
3
  from transformers import AutoTokenizer
4
 
5
# Load every tokenizer once at startup so no request pays the
# from_pretrained download/initialization cost; keys are the labels
# shown in the UI dropdown.
tokenizers = {
    label: AutoTokenizer.from_pretrained(checkpoint)
    for label, checkpoint in [
        ("GPT-2 Tokenizer", "gpt2"),
        ("RoBERTa Tokenizer", "roberta-base"),
        ("DistilGPT-2 Tokenizer", "distilgpt2"),
        ("bert-base-german-cased Tokenizer", "bert-base-german-cased"),
    ]
}
12
 
13
+ # Fancy token visualization with random colors
14
  def visualize_tokens(text, tokenizer_name, show_token_ids):
15
  tokenizer = tokenizers[tokenizer_name]
16
  encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
17
  token_ids = encoded["input_ids"]
18
  tokens = tokenizer.convert_ids_to_tokens(token_ids)
19
 
20
+ # Generate a random pastel color
21
+ def random_pastel():
22
+ r = lambda: random.randint(100, 255)
23
+ return f"rgb({r()},{r()},{r()})"
24
+
25
+ # Create HTML tokens with random colors and bigger size
26
  html_tokens = []
27
  for idx, token in enumerate(tokens):
28
+ color = random_pastel()
29
+ html_token = f"""
30
+ <span style='
31
+ display:inline-block;
32
+ margin:4px;
33
+ padding:8px 12px;
34
+ background-color:{color};
35
+ border-radius:8px;
36
+ font-size:18px;
37
+ font-family:monospace;
38
+ font-weight:bold;
39
+ '>{token}</span>
40
+ """
41
  html_tokens.append(html_token)
42
 
43
+ html_output = "".join(html_tokens)
44
 
45
  if show_token_ids:
46
  html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
 
50
  # Gradio app
51
  with gr.Blocks() as app:
52
  gr.Markdown("# 🚀 Tokenizer Playground (Tiktokenizer-Style)")
53
+
54
  with gr.Row():
55
  with gr.Column():
56
  text_input = gr.Textbox(lines=4, label="Enter your text here", placeholder="Type or paste text...")