Orion-zhen committed · verified
Commit 11bf0d3 · Parent: 2f7b83a

Update app.py

Files changed (1): app.py (+78, -29)
app.py CHANGED
@@ -22,26 +22,54 @@ def tokenize_text(
 
 
 def tokenize_text(
-    builtin_model: str, custom_model: str | None, text: str
-) -> tuple[str | None, int, int]:
+    model_name: str, text: str
+) -> tuple[str | None, str | None, int | None, dict | None, int, int]:
     """Handle a tokenize request."""
-    if not builtin_model:
-        return "Please choose a model and input some texts", 0, 0
+    if not model_name:
+        return "Please choose a model and input some texts", None, None, None, 0, 0
     if not text:
         text = "Please choose a model and input some texts"
 
     try:
         # Load the tokenizer
-        if custom_model:
+        model_path = os.path.join("models", model_name)
+        if os.path.isdir(model_path):
             tokenizer = AutoTokenizer.from_pretrained(
-                custom_model, trust_remote_code=True, device_map="cpu"
+                model_path, trust_remote_code=True, device_map="cpu"
             )
         else:
-            model_path = os.path.join("models", builtin_model)
             tokenizer = AutoTokenizer.from_pretrained(
-                model_path, trust_remote_code=True, device_map="cpu"
+                model_name, trust_remote_code=True, device_map="cpu"
             )
 
+        tokenizer_type = tokenizer.__class__.__name__
+
+        if hasattr(tokenizer, "vocab_size"):
+            vocab_size = tokenizer.vocab_size
+        elif hasattr(tokenizer, "get_vocab"):
+            vocab_size = len(tokenizer.get_vocab())
+        else:
+            vocab_size = -1
+
+        sp_token_list = [
+            "pad_token",
+            "eos_token",
+            "bos_token",
+            "sep_token",
+            "cls_token",
+            "unk_token",
+            "mask_token",
+        ]
+        special_tokens = {}
+        for token_name in sp_token_list:
+            if (
+                hasattr(tokenizer, token_name)
+                and getattr(tokenizer, token_name) is not None
+            ):
+                token_value = getattr(tokenizer, token_name)
+                if token_value and str(token_value).strip():
+                    special_tokens[token_name] = str(token_value)
+
         # Tokenize the text
         input_ids = tokenizer.encode(text, add_special_tokens=True)
 
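The metadata gathered in the new block (tokenizer class, vocab size, special tokens) can be reproduced outside the app. A minimal sketch of the same introspection, assuming transformers is installed and using openai-community/gpt2 purely as an example model id:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

# Same three pieces of metadata the updated tokenize_text() now returns
print(tokenizer.__class__.__name__)  # e.g. GPT2TokenizerFast
print(tokenizer.vocab_size)          # e.g. 50257

special_tokens = {}
for name in ("pad_token", "eos_token", "bos_token", "sep_token",
             "cls_token", "unk_token", "mask_token"):
    value = getattr(tokenizer, name, None)
    if value and str(value).strip():
        special_tokens[name] = str(value)
print(special_tokens)                # e.g. {'bos_token': '<|endoftext|>', ...}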
 
@@ -68,11 +96,18 @@ def tokenize_text(
         token_len = len(input_ids)
         char_len = len(text)
 
-        return "".join(html_parts), token_len, char_len
+        return (
+            "".join(html_parts),
+            tokenizer_type,
+            vocab_size,
+            special_tokens,
+            token_len,
+            char_len,
+        )
 
     except Exception as e:
         error_msg = f"Error: {str(e)}"
-        return error_msg, 0, 0
+        return error_msg, None, None, None, 0, 0
 
 
 banner_md = """# 🎨 Tokenize it!
@@ -82,16 +117,18 @@ Powerful token visualization tool for your text inputs. 🚀
 Works for LLMs both online and *locally* on your machine!"""
 banner = gr.Markdown(banner_md)
 model_selector = gr.Dropdown(
-    label="Built-in Model", choices=get_available_models(), interactive=True
-)
-custom_model = gr.Textbox(
-    label="Custom Model",
-    placeholder="Enter your custom model name. e.g. Qwen/QwQ-32B. To use built-in models, please keep this EMPTY!",
-    lines=1,
+    label="Choose or enter model name",
+    choices=get_available_models(),
+    interactive=True,
+    allow_custom_value=True,
 )
 text_input = gr.Textbox(label="Input Text", placeholder="Hello World!", lines=4)
 submit_btn = gr.Button("🚀 Tokenize!", variant="primary")
 
+tokenizer_type = gr.Textbox(label="Tokenizer Type", interactive=False)
+vocab_size = gr.Number(label="Vocab Size", interactive=False)
+sp_tokens = gr.JSON(label="Special Tokens")
+
 output_html = gr.HTML(label="Tokenized Output", elem_classes="token-output")
 token_count = gr.Number(label="Token Count", value=0, interactive=False)
 char_count = gr.Number(label="Character Count", value=0, interactive=False)
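With the consolidated dropdown (allow_custom_value=True), whatever the user picks or types reaches tokenize_text as a single string. A hypothetical helper sketching how that string resolves, mirroring the branch added in the first hunk:

import os

def resolve(model_name: str) -> str:
    """Hypothetical helper, not part of the commit: bundled models live under
    ./models/<name>; anything else is treated as a Hugging Face Hub id."""
    local = os.path.join("models", model_name)
    return local if os.path.isdir(local) else model_name

print(resolve("Qwen/QwQ-32B"))  # no such local folder, so it falls through to the Hub id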
@@ -99,19 +136,24 @@ char_count = gr.Number(label="Character Count", value=0, interactive=False)
 with gr.Blocks(title="Token Visualizer", theme="NoCrypt/miku") as webui:
     banner.render()
 
-    with gr.Column():
-        with gr.TabItem("Built-in Model"):
-            model_selector.render()
-        with gr.TabItem("Custom Model"):
-            custom_model.render()
-        text_input.render()
-        submit_btn.render()
-
     with gr.Column():
         with gr.Row():
-            token_count.render()
-            char_count.render()
-        output_html.render()
+            with gr.Column():
+                model_selector.render()
+                text_input.render()
+                submit_btn.render()
+            with gr.Column():
+                with gr.Accordion("Details", open=False):
+                    with gr.Row():
+                        tokenizer_type.render()
+                        vocab_size.render()
+                    sp_tokens.render()
+                with gr.Row():
+                    token_count.render()
+                    char_count.render()
+
+    with gr.Column():
+        output_html.render()
 
     # Define the CSS styles
     webui.css = """
@@ -137,8 +179,15 @@ with gr.Blocks(title="Token Visualizer", theme="NoCrypt/miku") as webui:
 
     submit_btn.click(
         fn=tokenize_text,
-        inputs=[model_selector, custom_model, text_input],
-        outputs=[output_html, token_count, char_count],
+        inputs=[model_selector, text_input],
+        outputs=[
+            output_html,
+            tokenizer_type,
+            vocab_size,
+            sp_tokens,
+            token_count,
+            char_count,
+        ],
     )
 
 if __name__ == "__main__":
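Gradio maps the six return values to the outputs list positionally, so a quick way to sanity-check the new signature is to call tokenize_text directly. A sketch, assuming it runs from the repo root, that the launch call sits under the __main__ guard shown above, and using openai-community/gpt2 only as an example id:

from app import tokenize_text

html, tok_type, vocab, sp_tokens, n_tokens, n_chars = tokenize_text(
    "openai-community/gpt2", "Hello World!"
)
# Unpacking order matches the outputs list: output_html, tokenizer_type,
# vocab_size, sp_tokens, token_count, char_count
print(tok_type, vocab, n_tokens, n_chars)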
 