Orion-zhen committed · verified
Commit a5b8d61 · 1 Parent(s): 12460d4

Update app.py

Files changed (1): app.py (+27 -10)
app.py CHANGED
@@ -4,7 +4,7 @@ from html import escape
 from transformers import AutoTokenizer
 
 
-def get_available_models():
+def get_available_models() -> list[str]:
     """Get all models under the models directory that contain a tokenizer.json."""
     models_dir = "models"
     if not os.path.exists(models_dir):
@@ -13,7 +13,7 @@ def get_available_models():
     available_models = []
     for model_name in os.listdir(models_dir):
         model_path = os.path.join(models_dir, model_name)
-        tokenizer_file = os.path.join(model_path, "config.json")
+        tokenizer_file = os.path.join(model_path, "tokenizer.json")
 
         if os.path.isdir(model_path) and os.path.isfile(tokenizer_file):
             available_models.append(model_name)
@@ -21,17 +21,26 @@ def get_available_models():
     return sorted(available_models)
 
 
-def tokenize_text(model_name, text):
+def tokenize_text(
+    builtin_model: str, custom_model: str | None, text: str
+) -> tuple[str | None, int, int]:
     """Handle a tokenize request."""
-    if not model_name:
+    if not builtin_model:
         return "Please choose a model and input some texts", 0, 0
     if not text:
         text = "Please choose a model and input some texts"
 
     try:
         # Load the tokenizer
-        model_path = os.path.join("models", model_name)
-        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, device_map="cpu")
+        if custom_model:
+            tokenizer = AutoTokenizer.from_pretrained(
+                custom_model, trust_remote_code=True, device_map="cpu"
+            )
+        else:
+            model_path = os.path.join("models", builtin_model)
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_path, trust_remote_code=True, device_map="cpu"
+            )
 
         # Tokenize the text
         input_ids = tokenizer.encode(text, add_special_tokens=True)
@@ -63,7 +72,7 @@ def tokenize_text(model_name, text):
 
     except Exception as e:
         error_msg = f"Error: {str(e)}"
-        return error_msg, ""
+        return error_msg, 0, 0
 
 
 banner_md = """# 🎨 Tokenize it!
@@ -73,7 +82,12 @@ Powerful token visualization tool for your text inputs. 🚀
 Works for LLMs both online and *locally* on your machine!"""
 banner = gr.Markdown(banner_md)
 model_selector = gr.Dropdown(
-    label="Choose Model", choices=get_available_models(), interactive=True
+    label="Built-in Model", choices=get_available_models(), interactive=True
+)
+custom_model = gr.Textbox(
+    label="Custom Model",
+    placeholder="Enter your custom model name. e.g. Qwen/QwQ",
+    lines=1,
 )
 text_input = gr.Textbox(label="Input Text", placeholder="Hello World!", lines=4)
 submit_btn = gr.Button("🚀 Tokenize!", variant="primary")
@@ -86,7 +100,10 @@ with gr.Blocks(title="Token Visualizer", theme="NoCrypt/miku") as webui:
         banner.render()
 
         with gr.Column():
-            model_selector.render()
+            with gr.TabItem("Built-in Model"):
+                model_selector.render()
+            with gr.TabItem("Custom Model"):
+                custom_model.render()
             text_input.render()
             submit_btn.render()
 
@@ -120,7 +137,7 @@ with gr.Blocks(title="Token Visualizer", theme="NoCrypt/miku") as webui:
 
     submit_btn.click(
         fn=tokenize_text,
-        inputs=[model_selector, text_input],
+        inputs=[model_selector, custom_model, text_input],
         outputs=[output_html, token_count, char_count],
     )
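
For readers skimming the diff, the substantive change in tokenize_text is the new branch that prefers a user-supplied Hub repo id over a bundled folder under models/. Below is a minimal standalone sketch of that loading logic; the model name "gpt2" and the final encode call are illustrative assumptions, not part of this commit, and device_map is omitted since it has no effect on tokenizer loading.

import os

from transformers import AutoTokenizer


def load_tokenizer(builtin_model: str, custom_model: str | None):
    # Mirrors the updated branch: a non-empty custom model id wins,
    # otherwise fall back to the local folder models/<builtin_model>.
    if custom_model:
        return AutoTokenizer.from_pretrained(custom_model, trust_remote_code=True)
    return AutoTokenizer.from_pretrained(
        os.path.join("models", builtin_model), trust_remote_code=True
    )


# Usage sketch (assumes a models/gpt2 directory exists locally; "Qwen/QwQ" is
# only the placeholder repo id shown in the UI hint).
tok = load_tokenizer("gpt2", None)
print(tok.encode("Hello World!", add_special_tokens=True))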