Update app.py
app.py
CHANGED
@@ -4,7 +4,7 @@ from html import escape
 from transformers import AutoTokenizer
 
 
-def get_available_models():
+def get_available_models() -> list[str]:
     """获取models目录下所有包含tokenizer.json的模型"""
     models_dir = "models"
     if not os.path.exists(models_dir):
@@ -13,7 +13,7 @@ def get_available_models():
     available_models = []
     for model_name in os.listdir(models_dir):
         model_path = os.path.join(models_dir, model_name)
-        tokenizer_file = os.path.join(model_path, "
+        tokenizer_file = os.path.join(model_path, "tokenizer.json")
 
         if os.path.isdir(model_path) and os.path.isfile(tokenizer_file):
             available_models.append(model_name)
@@ -21,17 +21,26 @@ def get_available_models():
     return sorted(available_models)
 
 
-def tokenize_text(model_name, text):
+def tokenize_text(
+    builtin_model: str, custom_model: str | None, text: str
+) -> tuple[str | None, int, int]:
     """处理tokenize请求"""
-    if not model_name:
+    if not builtin_model:
         return "Please choose a model and input some texts", 0, 0
     if not text:
         text = "Please choose a model and input some texts"
 
     try:
         # 加载tokenizer
-
-
+        if custom_model:
+            tokenizer = AutoTokenizer.from_pretrained(
+                custom_model, trust_remote_code=True, device_map="cpu"
+            )
+        else:
+            model_path = os.path.join("models", builtin_model)
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_path, trust_remote_code=True, device_map="cpu"
+            )
 
         # Tokenize处理
         input_ids = tokenizer.encode(text, add_special_tokens=True)
@@ -63,7 +72,7 @@ def tokenize_text(model_name, text):
 
     except Exception as e:
         error_msg = f"Error: {str(e)}"
-        return error_msg,
+        return error_msg, 0, 0
 
 
 banner_md = """# 🎨 Tokenize it!
@@ -73,7 +82,12 @@ Powerful token visualization tool for your text inputs. 🚀
 Works for LLMs both online and *locally* on your machine!"""
 banner = gr.Markdown(banner_md)
 model_selector = gr.Dropdown(
-    label="
+    label="Built-in Model", choices=get_available_models(), interactive=True
+)
+custom_model = gr.Textbox(
+    label="Custom Model",
+    placeholder="Enter your custom model name. e.g. Qwen/QwQ",
+    lines=1,
 )
 text_input = gr.Textbox(label="Input Text", placeholder="Hello World!", lines=4)
 submit_btn = gr.Button("🚀 Tokenize!", variant="primary")
@@ -86,7 +100,10 @@ with gr.Blocks(title="Token Visualizer", theme="NoCrypt/miku") as webui:
     banner.render()
 
     with gr.Column():
-        model_selector.render()
+        with gr.TabItem("Built-in Model"):
+            model_selector.render()
+        with gr.TabItem("Custom Model"):
+            custom_model.render()
         text_input.render()
         submit_btn.render()
 
@@ -120,7 +137,7 @@ with gr.Blocks(title="Token Visualizer", theme="NoCrypt/miku") as webui:
 
     submit_btn.click(
         fn=tokenize_text,
-        inputs=[model_selector, text_input],
+        inputs=[model_selector, custom_model, text_input],
         outputs=[output_html, token_count, char_count],
     )
 
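For context, a minimal standalone sketch of the model-loading precedence this commit introduces (not part of the Space itself; load_tokenizer is a hypothetical helper and the Hub id is only a placeholder): a non-empty custom model name is resolved from the Hub, otherwise the tokenizer is read from the bundled models/ directory.

import os
from transformers import AutoTokenizer

def load_tokenizer(builtin_model: str, custom_model: str | None):
    # Same precedence as the updated tokenize_text(): a non-empty custom
    # model name wins; otherwise fall back to the bundled models/ folder.
    if custom_model:
        return AutoTokenizer.from_pretrained(custom_model, trust_remote_code=True)
    return AutoTokenizer.from_pretrained(
        os.path.join("models", builtin_model), trust_remote_code=True
    )

# Placeholder Hub id; any repository that ships a tokenizer would do.
tokenizer = load_tokenizer("", "Qwen/QwQ-32B-Preview")
print(tokenizer.encode("Hello World!", add_special_tokens=True))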