Orion-zhen committed on
Commit
dd1b211
·
verified ·
1 Parent(s): 6bbd6e4

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +129 -0
app.py CHANGED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from html import escape
4
+ from transformers import AutoTokenizer
5
+
6
+
7
def get_available_models():
    """Return the names of all usable models under the ``models`` directory.

    A model is usable when it is a sub-directory of ``models`` that contains
    a ``tokenizer.json`` file.

    Returns:
        A sorted list of model directory names. The list is empty when
        ``models`` does not exist or is not a directory.
    """
    models_dir = "models"
    # isdir (rather than exists) so that a stray regular file named "models"
    # yields an empty list instead of making os.listdir raise.
    if not os.path.isdir(models_dir):
        return []

    available_models = []
    for model_name in os.listdir(models_dir):
        model_path = os.path.join(models_dir, model_name)
        tokenizer_file = os.path.join(model_path, "tokenizer.json")

        if os.path.isdir(model_path) and os.path.isfile(tokenizer_file):
            available_models.append(model_name)

    return sorted(available_models)
22
+
23
+
24
def tokenize_text(model_name, text):
    """Tokenize *text* with the selected model and render the tokens as HTML.

    Args:
        model_name: Name of a sub-directory of ``models`` holding the
            tokenizer files. A falsy value short-circuits with a hint
            message and zero counts.
        text: Text to tokenize. When empty, the hint message itself is
            tokenized instead.

    Returns:
        A ``(html, token_count, char_count)`` tuple. On failure the first
        item is an HTML-escaped ``"Error: ..."`` message and both counts are
        0, so the tuple always has three items, matching the three output
        components the click handler is wired to.
    """
    if not model_name:
        return "Please choose a model and input some texts", 0, 0
    if not text:
        text = "Please choose a model and input some texts"

    try:
        # Load the tokenizer from the local model directory.
        model_path = os.path.join("models", model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_path, device_map="cpu")

        # Encode the text, including any special tokens the tokenizer adds.
        input_ids = tokenizer.encode(text, add_special_tokens=True)

        # Build one coloured <span> per token.
        colors = ["#A8D8EA", "#AA96DA", "#FCBAD3"]
        html_parts = []

        for i, token_id in enumerate(input_ids):
            # Escape HTML special characters in the decoded token text.
            safe_token = escape(tokenizer.decode(token_id))
            # Cycle through the palette so neighbouring tokens differ.
            color = colors[i % len(colors)]
            html_part = (
                f'<span style="background-color: {color};'
                f"margin: 2px; padding: 2px 5px; border-radius: 3px;"
                f'display: inline-block; font-size: 1.2em;">'
                f"{safe_token}<br/>"
                f'<sub style="font-size: 0.9em;">{token_id}</sub>'
                f"</span>"
            )
            html_parts.append(html_part)

        # Statistics shown next to the rendered tokens.
        token_len = len(input_ids)
        char_len = len(text)

        return "".join(html_parts), token_len, char_len

    except Exception as e:
        # UI boundary: report the failure in the HTML output rather than
        # crashing the handler. The message is escaped because it is rendered
        # as HTML, and the counts are reset so all three outputs get a value.
        error_msg = f"Error: {escape(str(e))}"
        return error_msg, 0, 0
67
+
68
+
69
# Markdown text shown by the banner component.
banner_md = """# 🎨 Tokenize it!

Powerful token visualization tool for your text inputs. 🚀

Works for LLMs both online and *locally* on your machine!"""

# Components are created at module level and placed into the layout below
# through their .render() calls.
banner = gr.Markdown(banner_md)
# The dropdown choices come from a single get_available_models() call made
# when this module is executed.
model_selector = gr.Dropdown(
    label="Choose Model", choices=get_available_models(), interactive=True
)
text_input = gr.Textbox(label="Input Text", placeholder="Hello World!", lines=4)
submit_btn = gr.Button("🚀 Tokenize!", variant="primary")

# Output components: rendered token HTML plus two read-only counters.
output_html = gr.HTML(label="Tokenized Output", elem_classes="token-output")
token_count = gr.Number(label="Token Count", value=0, interactive=False)
char_count = gr.Number(label="Character Count", value=0, interactive=False)

# NOTE(review): the indentation of this section was reconstructed from the
# statement order; confirm the nesting of the Column/Row blocks, the CSS
# assignment and the click wiring against the original file.
with gr.Blocks(title="Token Visualizer", theme="NoCrypt/miku") as webui:
    banner.render()

    # Input area: model choice, text box and submit button.
    with gr.Column():
        model_selector.render()
        text_input.render()
        submit_btn.render()

    # Output area: the two counters in one row, then the token HTML.
    with gr.Column():
        with gr.Row():
            token_count.render()
            char_count.render()
        output_html.render()

    # Define the CSS styles.
    webui.css = """
    .token-output span {
        margin: 3px;
        vertical-align: top;
    }
    .stats-output {
        font-weight: bold !important;
        color: #2c3e50 !important;
    }
    .gradio-container { /* 针对 Gradio 的主容器 */
        width: 100%; /* 根据需要调整宽度 */
        display: flex;
        justify-content: center;
        align-items: center;
    }
    .gradio-container > div { /* 直接子元素,通常包含你的内容 */
        width: 90%; /* 或者你想要的固定宽度 */
        max-width: 1200px; /* 设置最大宽度 */
    }
    """

    # Clicking the button calls tokenize_text with the selected model and the
    # entered text; its return values go to the three output components.
    submit_btn.click(
        fn=tokenize_text,
        inputs=[model_selector, text_input],
        outputs=[output_html, token_count, char_count],
    )

if __name__ == "__main__":
    # Create the models directory if it is missing, then start the app.
    os.makedirs("models", exist_ok=True)
    webui.launch(pwa=True)