Tokenizers / app.py
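# Gradio demo: compare how different German tokenizers (and the DeepSeek-R1 tokenizer)
# split the same input text into tokens.
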
import html
import random
import unicodedata

import gradio as gr
from transformers import AutoTokenizer, PreTrainedTokenizerFast
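
# Registry of tokenizers to compare: two local Tabularis tokenizer files
# (assumed to sit in the Space repo next to app.py) plus several tokenizers
# loaded from the Hugging Face Hub.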
tokenizers = {
"Tabularis German Tokenizer_whiteS": PreTrainedTokenizerFast(tokenizer_file="tokenizer.json"),
"Tabularis German Tokenizer": PreTrainedTokenizerFast(tokenizer_file="tokenizer_BPE.json"),
"KoichiYasuoka/bert-base-german-upos": AutoTokenizer.from_pretrained("KoichiYasuoka/bert-base-german-upos"),
"benjamin/gerpt2-large": AutoTokenizer.from_pretrained("benjamin/gerpt2-large"),
"deepset/gbert-base": AutoTokenizer.from_pretrained("deepset/gbert-base"),
"bert-base-german-cased Tokenizer": AutoTokenizer.from_pretrained("bert-base-german-cased"),
"MiriUll/gpt2-wechsel-german_easy": AutoTokenizer.from_pretrained("MiriUll/gpt2-wechsel-german_easy"),
"DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
}
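
# Decode a byte-level token back to readable text: strip the GPT-2-style "Ġ"
# whitespace marker and try to reinterpret the remaining characters as UTF-8 bytes.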
def decode_byte_token(token):
    token_clean = token.replace("Ġ", "")
    try:
        byte_seq = bytes([ord(c) for c in token_clean])
        return unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
    except Exception:
        return token_clean
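
# Tokenize the input with the selected tokenizer and render each token as a
# colored HTML chip, alongside the token count and a round-trip decode of the IDs.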
def visualize_tokens(text, tokenizer_name, show_token_ids):
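    # Look up the chosen tokenizer and encode the text without special tokens,
    # so only tokens produced from the user's input are shown.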
    tokenizer = tokenizers[tokenizer_name]
    encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
    token_ids = encoded["input_ids"]
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    def random_pastel():
        # Light random background color so adjacent tokens are easy to tell apart.
        r = lambda: random.randint(100, 255)
        return f"rgb({r()},{r()},{r()})"

    def is_special_token(token):
        # Treat [CLS]-style, <s>-style and the tokenizer's own special tokens as special.
        return (
            (token.startswith('[') and token.endswith(']')) or
            (token.startswith('<') and token.endswith('>')) or
            token in tokenizer.all_special_tokens
        )

    html_tokens = []
    for token in tokens:
        prefix = ""
        token_body = token
        # The Tabularis tokenizers mark a leading space with "Ġ"; keep the marker
        # visible but decode the rest of the token separately.
        if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
            prefix = "Ġ"
            token_body = token[1:]
        # Best-effort byte-level decode; fall back to the raw token string.
        try:
            byte_seq = bytes([ord(c) for c in token_body])
            decoded = unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
        except Exception:
            decoded = token_body
        label = f"{prefix}{decoded}"
        color = "lightgray" if is_special_token(token) else random_pastel()
        # Escape the token text so characters like "<" or quotes don't break the HTML.
        html_token = f"""
        <span title="{html.escape(token)}" style='
            display:inline-block;
            margin:4px;
            padding:8px 12px;
            background-color:{color};
            border-radius:8px;
            font-size:18px;
            font-family:monospace;
            font-weight:bold;
        '>{html.escape(label)}</span>
        """
        html_tokens.append(html_token)

    html_output = "".join(html_tokens)
    if show_token_ids:
        html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)

    try:
        decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
    except Exception:
        decoded_output = "[Could not decode using this tokenizer]"

    return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output

# App
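# Layout: inputs (text, tokenizer choice, options) in the left column,
# outputs (token visualization, count, decoded text) in the right column;
# the button click runs visualize_tokens.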
with gr.Blocks() as app:
    gr.Markdown("# 🚀 German Tokenizers")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(lines=4, label="Enter your text here", placeholder="Type or paste text...")
            tokenizer_choice = gr.Dropdown(list(tokenizers.keys()), label="Choose Tokenizer")
            show_ids = gr.Checkbox(label="Show Token IDs", value=False)
            tokenize_btn = gr.Button("Tokenize!")
        with gr.Column():
            html_output = gr.HTML(label="Tokens Visualized")
            token_count = gr.Label(label="Token Count")
            decoded_output = gr.Textbox(label="Decoded Text", lines=3)

    tokenize_btn.click(
        visualize_tokens,
        inputs=[text_input, tokenizer_choice, show_ids],
        outputs=[html_output, token_count, decoded_output]
    )

app.launch()