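# Gradio demo that compares how different German (and general-purpose) tokenizers
# split the same input text, rendering each token as a colored HTML chip.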
import random
import gradio as gr
import unicodedata
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# Tokenizers offered in the dropdown: two local Tabularis tokenizer files
# (tokenizer.json / tokenizer_BPE.json) plus several models from the Hub.
tokenizers = {
    "Tabularis German Tokenizer_whiteS": PreTrainedTokenizerFast(tokenizer_file="tokenizer.json"),
    "Tabularis German Tokenizer": PreTrainedTokenizerFast(tokenizer_file="tokenizer_BPE.json"),
    "KoichiYasuoka/bert-base-german-upos": AutoTokenizer.from_pretrained("KoichiYasuoka/bert-base-german-upos"),
    "benjamin/gerpt2-large": AutoTokenizer.from_pretrained("benjamin/gerpt2-large"),
    "deepset/gbert-base": AutoTokenizer.from_pretrained("deepset/gbert-base"),
    "bert-base-german-cased Tokenizer": AutoTokenizer.from_pretrained("bert-base-german-cased"),
    "MiriUll/gpt2-wechsel-german_easy": AutoTokenizer.from_pretrained("MiriUll/gpt2-wechsel-german_easy"),
    "DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1"),
}
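

# Best-effort decode of a byte-level token: strip the "Ġ" word-boundary marker
# and reinterpret the remaining characters as raw UTF-8 bytes (so umlauts and ß
# render correctly); fall back to the cleaned token if decoding fails.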
def decode_byte_token(token):
    token_clean = token.replace("Ġ", "")
    try:
        byte_seq = bytes([ord(c) for c in token_clean])
        return unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
    except Exception:
        return token_clean
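

# Tokenize the input with the selected tokenizer and return three outputs:
# an HTML visualization of the tokens, a token-count string, and the text
# obtained by decoding the token IDs back with the same tokenizer.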
def visualize_tokens(text, tokenizer_name, show_token_ids):
    tokenizer = tokenizers[tokenizer_name]
    encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
    token_ids = encoded["input_ids"]
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    def random_pastel():
        # Light RGB values so the black token text stays readable on the chip.
        r = lambda: random.randint(100, 255)
        return f"rgb({r()},{r()},{r()})"

    def is_special_token(token):
        return (
            (token.startswith('[') and token.endswith(']')) or
            (token.startswith('<') and token.endswith('>')) or
            token in tokenizer.all_special_tokens
        )

    html_tokens = []
    for token in tokens:
        prefix = ""
        token_body = token
        # For the byte-level Tabularis tokenizers, keep the "Ġ" word-boundary
        # marker visible as a prefix and decode only the rest of the token.
        if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
            prefix = "Ġ"
            token_body = token[1:]
        try:
            byte_seq = bytes([ord(c) for c in token_body])
            decoded = unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
        except Exception:
            decoded = token_body
        label = f"{prefix}{decoded}"
        # Special tokens are shown in gray; everything else gets a random pastel.
        color = "lightgray" if is_special_token(token) else random_pastel()
        html_token = f"""
        <span title="{token}" style='
            display:inline-block;
            margin:4px;
            padding:8px 12px;
            background-color:{color};
            border-radius:8px;
            font-size:18px;
            font-family:monospace;
            font-weight:bold;
        '>{label}</span>
        """
        html_tokens.append(html_token)

    html_output = "".join(html_tokens)
    if show_token_ids:
        html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)

    try:
        decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
    except Exception:
        decoded_output = "[Could not decode using this tokenizer]"

    return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output


# App layout: input controls in the left column, visualization outputs in the
# right column; the button wires the inputs to visualize_tokens.
with gr.Blocks() as app:
    gr.Markdown("# 🚀 German Tokenizers")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(lines=4, label="Enter your text here", placeholder="Type or paste text...")
            tokenizer_choice = gr.Dropdown(list(tokenizers.keys()), label="Choose Tokenizer")
            show_ids = gr.Checkbox(label="Show Token IDs", value=False)
            tokenize_btn = gr.Button("Tokenize!")
        with gr.Column():
            html_output = gr.HTML(label="Tokens Visualized")
            token_count = gr.Label(label="Token Count")
            decoded_output = gr.Textbox(label="Decoded Text", lines=3)

    tokenize_btn.click(
        visualize_tokens,
        inputs=[text_input, tokenizer_choice, show_ids],
        outputs=[html_output, token_count, decoded_output],
    )

app.launch()