import html
import random
import unicodedata

import gradio as gr
from transformers import AutoTokenizer, PreTrainedTokenizerFast
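
# Tokenizer registry: two local Tabularis tokenizer files (tokenizer.json / tokenizer_BPE.json,
# expected in the working directory) plus several tokenizers pulled from the Hugging Face Hub.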
tokenizers = {
    "Tabularis German Tokenizer_whiteS": PreTrainedTokenizerFast(tokenizer_file="tokenizer.json"),
    "Tabularis German Tokenizer": PreTrainedTokenizerFast(tokenizer_file="tokenizer_BPE.json"),
    "KoichiYasuoka/bert-base-german-upos": AutoTokenizer.from_pretrained("KoichiYasuoka/bert-base-german-upos"),
    "benjamin/gerpt2-large": AutoTokenizer.from_pretrained("benjamin/gerpt2-large"),
    "deepset/gbert-base": AutoTokenizer.from_pretrained("deepset/gbert-base"),
    "bert-base-german-cased Tokenizer": AutoTokenizer.from_pretrained("bert-base-german-cased"),
    "MiriUll/gpt2-wechsel-german_easy": AutoTokenizer.from_pretrained("MiriUll/gpt2-wechsel-german_easy"),
    "DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1"),
}

def decode_byte_token(token):
    """Best-effort decode of a byte-level BPE token: strip the GPT-2-style "Ġ"
    space marker, reinterpret the remaining characters as raw UTF-8 bytes, and
    fall back to the cleaned token if that fails.
    (Currently unused; visualize_tokens inlines the same logic per token.)"""
    token_clean = token.replace("Ġ", "")
    try:
        byte_seq = bytes([ord(c) for c in token_clean])
        return unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
    except Exception:
        return token_clean

def visualize_tokens(text, tokenizer_name, show_token_ids):
    """Tokenize `text` and return (token HTML, token-count label, round-trip decode)."""
    tokenizer = tokenizers[tokenizer_name]
    # Encode without special tokens so only the user's text is visualized.
    encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
    token_ids = encoded["input_ids"]
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    def random_pastel():
        # Random light colour so neighbouring tokens are easy to tell apart.
        r = lambda: random.randint(100, 255)
        return f"rgb({r()},{r()},{r()})"

    def is_special_token(token):
        # Bracketed tokens ([CLS], <s>, ...) and declared special tokens count as special.
        return (
            (token.startswith('[') and token.endswith(']')) or
            (token.startswith('<') and token.endswith('>')) or
            token in tokenizer.all_special_tokens
        )

    html_tokens = []
    for token in tokens:
        prefix = ""
        token_body = token
        # For the local Tabularis tokenizers, keep the "Ġ" space marker as a visible prefix.
        if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
            prefix = "Ġ"
            token_body = token[1:]
        # Best-effort readability pass: treat the token's characters as raw UTF-8
        # bytes (byte-level BPE); fall back to the raw token string on failure.
        try:
            byte_seq = bytes([ord(c) for c in token_body])
            decoded = unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
        except Exception:
            decoded = token_body
        label = f"{prefix}{decoded}"
        color = "lightgray" if is_special_token(token) else random_pastel()
        # Escape the raw token and label so they cannot break the HTML markup.
        html_token = f"""
        <span title="{html.escape(token)}" style='
            display:inline-block;
            margin:4px;
            padding:8px 12px;
            background-color:{color};
            border-radius:8px;
            font-size:18px;
            font-family:monospace;
            font-weight:bold;
        '>{html.escape(label)}</span>
        """
        html_tokens.append(html_token)

    html_output = "".join(html_tokens)
    if show_token_ids:
        html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
    # Round-trip the IDs back to text so users can compare it with their input.
    try:
        decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
    except Exception:
        decoded_output = "[Could not decode using this tokenizer]"
    return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output

# App
with gr.Blocks() as app:
    gr.Markdown("# 🚀 German Tokenizers")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(lines=4, label="Enter your text here", placeholder="Type or paste text...")
            tokenizer_choice = gr.Dropdown(list(tokenizers.keys()), label="Choose Tokenizer")
            show_ids = gr.Checkbox(label="Show Token IDs", value=False)
            tokenize_btn = gr.Button("Tokenize!")
        with gr.Column():
            html_output = gr.HTML(label="Tokens Visualized")
            token_count = gr.Label(label="Token Count")
            decoded_output = gr.Textbox(label="Decoded Text", lines=3)

    tokenize_btn.click(
        visualize_tokens,
        inputs=[text_input, tokenizer_choice, show_ids],
        outputs=[html_output, token_count, decoded_output],
    )

app.launch()