import gradio as gr
from transformers import AutoTokenizer
import random
# List of available tokenizers
tokenizers = [
    "bert-base-uncased",
    "gpt2",
    "roberta-base",
    "distilbert-base-uncased",
    "xlnet-base-cased"
]
def generate_colored_html(tokens, decoded_tokens):
    """Render each token as a colored <span> with hover details."""
    colors = ["#FFDDC1", "#C1FFD4", "#D4C1FF", "#FFC1C1", "#C1FFFD"]
    text_color = "#000000"
    last_color = None
    background_color = "#F0F0F0"
    html_tokens = []
    # Human-readable labels for common special tokens
    special_token_replacements = {
        '<pad>': '[Padding]',
        '<s>': '[Start of Sentence]',
        '</s>': '[End of Sentence]',
        '<unk>': '[Unknown]',
        '<mask>': '[Masked]',
        '[CLS]': '[Class]',
        '[SEP]': '[Separator]'
    }
    for i, (token, decoded_token) in enumerate(zip(tokens, decoded_tokens)):
        # Replace raw special tokens with their human-readable labels
        for special_token, replacement in special_token_replacements.items():
            if special_token in decoded_token:
                decoded_token = decoded_token.replace(special_token, replacement)
        hover_info = f"Token Index: {i}, Token: {decoded_token}, Token ID: {token}"
        # Pick a highlight color different from the previous token's color
        color = random.choice([c for c in colors if c != last_color])
        last_color = color
        if '\n' in decoded_token:
            # Newlines are shown as a visible marker followed by a line break
            html_tokens.append(
                f"<span style='background-color: {color}; color: {text_color};' title='{hover_info}'>[NEWLINE]</span><br>"
            )
        else:
            html_tokens.append(
                f'<span style="background-color: {color}; color: {text_color}; text-decoration: none;" title="{hover_info}">{decoded_token}</span>'
            )
    html_output = " ".join(html_tokens)
    html_output = f'<div style="background-color: {background_color}; padding: 10px;">{html_output}</div>'
    return html_output
def tokenize_text(text, tokenizer_name):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    tokens = tokenizer.encode(text, add_special_tokens=True)
    # Decode each token ID individually so every token gets its own span
    decoded_tokens = [tokenizer.decode(token) for token in tokens]
    html_output = generate_colored_html(tokens, decoded_tokens)
    return html_output
def compare_tokenizers(text, selected_tokenizers):
    # The output component is gr.HTML, so return a single HTML string
    # with one labeled section per selected tokenizer
    sections = []
    for tokenizer_name in selected_tokenizers:
        sections.append(f"<h3>{tokenizer_name}</h3>" + tokenize_text(text, tokenizer_name))
    return "".join(sections)
# Create the Gradio interface
iface = gr.Interface(
    fn=compare_tokenizers,
    inputs=[
        gr.Textbox(label="Enter text to tokenize"),
        gr.CheckboxGroup(choices=tokenizers, label="Select tokenizers")
    ],
    outputs=gr.HTML(label="Tokenization Results"),
    title="Tokenizer Comparison",
    description="Compare tokenization results from different tokenizers.",
)

# Launch the app
iface.launch()