AlGe's picture
Update app.py
d0428be verified
raw
history blame
2.83 kB
import html
import random
from functools import lru_cache

import gradio as gr
from transformers import AutoTokenizer
# Checkpoint names offered in the UI; each one is passed to
# AutoTokenizer.from_pretrained when selected.
tokenizers = [
    "bert-base-uncased",
    "gpt2",
    "roberta-base",
    "distilbert-base-uncased",
    "xlnet-base-cased",
]
def generate_colored_html(tokens, decoded_tokens):
    """Render tokens as a row of color-highlighted HTML spans.

    Each token becomes a ``<span>`` with a pastel background picked at random
    (never the same color as the span before it) and a hover tooltip showing
    the token's index, text and id. A token whose text contains a newline is
    shown as ``[NEWLINE]`` followed by ``<br>``.

    Token text is HTML-escaped before being interpolated, both in the span
    body and in the ``title`` attribute, so markup typed by the user (or any
    token made of angle brackets / quotes) is displayed literally instead of
    being parsed by the browser.

    Args:
        tokens: Token ids, in order.
        decoded_tokens: Display text for each id, paired with ``tokens`` by
            position. If the lengths differ, the extra items are ignored.

    Returns:
        An HTML string: the spans joined by single spaces, wrapped in a
        padded ``<div>``.
    """
    colors = ["#FFDDC1", "#C1FFD4", "#D4C1FF", "#FFC1C1", "#C1FFFD"]
    text_color = "#000000"
    background_color = "#F0F0F0"
    # Friendlier labels for common special tokens.
    special_token_replacements = {
        '<pad>': '[Padding]',
        '<s>': '[Start of Sentence]',
        '</s>': '[End of Sentence]',
        '<unk>': '[Unknown]',
        '<mask>': '[Masked]',
        '[CLS]': '[Class]',
        '[SEP]': '[Separator]',
    }

    last_color = None
    html_tokens = []
    for i, (token, decoded_token) in enumerate(zip(tokens, decoded_tokens)):
        # Must run before escaping: the keys match the raw "<s>" form.
        for special_token, replacement in special_token_replacements.items():
            decoded_token = decoded_token.replace(special_token, replacement)

        # html.escape also escapes both quote styles, which matters because
        # the two span templates below use different attribute quoting.
        hover_info = html.escape(
            f"Token Index: {i}, Token: {decoded_token}, Token ID: {token}"
        )

        # Exclude the previous color so adjacent tokens stay distinguishable.
        color = random.choice([c for c in colors if c != last_color])
        last_color = color

        if '\n' in decoded_token:
            html_tokens.append(
                f"<span style='background-color: {color}; color: {text_color};' "
                f"title='{hover_info}'>[NEWLINE]</span><br>"
            )
        else:
            html_tokens.append(
                f'<span style="background-color: {color}; color: {text_color}; '
                f'text-decoration: none;" title="{hover_info}">'
                f'{html.escape(decoded_token)}</span>'
            )

    html_output = " ".join(html_tokens)
    return f'<div style="background-color: {background_color}; padding: 10px;">{html_output}</div>'
# Bounded so that arbitrary names cannot grow the cache without limit.
@lru_cache(maxsize=8)
def _load_tokenizer(tokenizer_name):
    """Load the tokenizer for *tokenizer_name*, reusing an earlier load if any."""
    return AutoTokenizer.from_pretrained(tokenizer_name)


def tokenize_text(text, tokenizer_name):
    """Tokenize *text* with the named tokenizer and return highlighted HTML.

    The tokenizer is loaded once per name and then reused, instead of calling
    ``AutoTokenizer.from_pretrained`` again on every request.

    Args:
        text: The string to tokenize.
        tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The HTML string produced by ``generate_colored_html`` for the encoded
        ids (special tokens included) and their per-id decoded text.
    """
    tokenizer = _load_tokenizer(tokenizer_name)
    tokens = tokenizer.encode(text, add_special_tokens=True)
    # Decode ids one at a time so every id gets its own display string.
    decoded_tokens = [tokenizer.decode(token) for token in tokens]
    return generate_colored_html(tokens, decoded_tokens)
def compare_tokenizers(text, selected_tokenizers):
    """Tokenize *text* once per selected tokenizer.

    Returns a dict mapping each name in ``selected_tokenizers`` to the HTML
    that ``tokenize_text`` produces for it.
    """
    return {name: tokenize_text(text, name) for name in selected_tokenizers}
def _render_comparison(text, selected_tokenizers):
    """Run the comparison and flatten the result into one HTML string.

    ``compare_tokenizers`` returns a dict keyed by tokenizer name, but the
    ``gr.HTML`` output component displays a string. This adapter puts each
    tokenizer's rendering under a heading with its name and concatenates the
    sections. With nothing selected it returns a short hint instead.
    """
    results = compare_tokenizers(text, selected_tokenizers or [])
    if not results:
        return "<p>Select at least one tokenizer to see its tokenization.</p>"
    return "".join(
        f"<h3>{html.escape(name)}</h3>{rendered}"
        for name, rendered in results.items()
    )


# Create the Gradio interface
iface = gr.Interface(
    fn=_render_comparison,
    inputs=[
        gr.Textbox(label="Enter text to tokenize"),
        gr.CheckboxGroup(choices=tokenizers, label="Select tokenizers"),
    ],
    outputs=gr.HTML(label="Tokenization Results"),
    title="Tokenizer Comparison",
    description="Compare tokenization results from different tokenizers.",
)

# Launch the app
iface.launch()