from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers


def train_tokenizer(iterator, vocab_size=50000, min_frequency=3):
    # BPE model; tokens not in the learned vocabulary fall back to <unk>
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

    # Normalization: NFD decomposition first, so StripAccents can remove the
    # combining accent marks (after NFC, precomposed characters keep their accents)
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),
        normalizers.StripAccents()
    ])

    # Pre-tokenization: split on whitespace, isolate punctuation, and give
    # every digit its own token
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.WhitespaceSplit(),
        pre_tokenizers.Punctuation(),
        pre_tokenizers.Digits(individual_digits=True)
    ])

    # BPE trainer; special tokens are added to the vocabulary up front and are
    # never split during training
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=["<|endoftext|>", "<pad>", "<unk>", "<mask>", "[CITATION]"],
        continuing_subword_prefix=""  # no "##"-style marker on word-internal subwords
    )

    tokenizer.train_from_iterator(iterator, trainer=trainer)

    # ByteLevel decoding is designed to pair with a ByteLevel pre-tokenizer;
    # with the whitespace splitting above it simply concatenates tokens
    tokenizer.decoder = decoders.ByteLevel()
    return tokenizer
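

# Minimal usage sketch (illustrative, not from the original): the corpus file
# name and batching below are assumptions; train_from_iterator accepts any
# iterator yielding strings or batches (lists) of strings.
def corpus_iterator(path="corpus.txt", batch_size=1000):
    with open(path, encoding="utf-8") as f:
        batch = []
        for line in f:
            batch.append(line.rstrip("\n"))
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:
            yield batch


if __name__ == "__main__":
    tokenizer = train_tokenizer(corpus_iterator(), vocab_size=50000)
    tokenizer.save("tokenizer.json")  # single-file JSON serialization
    print(tokenizer.encode("An example sentence, with 42 digits.").tokens)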