from tokenizers import Regex, Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
from datasets import load_dataset

# Splits source code into coarse pieces before BPE: string literals, line and
# block comments, common keywords, operators, numbers, punctuation, runs of
# letters/digits, whitespace, and a catch-all for any remaining character.
# The \p{L}/\p{N} Unicode classes are handled by the tokenizers regex engine
# (Python's built-in `re` module does not support them).
code_regex = r"""'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|//.*|\/\*[\s\S]*?\*\/|\b(?:if|else|for|while|return|function)\b|[<>]=?|\+{1,2}|-{1,2}|&&|\|\||[!*/%^&|=-]|\d+\.\d+|\d+|\.\d+|[:;,.{}[\]()]|\p{L}+|\p{N}+|\s+|\S"""


def train_tokenizer(iterator, vocab_size=32000, min_frequency=2):
    """Train a byte-level BPE tokenizer over an iterator of code strings."""
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

    # Canonical Unicode composition so equivalent characters share one form.
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFC(),
    ])

    # Split on the code-aware regex first (wrapped in tokenizers.Regex so the
    # pattern is interpreted as a regex rather than a literal string), then
    # map every piece onto byte-level symbols.
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.Split(pattern=Regex(code_regex), behavior="isolated"),
        pre_tokenizers.ByteLevel(add_prefix_space=False),
    ])

    # Seed the alphabet with all 256 byte-level symbols so every input byte is
    # representable and never falls back to <unk>.
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=["<|endoftext|>", "<pad>", "<unk>", "<mask>"],
        continuing_subword_prefix="",
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        show_progress=True,
    )

    tokenizer.train_from_iterator(iterator, trainer=trainer)

    # ByteLevel decoder converts the byte-level symbols back into text.
    tokenizer.decoder = decoders.ByteLevel()
    return tokenizer
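

# Example usage: a minimal sketch, not part of the original script. The
# dataset name ("bigcode/the-stack-smol") and its "content" column are
# assumptions; substitute whatever code corpus and text field you train on.
if __name__ == "__main__":
    dataset = load_dataset("bigcode/the-stack-smol", split="train")

    def batch_iterator(batch_size=1000):
        # Yield batches of raw code strings for train_from_iterator.
        for i in range(0, len(dataset), batch_size):
            yield dataset[i : i + batch_size]["content"]

    tokenizer = train_tokenizer(batch_iterator())
    tokenizer.save("tokenizer.json")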