# GPT2-PBE / train_tokenizer.py
from tokenizers import Regex, Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
from datasets import load_dataset

# Custom pre-tokenization regex for code: string literals, line and block
# comments, common keywords, operators, numbers, punctuation, letters,
# digits, and whitespace
code_regex = r"""'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|//.*|\/\*[\s\S]*?\*\/|\b(?:if|else|for|while|return|function)\b|[<>]=?|\+{1,2}|-{1,2}|&&|\|\||[!*/%^&|=-]|\d+\.\d+|\d+|\.\d+|[:;,.{}[\]()]|\p{L}+|\p{N}+|\s+|\S"""
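
# Illustrative sanity check (not part of the original file): the Split
# pre-tokenizer built from this regex segments source code before BPE runs.
# >>> pt = pre_tokenizers.Split(pattern=Regex(code_regex), behavior="isolated")
# >>> pt.pre_tokenize_str('if (x >= 10) { return "ok"; }')
# [('if', (0, 2)), (' ', (2, 3)), ('(', (3, 4)), ('x', (4, 5)), ...]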
def train_tokenizer(iterator, vocab_size=32000, min_frequency=2):
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

    # Unicode normalization (NFC only)
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFC(),
        # normalizers.StripAccents(),  # optional: strips accents/tonos marks
    ])
    # Custom pre-tokenizer: split on the code regex first, then byte-level
    # encode the pieces. tokenizers.Regex is used instead of re.compile
    # because Python's re module does not support \p{L}/\p{N}.
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.Split(pattern=Regex(code_regex), behavior="isolated"),
        pre_tokenizers.ByteLevel(add_prefix_space=False),
    ])
    # BPE trainer configuration
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=["<|endoftext|>", "<pad>", "<unk>", "<mask>"],
        continuing_subword_prefix="",
        show_progress=True,
    )
    tokenizer.train_from_iterator(iterator, trainer=trainer)

    # Byte-level decoder reverses the ByteLevel pre-tokenization on decode
    tokenizer.decoder = decoders.ByteLevel()
    return tokenizer
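

# --- Usage sketch (assumption: not part of the original file) ---
# A hedged example of how this helper might be driven: stream text from a
# Hugging Face dataset (the dataset name below is a placeholder, not the
# corpus actually used for GPT2-PBE) and save the trained tokenizer. It
# relies only on the imports already present above.
if __name__ == "__main__":
    # Placeholder corpus; replace with the intended training data.
    dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")

    def text_iterator(batch_size=1000):
        # Yield batches of raw strings; train_from_iterator accepts an
        # iterator of strings or of lists of strings.
        for i in range(0, len(dataset), batch_size):
            yield dataset[i : i + batch_size]["text"]

    tokenizer = train_tokenizer(text_iterator())
    tokenizer.save("tokenizer.json")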