import os
from tokenizers import ByteLevelBPETokenizer

paths = ['train_code.txt', 'train_doc.txt']
# Initialize a byte-level BPE tokenizer
tokenizer = ByteLevelBPETokenizer()
# Customize training
tokenizer.train(files=paths, vocab_size=32000, min_frequency=3, special_tokens=[
"<pad>",
"<s>",
"</s>",
"<unk>",
"<mask>"
])
# Save vocab and merges files to disk (ensure the output directory exists)
os.makedirs("./salesforce", exist_ok=True)
tokenizer.save_model("./salesforce", "codet5")
print(
tokenizer.encode("<s> hello <unk> Don't you love 🤗 Transformers <mask> yes . </s>").tokens
)
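
# Optional sanity check (illustrative sketch): save_model() writes
# "<prefix>-vocab.json" and "<prefix>-merges.txt" to the output directory,
# so the trained tokenizer can be reloaded directly from those two files.
reloaded = ByteLevelBPETokenizer(
    "./salesforce/codet5-vocab.json",
    "./salesforce/codet5-merges.txt",
)
print(reloaded.encode("def add(a, b): return a + b").tokens)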