File size: 497 Bytes
b410583
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import os

from tokenizers import ByteLevelBPETokenizer

# Train a byte-level BPE tokenizer on the two text files below, write the
# resulting model files to disk, and print the tokenization of a sample
# sentence as a quick visual check.

# Training corpora; the paths are relative to the current working directory.
paths = ['train_code.txt', 'train_doc.txt']

# Destination directory for the trained model files and the filename prefix
# handed to save_model().
output_dir = "./salesforce"
model_prefix = "codet5"

# Special tokens handed to the trainer.
special_tokens = [
    "<pad>",
    "<s>",
    "</s>",
    "<unk>",
    "<mask>",
]

# Create the output directory *before* training. save_model() writes into an
# existing directory and is expected to fail if it is missing; discovering
# that only after training has finished would throw the trained model away.
os.makedirs(output_dir, exist_ok=True)

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(
    files=paths,
    vocab_size=32000,
    min_frequency=3,
    special_tokens=special_tokens,
)

# Save files to disk
tokenizer.save_model(output_dir, model_prefix)

# Print the tokens produced for a sentence that mixes special tokens, plain
# words, an apostrophe and an emoji, so the result can be inspected by eye.
print(
    tokenizer.encode("<s> hello <unk> Don't you love 🤗 Transformers <mask> yes . </s>").tokens
)