File size: 371 Bytes
b410583
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
from tokenizers import ByteLevelBPETokenizer

# Vocabulary and merge-rule files, given relative to the current working
# directory.
vocab_path = "./salesforce/codet5-vocab.json"
merges_path = "./salesforce/codet5-merges.txt"

# Build the byte-level BPE tokenizer from the two files on disk.
tokenizer = ByteLevelBPETokenizer.from_file(vocab_path, merges_path)

# Register the marker strings as special tokens on the tokenizer.
special_tokens = ["<pad>", "<s>", "</s>", "<unk>", "<mask>"]
tokenizer.add_special_tokens(special_tokens)

# Smoke check: encode one sentence that mixes special tokens, an apostrophe
# and an emoji, then print the resulting token strings.
sample_text = "<s> hello <unk> Don't you love 🤗 Transformers <mask> yes . </s>"
encoding = tokenizer.encode(sample_text)
print(encoding.tokens)