# File size: 371 Bytes
# Revision: b410583
from tokenizers import ByteLevelBPETokenizer
# Build a byte-level BPE tokenizer from the local CodeT5 vocabulary and merges
# files. Both paths are relative to the current working directory.
tokenizer = ByteLevelBPETokenizer.from_file(
    "./salesforce/codet5-vocab.json",
    "./salesforce/codet5-merges.txt",
)

# Register the control markers as special tokens; the sample sentence below
# contains several of them.
tokenizer.add_special_tokens(
    [
        "<pad>",
        "<s>",
        "</s>",
        "<unk>",
        "<mask>",
    ]
)

# Smoke test: encode a sample sentence that mixes special tokens, an
# apostrophe and an emoji, then print the resulting token strings.
print(
    tokenizer.encode(
        "<s> hello <unk> Don't you love 🤗 Transformers <mask> yes . </s>"
    ).tokens
)