Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,261 Bytes
bcc039b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# Copyright (c) Meta Platforms, Inc. and affiliates.
import json
from bytelatent.constants import BLT_DATA
from bytelatent.tokenizers.blt_tokenizer import BltTokenizer
from bytelatent.tokenizers.build_tokenizer import TokenizerArgs
def test_tokenizer_bytes():
    """Byte-level (non-BPE-delimited) encoding must match the recorded fixture tokens."""
    with open("fixtures/tokenizer_data.json") as f:
        data = json.load(f)
    examples: list[str] = data["texts"]
    examples_tokens: list[list[int]] = data["tokens"]
    # Guard against a malformed fixture: zip() would silently drop trailing items.
    assert len(examples) == len(examples_tokens)
    tokenizer = BltTokenizer(bpe_delim=False)
    for text, expected in zip(examples, examples_tokens):
        assert tokenizer.encode(text) == expected
def test_tokenizer_bpe():
    """BPE-delimited encoding must match the recorded fixture tokens."""
    with open("fixtures/tokenizer_data_bpe_delim.json") as f:
        data = json.load(f)
    examples: list[str] = data["texts"]
    examples_tokens: list[list[int]] = data["tokens"]
    # Guard against a malformed fixture: zip() would silently drop trailing items.
    assert len(examples) == len(examples_tokens)
    tokenizer = BltTokenizer(bpe_delim=True)
    for text, expected in zip(examples, examples_tokens):
        assert tokenizer.encode(text) == expected
def test_build_tokenizer_from_args():
    """A tokenizer built via TokenizerArgs should be usable for encoding."""
    args = TokenizerArgs(
        name="blt",
        init_kwargs={
            "bpe_tokenizer_path": BLT_DATA / "tokenizer_final_32k.minus_inf_ws.model"
        },
    )
    built = args.build()
    tokens = built.encode("test text")
    assert tokens is not None
|