File size: 1,261 Bytes
bcc039b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# Copyright (c) Meta Platforms, Inc. and affiliates.
import json

from bytelatent.constants import BLT_DATA
from bytelatent.tokenizers.blt_tokenizer import BltTokenizer
from bytelatent.tokenizers.build_tokenizer import TokenizerArgs


def test_tokenizer_bytes():
    """Golden-file test: byte-level encoding (bpe_delim=False) must match
    the token sequences recorded in fixtures/tokenizer_data.json."""
    with open("fixtures/tokenizer_data.json", encoding="utf-8") as f:
        data = json.load(f)

    examples: list[str] = data["texts"]
    examples_tokens: list[list[int]] = data["tokens"]
    # Guard against a malformed fixture: zip would otherwise silently
    # truncate to the shorter list and hide missing cases.
    assert len(examples) == len(examples_tokens)

    tokenizer = BltTokenizer(bpe_delim=False)
    for text, expected_tokens in zip(examples, examples_tokens):
        assert tokenizer.encode(text) == expected_tokens


def test_tokenizer_bpe():
    """Golden-file test: BPE-delimited encoding (bpe_delim=True) must match
    the token sequences recorded in fixtures/tokenizer_data_bpe_delim.json."""
    with open("fixtures/tokenizer_data_bpe_delim.json", encoding="utf-8") as f:
        data = json.load(f)

    examples: list[str] = data["texts"]
    examples_tokens: list[list[int]] = data["tokens"]
    # Guard against a malformed fixture: zip would otherwise silently
    # truncate to the shorter list and hide missing cases.
    assert len(examples) == len(examples_tokens)

    tokenizer = BltTokenizer(bpe_delim=True)
    for text, expected_tokens in zip(examples, examples_tokens):
        assert tokenizer.encode(text) == expected_tokens


def test_build_tokenizer_from_args():
    """Smoke test: a BLT tokenizer built via TokenizerArgs can encode text."""
    bpe_path = BLT_DATA / "tokenizer_final_32k.minus_inf_ws.model"
    args = TokenizerArgs(name="blt", init_kwargs={"bpe_tokenizer_path": bpe_path})
    built = args.build()
    # Only checks construction + a non-None encode result, not exact tokens.
    assert built.encode("test text") is not None