Spaces:
Running
on
Zero
Running
on
Zero
File size: 601 Bytes
bcc039b ff36aa8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
# Copyright (c) Meta Platforms, Inc. and affiliates.
import abc
class Tokenizer(abc.ABC):
@abc.abstractmethod
def encode(self, text: str, add_bos: bool, add_eos: bool):
pass
@abc.abstractmethod
def decode(self, tokens: list[int]):
pass
@abc.abstractmethod
def get_token_offsets(
self, text: str, tokens: list[int] | None = None
) -> tuple[list[str], list[int]]:
"""Return the offsets of the tokens in the original text. Only used for evaluation."""
pass
@abc.abstractmethod
def get_vocab_size(self) -> int:
pass
|