Add vocab and seq len abstract fields (#66)
bytelatent/base_transformer.py
CHANGED
@@ -1,4 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
+import abc
 import logging
 import os
 from enum import Enum
@@ -572,7 +573,13 @@ class TransformerBlock(nn.Module):
         self.ffn_norm.reset_parameters()


-class BaseTransformer(nn.Module):
+class SequenceModelWithOutput(abc.ABC):
+    @abc.abstractmethod
+    def get_output_seq_len(self) -> int:
+        pass
+
+
+class BaseTransformer(nn.Module, SequenceModelWithOutput):
     def __init__(self, args: BaseTransformerArgs):
         super().__init__()
         self.dim = args.dim
@@ -593,6 +600,9 @@ class BaseTransformer(nn.Module):
         for _ in range(args.n_layers):
             self.layers.append(TransformerBlock(args))

+    def get_output_seq_len(self):
+        return self.max_seqlen
+
     def forward(
         self,
         h,
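The new SequenceModelWithOutput ABC lets downstream code ask any model for its output sequence length without importing a concrete class. A minimal sketch of a consumer, assuming only the interface added above; the allocate_logit_buffer helper and its tensor shape are illustrative, not part of this commit:

# Sketch only: assumes the SequenceModelWithOutput ABC from this diff.
# allocate_logit_buffer is hypothetical and not part of the PR.
import torch

from bytelatent.base_transformer import SequenceModelWithOutput


def allocate_logit_buffer(
    model: SequenceModelWithOutput, batch_size: int, vocab_size: int
) -> torch.Tensor:
    # Works for any model implementing the interface (BaseTransformer,
    # ByteLatentTransformer, ...), since get_output_seq_len() is abstract.
    seq_len = model.get_output_seq_len()
    return torch.empty(batch_size, seq_len, vocab_size)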
bytelatent/model/blt.py
CHANGED
@@ -12,6 +12,7 @@ from typing_extensions import Self
 from bytelatent.base_transformer import (
     BaseTransformerArgs,
     InitStdFactor,
+    SequenceModelWithOutput,
     TransformerBlock,
 )
 from bytelatent.data.patcher import Patcher, PatcherArgs
@@ -766,7 +767,7 @@ def compute_hash_embeddings(
     return local_encoder_embeds


-class ByteLatentTransformer(nn.Module):
+class ByteLatentTransformer(nn.Module, SequenceModelWithOutput):
     """
     The ByteLatentTransformer (BLT) is a byte-level language model architecture that processes byte sequences
     by dynamically segmenting them into patches. It uses a combination of local encoders, global transformers,
@@ -856,6 +857,9 @@ class ByteLatentTransformer(nn.Module):
             )
         )

+    def get_output_seq_len(self):
+        return self.max_seqlen
+
     def forward(
         self,
         tokens: torch.Tensor,
bytelatent/tokenizers/abstract_tokenizer.py
CHANGED
@@ -17,3 +17,7 @@ class Tokenizer(abc.ABC):
     ) -> tuple[list[str], list[int]]:
         """Return the offsets of the tokens in the original text. Only used for evaluation."""
         pass
+
+    @abc.abstractmethod
+    def get_vocab_size(self) -> int:
+        pass
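Declaring get_vocab_size as an abstract method means a Tokenizer subclass that omits it fails at instantiation rather than at first use. A standalone illustration of that pattern, using a toy MiniTokenizer rather than the repo's Tokenizer class (names and the fixed vocab size are for illustration only):

# Standalone sketch of the abstract-method pattern; not the repo's Tokenizer.
import abc


class MiniTokenizer(abc.ABC):
    @abc.abstractmethod
    def get_vocab_size(self) -> int:
        pass


class IncompleteTokenizer(MiniTokenizer):
    pass  # forgets to implement get_vocab_size


class ByteTokenizer(MiniTokenizer):
    def get_vocab_size(self) -> int:
        return 256  # raw byte vocabulary


# IncompleteTokenizer() raises TypeError: can't instantiate abstract class.
print(ByteTokenizer().get_vocab_size())  # 256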
bytelatent/tokenizers/blt_tokenizer.py
CHANGED
@@ -101,6 +101,9 @@ class BltTokenizer(Tokenizer):
         self.vocab_size_unit_1 = vocab_size_unit_1
         self.n_words = vocab_size_unit_1 + self.offsetting_special_char

+    def get_vocab_size(self) -> int:
+        return self.n_words
+
     def encode(
         self, text: str, add_bos: bool | None = None, add_eos: bool | None = None
     ):
bytelatent/tokenizers/sentence_piece_tokenizer.py
CHANGED
@@ -35,6 +35,9 @@ class SentencePieceTokenizer(Tokenizer):
         )
         assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

+    def get_vocab_size(self) -> int:
+        return self.n_words
+
     def encode(self, s: str, add_bos: bool | None = None, add_eos: bool | None = None):
         if add_bos is None:
             add_bos = self.add_bos
bytelatent/tokenizers/tiktoken_tokenizer.py
CHANGED
@@ -53,6 +53,9 @@ class TikTokenTokenizer(Tokenizer):
             f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
         )

+    def get_vocab_size(self) -> int:
+        return self.n_words
+
     def encode(self, s: str, add_bos: bool, add_eos: bool):
         assert isinstance(s, str)

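With get_vocab_size implemented on BltTokenizer, SentencePieceTokenizer, and TikTokenTokenizer, calling code can derive shapes from whichever tokenizer is configured instead of hard-coding a vocabulary size. A hedged sketch; build_output_head and its use of nn.Linear are illustrative assumptions, not code from this commit:

# Sketch: sizing an output projection from the new abstract accessor.
# build_output_head is hypothetical; only get_vocab_size() comes from this PR.
import torch.nn as nn

from bytelatent.tokenizers.abstract_tokenizer import Tokenizer


def build_output_head(tokenizer: Tokenizer, dim: int) -> nn.Linear:
    # Any Tokenizer subclass (BltTokenizer, SentencePieceTokenizer,
    # TikTokenTokenizer) now exposes its vocab size through the same method.
    return nn.Linear(dim, tokenizer.get_vocab_size())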