ethanlshen committed on
Commit
c57b12c
·
verified ·
1 Parent(s): 86a0d38

Update superposed/llama/tokenizer.py

Browse files
Files changed (1) hide show
  1. superposed/llama/tokenizer.py +6 -5
superposed/llama/tokenizer.py CHANGED
@@ -35,7 +35,7 @@ class Tokenizer:
35
  )
36
  assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
37
 
38
- def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
39
  """
40
  Encodes a string into a list of token IDs.
41
 
@@ -47,15 +47,16 @@ class Tokenizer:
47
  Returns:
48
  List[int]: A list of token IDs.
49
  """
50
- assert type(s) is str
51
  t = self.sp_model.encode(s)
52
  if bos:
53
- t = [self.bos_id] + t
 
54
  if eos:
55
- t = t + [self.eos_id]
 
56
  return t
57
 
58
- def decode(self, t: List[int]) -> str:
59
  """
60
  Decodes a list of token IDs into a string.
61
 
 
35
  )
36
  assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
37
 
38
+ def encode(self, s, bos: bool, eos: bool):
39
  """
40
  Encodes a string into a list of token IDs.
41
 
 
47
  Returns:
48
  List[int]: A list of token IDs.
49
  """
 
50
  t = self.sp_model.encode(s)
51
  if bos:
52
+ for i in range(len(t)):
53
+ t[i] = [self.bos_id] + t[i]
54
  if eos:
55
+ for i in range(len(t)):
56
+ t[i] = t[i] + [self.eos_id]
57
  return t
58
 
59
+ def decode(self, t):
60
  """
61
  Decodes a list of token IDs into a string.
62