kishkath committed on
Commit 5740893 · verified · 1 Parent(s): ad067b3

Upload 2 files

Files changed (2)
  1. tokenizers/basic.py +73 -0
  2. tokenizers/bpe.py +164 -0
tokenizers/basic.py ADDED
@@ -0,0 +1,73 @@
+from tokenizers.bpe import Tokenizer, get_stats, merge
+import json
+
+
+class BasicTokenizer(Tokenizer):
+    def __init__(self):
+        super().__init__()
+        self.token_to_id = {}
+        self.id_to_token = {}
+
+    def train(self, text, vocab_size, verbose=False):
+        assert 256 <= vocab_size <= 5000
+        num_merges = vocab_size - 256
+
+        text_bytes = text.encode('utf-8')
+        ids = list(text_bytes)
+
+        merges = {}
+        vocab = {idx: bytes([idx]) for idx in range(256)}
+
+        # Initialize token mappings
+        self.token_to_id = {bytes([idx]).decode('utf-8', errors='replace'): idx for idx in range(256)}
+        self.id_to_token = {idx: bytes([idx]).decode('utf-8', errors='replace') for idx in range(256)}
+
+        for i in range(num_merges):
+            stats = get_stats(ids)
+            if not stats:
+                break
+            pair = max(stats, key=stats.get)
+            idx = 256 + i
+            ids = merge(ids, pair, idx)
+            merges[pair] = idx
+            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
+
+            # Update token mappings
+            token = vocab[idx].decode('utf-8', errors='replace')
+            self.token_to_id[token] = idx
+            self.id_to_token[idx] = token
+
+            if verbose and i % 20 == 0:
+                print(f"merge {i + 1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
+
+        self.merges = merges
+        self.vocab = vocab
+        self._save_vocabulary()
+
+    def _save_vocabulary(self):
+        vocabulary = {
+            'token_to_id': self.token_to_id,
+            'id_to_token': self.id_to_token,
+            'merges': {f"{k[0]},{k[1]}": v for k, v in self.merges.items()}
+        }
+        with open('vocabulary.json', 'w', encoding='utf-8') as f:
+            json.dump(vocabulary, f, ensure_ascii=False, indent=2)
+
+    def decode(self, ids):
+        text_bytes = b"".join(self.vocab[idx] for idx in ids)
+        text = text_bytes.decode("utf-8", errors="replace")
+        return text
+
+    def encode(self, text):
+        text_bytes = text.encode("utf-8")
+        ids = list(text_bytes)
+        while len(ids) >= 2:
+            stats = get_stats(ids)
+            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
+
+            if pair not in self.merges:
+                break
+
+            idx = self.merges[pair]
+            ids = merge(ids, pair, idx)
+        return ids
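
For reference, a minimal usage sketch of the BasicTokenizer above. It assumes the repository root is on the Python path so the tokenizers package resolves; the training text and vocab_size are illustrative. Note that train() also writes vocabulary.json to the current working directory via _save_vocabulary().

from tokenizers.basic import BasicTokenizer

text = "low lower lowest newer newest " * 100   # illustrative training text
tok = BasicTokenizer()
tok.train(text, vocab_size=300, verbose=True)   # learns up to 300 - 256 = 44 merges

ids = tok.encode("lowest newer")
print(ids)               # byte and merge token ids
print(tok.decode(ids))   # round-trips back to "lowest newer"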
tokenizers/bpe.py ADDED
@@ -0,0 +1,164 @@
+"""
+Contains the base Tokenizer class and a few common helper functions.
+The base class also contains the (common) save/load functionality.
+It would be possible to be a lot more strict about the interface and
+e.g. isolate all regex/pattern parts in the RegexTokenizer, but
+some concessions are made for simplicity.
+"""
+import unicodedata
+
+# -----------------------------------------------------------------------------
+# a few helper functions useful for both BasicTokenizer and RegexTokenizer
+
+def get_stats(ids, counts=None):
+    """
+    Given a list of integers, return a dictionary of counts of consecutive pairs.
+    Example: [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
+    Optionally updates an existing dictionary of counts.
+    """
+    counts = {} if counts is None else counts
+    for pair in zip(ids, ids[1:]):  # iterate consecutive elements
+        counts[pair] = counts.get(pair, 0) + 1
+    return counts
+
+
+def merge(ids, pair, idx):
+    """
+    In the list of integers (ids), replace all consecutive occurrences
+    of pair with the new integer token idx.
+    Example: ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4]
+    """
+    newids = []
+    i = 0
+    while i < len(ids):
+        # if not at the very last position AND the pair matches, replace it
+        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
+            newids.append(idx)
+            i += 2
+        else:
+            newids.append(ids[i])
+            i += 1
+    return newids
+
+# two more helper functions, used below for pretty-printing tokens when saving the vocab
+def replace_control_characters(s: str) -> str:
+    # we don't want to print control characters
+    # which distort the output (e.g. \n or much worse)
+    # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python/19016117#19016117
+    # http://www.unicode.org/reports/tr44/#GC_Values_Table
+    chars = []
+    for ch in s:
+        if unicodedata.category(ch)[0] != "C":
+            chars.append(ch)  # this character is ok
+        else:
+            chars.append(f"\\u{ord(ch):04x}")  # escape
+    return "".join(chars)
+
+def render_token(t: bytes) -> str:
+    # pretty print a token, escaping control characters
+    s = t.decode('utf-8', errors='replace')
+    s = replace_control_characters(s)
+    return s
+
+# -----------------------------------------------------------------------------
+# the base Tokenizer class
+
+class Tokenizer:
+    """Base class for Tokenizers"""
+
+    def __init__(self):
+        # default: vocab size of 256 (all bytes), no merges, no patterns
+        self.merges = {}  # (int, int) -> int
+        self.pattern = ""  # str
+        self.special_tokens = {}  # str -> int, e.g. {'<|endoftext|>': 100257}
+        self.vocab = self._build_vocab()  # int -> bytes
+
+    def train(self, text, vocab_size, verbose=False):
+        # Tokenizer can train a vocabulary of size vocab_size from text
+        raise NotImplementedError
+
+    def encode(self, text):
+        # Tokenizer can encode a string into a list of integers
+        raise NotImplementedError
+
+    def decode(self, ids):
+        # Tokenizer can decode a list of integers into a string
+        raise NotImplementedError
+
+    def _build_vocab(self):
+        # vocab is simply and deterministically derived from merges
+        vocab = {idx: bytes([idx]) for idx in range(256)}
+        for (p0, p1), idx in self.merges.items():
+            vocab[idx] = vocab[p0] + vocab[p1]
+        for special, idx in self.special_tokens.items():
+            vocab[idx] = special.encode("utf-8")
+        return vocab
+
+    def save(self, file_prefix):
+        """
+        Saves two files: file_prefix.vocab and file_prefix.model
+        This is inspired by (but not equivalent to!) sentencepiece's model saving:
+        - the model file is the critical one, intended for load()
+        - the vocab file is just a pretty-printed version for human inspection only
+        """
+        # write the model: to be used in load() later
+        model_file = file_prefix + ".model"
+        with open(model_file, 'w') as f:
+            # write the version, pattern and merges, that's all that's needed
+            f.write("minbpe v1\n")
+            f.write(f"{self.pattern}\n")
+            # write the special tokens, first the number of them, then each one
+            f.write(f"{len(self.special_tokens)}\n")
+            for special, idx in self.special_tokens.items():
+                f.write(f"{special} {idx}\n")
+            # the merges dict
+            for idx1, idx2 in self.merges:
+                f.write(f"{idx1} {idx2}\n")
+        # write the vocab: for the human to look at
+        vocab_file = file_prefix + ".vocab"
+        inverted_merges = {idx: pair for pair, idx in self.merges.items()}
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            for idx, token in self.vocab.items():
+                # note: many tokens may be partial utf-8 sequences
+                # and cannot be decoded into valid strings. Here we're using
+                # errors='replace' to replace them with the replacement char �.
+                # this also means that we couldn't possibly use .vocab in load()
+                # because decoding in this way is a lossy operation!
+                s = render_token(token)
+                # find the children of this token, if any
+                if idx in inverted_merges:
+                    # if this token has children, render it nicely as a merge
+                    idx0, idx1 = inverted_merges[idx]
+                    f.write(f"[{render_token(self.vocab[idx0])}][{render_token(self.vocab[idx1])}] -> [{s}] {idx}\n")
+                else:
+                    # otherwise this is a leaf token, just print it
+                    # (this should just be the first 256 tokens, the bytes)
+                    f.write(f"[{s}] {idx}\n")
+
+    def load(self, model_file):
+        """Inverse of save() but only for the model file"""
+        assert model_file.endswith(".model")
+        # read the model file
+        merges = {}
+        special_tokens = {}
+        idx = 256
+        with open(model_file, 'r', encoding="utf-8") as f:
+            # read the version
+            version = f.readline().strip()
+            assert version == "minbpe v1"
+            # read the pattern
+            self.pattern = f.readline().strip()
+            # read the special tokens
+            num_special = int(f.readline().strip())
+            for _ in range(num_special):
+                special, special_idx = f.readline().strip().split()
+                special_tokens[special] = int(special_idx)
+            # read the merges
+            for line in f:
+                idx1, idx2 = map(int, line.split())
+                merges[(idx1, idx2)] = idx
+                idx += 1
+        self.merges = merges
+        print('lenmerges: ', len(self.merges))
+        self.special_tokens = special_tokens
+        self.vocab = self._build_vocab()
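
For reference, a minimal sketch of the helpers and the base-class save/load round trip above. The file prefix "my_tok", the sample ids, and the training text are illustrative. load() reassigns merge ids sequentially from 256 in file order, which matches training order because Python dicts preserve insertion order.

from tokenizers.bpe import get_stats, merge
from tokenizers.basic import BasicTokenizer

# helper semantics, matching the docstrings
print(get_stats([1, 2, 3, 1, 2]))         # {(1, 2): 2, (2, 3): 1, (3, 1): 1}
print(merge([1, 2, 3, 1, 2], (1, 2), 4))  # [4, 3, 4]

# save/load round trip through the base Tokenizer machinery
tok = BasicTokenizer()
tok.train("some illustrative training text " * 50, vocab_size=280)
tok.save("my_tok")          # writes my_tok.model and my_tok.vocab

tok2 = BasicTokenizer()
tok2.load("my_tok.model")   # restores pattern, special tokens, merges, and vocab
assert tok2.decode(tok2.encode("illustrative text")) == "illustrative text"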