Spaces:

nvidia
/

Plan2Align-NV

Sleeping

File size: 21,216 Bytes

2aebc50

// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "builder.h"

#include <algorithm>
#include <functional>
#include <utility>

#include "filesystem.h"
#include "third_party/absl/strings/str_join.h"
#include "third_party/absl/strings/str_replace.h"
#include "third_party/absl/strings/str_split.h"
#include "third_party/absl/strings/strip.h"

#ifdef ENABLE_NFKC_COMPILE
#include <unicode/errorcode.h>
#include <unicode/locid.h>
#include <unicode/normlzr.h>
#include <unicode/numfmt.h>
#include <unicode/rbnf.h>
#include <unicode/utypes.h>
#endif  // ENABLE_NFKC_COMPILE

#include <set>

#include "normalization_rule.h"
#include "normalizer.h"
#include "third_party/darts_clone/darts.h"
#include "util.h"

namespace sentencepiece {
namespace normalizer {
namespace {

// Largest Unicode code point (U+10FFFF).
// NOTE(review): several functions in this file declare a local constant with
// the same name and value, which shadows this one.
constexpr int kMaxUnicode = 0x10FFFF;

// Name of the default normalization rule.
// NOTE(review): not referenced in the visible part of this file — confirm
// whether it is still needed.
static constexpr absl::string_view kDefaultNormalizerName = "nfkc";

#ifndef ENABLE_NFKC_COMPILE
// Message logged by the Build*Map() entry points when this file is compiled
// without ENABLE_NFKC_COMPILE, i.e. without the ICU-backed code paths.
static constexpr absl::string_view kCompileError =
    "NFK compile is not enabled. rebuild with -DSPM_ENABLE_NFKC_COMPILE=ON";
#endif

#ifdef ENABLE_NFKC_COMPILE
// Runs ICU normalization in the given `mode` over `input` and returns the
// normalized code points. Aborts (CHECK) when `input` encodes to an empty
// UTF-8 string or when ICU reports a failure.
Builder::Chars UnicodeNormalize(UNormalizationMode mode,
                                const Builder::Chars &input) {
  // ICU operates on its own string type, so round-trip through UTF-8.
  const std::string encoded_input = string_util::UnicodeTextToUTF8(input);
  CHECK(!encoded_input.empty());
  icu::UnicodeString icu_input =
      icu::UnicodeString::fromUTF8(encoded_input.c_str());

  icu::UnicodeString icu_output;
  UErrorCode icu_status = U_ZERO_ERROR;
  icu::Normalizer::normalize(icu_input, mode, 0, icu_output, icu_status);
  CHECK(U_SUCCESS(icu_status));

  // Pre-size the buffer at three bytes per ICU code unit before encoding.
  std::string encoded_output;
  encoded_output.reserve(icu_output.length() * 3);
  icu_output.toUTF8String(encoded_output);
  return string_util::UTF8ToUnicodeText(encoded_output);
}

// Returns `input` normalized with ICU's UNORM_NFKD mode.
Builder::Chars ToNFKD(const Builder::Chars &input) {
  constexpr UNormalizationMode kMode = UNORM_NFKD;
  return UnicodeNormalize(kMode, input);
}

// Returns `input` normalized with ICU's UNORM_NFKC mode.
Builder::Chars ToNFKC(const Builder::Chars &input) {
  constexpr UNormalizationMode kMode = UNORM_NFKC;
  return UnicodeNormalize(kMode, input);
}

// Returns `input` normalized with ICU's UNORM_NFC mode.
Builder::Chars ToNFC(const Builder::Chars &input) {
  constexpr UNormalizationMode kMode = UNORM_NFC;
  return UnicodeNormalize(kMode, input);
}

// Returns `input` normalized with ICU's UNORM_NFD mode.
Builder::Chars ToNFD(const Builder::Chars &input) {
  constexpr UNormalizationMode kMode = UNORM_NFD;
  return UnicodeNormalize(kMode, input);
}

// Enumerates every un-normalized sequence that normalizes to `nfkd`.
//
// `norm2orig` maps a normalized character to the set of original characters
// that normalize to it. The result is the Cartesian product of those sets,
// taken position by position over `nfkd`, so every returned sequence has the
// same length as `nfkd`. `nfkd` must be non-empty, and each of its characters
// is looked up in `norm2orig` with port::FindOrDie.
std::vector<Builder::Chars> ExpandUnnormalized(
    const Builder::Chars &nfkd,
    const std::map<char32, std::set<char32>> &norm2orig) {
  CHECK(!nfkd.empty());

  // Seed with one single-character sequence per original of the first char.
  std::vector<Builder::Chars> expanded;
  for (const char32 head : port::FindOrDie(norm2orig, nfkd[0])) {
    expanded.push_back(Builder::Chars{head});
  }

  // Extend every partial sequence with each original of the next character.
  for (auto pos = nfkd.begin() + 1; pos != nfkd.end(); ++pos) {
    const auto &candidates = port::FindOrDie(norm2orig, *pos);
    std::vector<Builder::Chars> extended;
    for (const auto &prefix : expanded) {
      for (const char32 tail : candidates) {
        Builder::Chars sequence = prefix;
        sequence.push_back(tail);
        extended.push_back(std::move(sequence));
      }
    }
    expanded = std::move(extended);
  }

  CHECK_EQ(nfkd.size(), expanded[0].size());
  return expanded;
}
#endif  // ENABLE_NFKC_COMPILE

// Applies the rules in `chars_map` to `src` greedily from left to right and
// returns the rewritten sequence. At each position the longest key of at most
// `max_len` characters that exists in `chars_map` wins; when no key matches,
// the current character is copied through unchanged. `max_len` must be >= 1.
Builder::Chars Normalize(const Builder::CharsMap &chars_map,
                         const Builder::Chars &src, int max_len) {
  CHECK_GE(max_len, 1);
  Builder::Chars output;

  size_t pos = 0;
  while (pos < src.size()) {
    const size_t window_end = std::min<size_t>(pos + max_len, src.size());

    // Try the widest window first and shrink it one character at a time.
    auto match = chars_map.end();
    for (Builder::Chars candidate(src.begin() + pos, src.begin() + window_end);
         !candidate.empty(); candidate.pop_back()) {
      match = chars_map.find(candidate);
      if (match != chars_map.end()) break;
    }

    if (match != chars_map.end()) {
      // A rule matched: emit its target and skip the whole matched key.
      output.insert(output.end(), match->second.begin(), match->second.end());
      pos += match->first.size();
    } else {
      // No rule applies here: pass one character through.
      output.push_back(src[pos]);
      ++pos;
    }
  }

  return output;
}
}  // namespace

// static
// Serializes `chars_map` into the precompiled binary form and stores it in
// `output`.
//
// The blob combines a Darts double-array trie, keyed by the UTF-8 encoded
// source sequences, with one buffer holding every distinct UTF-8 target
// followed by a NUL byte; each trie value is the byte offset of its target
// inside that buffer.
//
// Returns an error when `output` is null, `chars_map` is empty, a key is
// empty, a key or target is not structurally valid UTF-8, the trie cannot be
// built, or the prefix count found for some key is not below
// Normalizer::kMaxTrieResultsSize.
util::Status Builder::CompileCharsMap(const CharsMap &chars_map,
                                      std::string *output) {
  CHECK_OR_RETURN(output);
  CHECK_OR_RETURN(!chars_map.empty());

  LOG(INFO) << "Loading CharsMap of size=" << chars_map.size();

  // Aggregates the same target strings to save footprint.
  // The offsets are placeholders here and are filled in by the next loop.
  std::map<Chars, int> normalized2pos;
  for (const auto &p : chars_map) {
    normalized2pos[p.second] = 0;
  }

  // Concatenates every distinct target as NUL-terminated UTF-8 and records
  // where each one starts.
  std::string normalized;
  for (auto &p : normalized2pos) {
    p.second = normalized.size();  // stores the pointer (position).
    const std::string utf8_out = string_util::UnicodeTextToUTF8(p.first);
    CHECK_OR_RETURN(string_util::IsStructurallyValid(utf8_out));
    normalized += utf8_out;
    normalized += '\0';
  }

  std::vector<std::pair<std::string, int>> kv;  // key-value of Trie.
  for (const auto &p : chars_map) {
    // The value of Trie stores the pointer to the normalized string.
    const std::string utf8_in = string_util::UnicodeTextToUTF8(p.first);
    CHECK_OR_RETURN(!utf8_in.empty());
    CHECK_OR_RETURN(string_util::IsStructurallyValid(utf8_in));
    kv.emplace_back(utf8_in, port::FindOrDie(normalized2pos, p.second));
  }

  // Sorts the pairs by key and splits them into the parallel key/value arrays
  // handed to the trie builder. `key` points into `kv`, which stays alive
  // until the end of this function.
  std::sort(kv.begin(), kv.end());
  std::vector<const char *> key(kv.size());
  std::vector<int> value(kv.size());
  for (size_t i = 0; i < kv.size(); ++i) {
    key[i] = kv[i].first.c_str();
    value[i] = kv[i].second;
  }

  Darts::DoubleArray trie;
  CHECK_EQ_OR_RETURN(0, trie.build(key.size(), const_cast<char **>(&key[0]),
                                   nullptr, &value[0]))
      << "cannot build double-array";

  // Runs a common-prefix search for every key and keeps the largest result
  // count, so the limit below can be enforced at build time.
  int max_nodes_size = 0;
  std::vector<Darts::DoubleArray::result_pair_type> results(
      2 * Normalizer::kMaxTrieResultsSize);
  for (const char *str : key) {
    const int num_nodes = trie.commonPrefixSearch(str, results.data(),
                                                  results.size(), strlen(str));
    max_nodes_size = std::max(num_nodes, max_nodes_size);
  }
  CHECK_LT_OR_RETURN(max_nodes_size, Normalizer::kMaxTrieResultsSize)
      << "This charmaps contain many shared prefix. "
      << "The number of shared prefix must be less than "
      << Normalizer::kMaxTrieResultsSize;

  // Views the raw trie array as bytes and packs it together with the target
  // buffer into the final blob.
  absl::string_view trie_blob(static_cast<const char *>(trie.array()),
                              trie.size() * trie.unit_size());
  *output = Normalizer::EncodePrecompiledCharsMap(trie_blob, normalized);

  LOG(INFO) << "Generated normalizer blob. size=" << output->size();

  return util::OkStatus();
}

// static
// Reconstructs a CharsMap from a precompiled `blob`.
//
// `chars_map` must be non-null; it is cleared first and then filled with one
// entry per key stored in the trie. Returns the error reported by
// Normalizer::DecodePrecompiledCharsMap when the blob cannot be decoded.
util::Status Builder::DecompileCharsMap(absl::string_view blob,
                                        Builder::CharsMap *chars_map) {
  CHECK_OR_RETURN(chars_map);
  chars_map->clear();

  // Splits the blob into the trie bytes and the buffer of normalized strings.
  absl::string_view trie_blob, normalized;
  std::string buf;
  RETURN_IF_ERROR(Normalizer::DecodePrecompiledCharsMap(blob, &trie_blob,
                                                        &normalized, &buf));

  // Points the trie at the decoded bytes; the element count is the byte size
  // divided by the trie's unit size.
  Darts::DoubleArray trie;
  trie.set_array(const_cast<char *>(trie_blob.data()),
                 trie_blob.size() / trie.unit_size());

  // `key` holds the byte path from the root to the node being visited; the
  // recursive lambda below pushes and pops one byte per level.
  std::string key;
  std::function<void(size_t, size_t)> traverse;

  // Given a Trie node at `node_pos` and the key position at `key_position`,
  // Expands children nodes from `node_pos`.
  // When leaf nodes are found, stores them into `chars_map`.
  traverse = [&traverse, &key, &trie, &normalized, &chars_map](
                 size_t node_pos, size_t key_pos) -> void {
    // Tries every possible next byte value.
    for (int c = 0; c <= 255; ++c) {
      key.push_back(static_cast<char>(c));
      size_t copied_node_pos = node_pos;
      size_t copied_key_pos = key_pos;
      // Note: `copied_(node|key)_pos` are non-const references.
      // They store the new positions after node traversal.
      const Darts::DoubleArray::result_type result = trie.traverse(
          key.data(), copied_node_pos, copied_key_pos, key.size());
      if (result >= -1) {   // node exists.
        if (result >= 0) {  // has a value after transition.
          // `result` is a byte offset into `normalized`; building the view
          // from a bare pointer reads up to the next NUL byte.
          const absl::string_view value = normalized.data() + result;
          Chars key_chars, value_chars;
          for (const auto c : string_util::UTF8ToUnicodeText(key))
            key_chars.push_back(c);
          for (const auto c : string_util::UTF8ToUnicodeText(value))
            value_chars.push_back(c);
          (*chars_map)[key_chars] = value_chars;
        }
        // Recursively traverse.
        traverse(copied_node_pos, copied_key_pos);
      }
      // Restores `key` before trying the next byte value.
      key.pop_back();
    }
  };

  // Starts from the root node with an empty key.
  traverse(0, 0);

  return util::OkStatus();
}

// static
// Looks up the precompiled normalization rule called `name` and copies its
// blob into `output`.
//
// The special name "identity" yields an empty blob. Returns a kNotFound
// status when no entry of kNormalizationRules_blob carries that name.
// `output` must be non-null.
util::Status Builder::GetPrecompiledCharsMap(absl::string_view name,
                                             std::string *output) {
  CHECK_OR_RETURN(output);

  if (name == "identity") {
    output->clear();
    return util::OkStatus();
  }

  // Linear scan over the built-in rule table.
  for (size_t i = 0; i < kNormalizationRules_size; ++i) {
    const auto *blob = &kNormalizationRules_blob[i];
    if (blob->name == name) {
      output->assign(blob->data, blob->size);
      return util::OkStatus();
    }
  }
  return util::StatusBuilder(util::StatusCode::kNotFound, GTL_LOC)
         << "No precompiled charsmap is found: " << name;
}

#ifdef ENABLE_NFKC_COMPILE
namespace {
// Builds a composed-form normalization map and stores it in `chars_map`.
//
// `composer` maps a sequence to its composed form (e.g. NFKC) and
// `decomposer` maps it to the matching decomposed form (e.g. NFKD). The map
// is assembled in two passes:
//   1. every single code point whose composed form differs from itself;
//   2. every sequence that decomposes to one of the multi-character
//      decompositions seen in pass 1 and whose composed form differs from it.
// Redundant rules are then dropped with Builder::RemoveRedundantMap().
//
// Returns an error when `chars_map` is null or RemoveRedundantMap() fails.
// `chars_map` is only overwritten on success.
util::Status BuildMapInternal(
    Builder::CharsMap *chars_map,
    std::function<Builder::Chars(const Builder::Chars &)> composer,
    std::function<Builder::Chars(const Builder::Chars &)> decomposer) {
  CHECK_OR_RETURN(chars_map);

  // Set of fully decomposed multi-character sequences.
  std::set<Builder::Chars> nfkd_decomposed;

  // Fully normalized one character to unnormalized one character map.
  std::map<char32, std::set<char32>> norm2orig;

  Builder::CharsMap nfkc_map;  // The final composed mapping.

  // Named differently from the file-level kMaxUnicode so it does not shadow
  // it, and typed like the loop variable it is compared against.
  constexpr char32 kLastCodePoint = 0x10FFFF;
  for (char32 cp = 1; cp <= kLastCodePoint; ++cp) {
    if (!U_IS_UNICODE_CHAR(cp)) {
      continue;
    }
    // Aggregates single character to fully composed characters.
    const auto nfkc = composer({cp});
    if (nfkc.size() >= 2 || (nfkc.size() == 1 && nfkc[0] != cp)) {
      nfkc_map[{cp}] = nfkc;
    }
    const auto nfkd = decomposer({cp});
    if (nfkd.size() == 1) {
      // Aggregates reverse mapping from normalized to unnormalized character.
      norm2orig[nfkd[0]].insert(cp);
    } else {
      // One character is decomposed into multiple characters.
      nfkd_decomposed.insert(nfkd);
    }
  }

  for (const auto &nfkd : nfkd_decomposed) {
    const auto nfkc = composer(nfkd);
    // This case is already covered by single-character to composed mapping.
    if (nfkc == nfkd) {
      continue;
    }
    // Expand all possible sequences which are normalized into the same
    // `nfkd`.
    for (const auto &nfkd_orig : ExpandUnnormalized(nfkd, norm2orig)) {
      if (nfkd_orig != nfkc) {
        nfkc_map[nfkd_orig] = nfkc;
      }
    }
  }

  RETURN_IF_ERROR(Builder::RemoveRedundantMap(&nfkc_map));
  *chars_map = std::move(nfkc_map);
  return util::OkStatus();
}
}  // namespace
#endif  // ENABLE_NFKC_COMPILE

// static
// Builds the NFKC normalization map into `chars_map`.
//
// Returns the error from the underlying build when it fails. When the file is
// compiled without ENABLE_NFKC_COMPILE this only logs kCompileError, leaves
// `chars_map` untouched and returns OK.
util::Status Builder::BuildNFKCMap(CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  LOG(INFO) << "Running BuildNFKCMap";
  // Propagate failures instead of discarding the status and reporting OK.
  RETURN_IF_ERROR(BuildMapInternal(chars_map, ToNFKC, ToNFKD));
#else
  LOG(ERROR) << kCompileError;
#endif

  return util::OkStatus();
}

// static
// Builds the NFC normalization map into `chars_map`.
//
// Returns the error from the underlying build when it fails. When the file is
// compiled without ENABLE_NFKC_COMPILE this only logs kCompileError, leaves
// `chars_map` untouched and returns OK.
util::Status Builder::BuildNFCMap(CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  LOG(INFO) << "Running BuildNFCMap";
  // Propagate failures instead of discarding the status and reporting OK.
  RETURN_IF_ERROR(BuildMapInternal(chars_map, ToNFC, ToNFD));
#else
  LOG(ERROR) << kCompileError;
#endif
  return util::OkStatus();
}

// Builds the NMT variant of the NFKC map: the NFKC rules, overlaid with the
// rules from MergeNmtMap(), with redundant rules removed afterwards.
// `chars_map` is only written when every step succeeds. Without
// ENABLE_NFKC_COMPILE this only logs kCompileError and returns OK.
util::Status Builder::BuildNmtNFKCMap(CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  LOG(INFO) << "Running BuildNmtNFKCMap";

  CharsMap nmt_rules;
  RETURN_IF_ERROR(Builder::BuildNFKCMap(&nmt_rules));
  RETURN_IF_ERROR(Builder::MergeNmtMap(&nmt_rules));
  RETURN_IF_ERROR(Builder::RemoveRedundantMap(&nmt_rules));

  *chars_map = std::move(nmt_rules);
#else
  LOG(ERROR) << kCompileError;
#endif

  return util::OkStatus();
}

// static
// Merges Unicode default case folding into `chars_map`, in place.
//
// First the target of every existing rule is case-folded character by
// character. Then, for every Unicode character that has no single-character
// rule yet and whose case-folded form differs from itself, a rule
// `cp -> fold(cp)` is added. Finally redundant rules are removed.
//
// Returns an error when `chars_map` is null or RemoveRedundantMap() fails.
// Without ENABLE_NFKC_COMPILE this is a no-op that returns OK.
util::Status Builder::MergeUnicodeCaseFoldMap(Builder::CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  CHECK_OR_RETURN(chars_map);

  // Case-fold the target side of every existing rule.
  for (auto &rule : *chars_map) {
    std::vector<char32> folded;
    folded.reserve(rule.second.size());
    for (const char32 ch : rule.second) {
      folded.push_back(u_foldCase(ch, U_FOLD_CASE_DEFAULT));
    }
    rule.second = std::move(folded);
  }

  // Named differently from the file-level kMaxUnicode so it does not shadow
  // it, and typed like the loop variable it is compared against.
  constexpr char32 kLastCodePoint = 0x10FFFF;
  for (char32 cp = 1; cp <= kLastCodePoint; ++cp) {
    if (!U_IS_UNICODE_CHAR(cp)) {
      continue;
    }
    // Existing single-character rules take precedence over plain folding.
    if (chars_map->find({cp}) != chars_map->end()) continue;
    const char32 trg = u_foldCase(cp, U_FOLD_CASE_DEFAULT);
    if (trg != cp) (*chars_map)[{cp}] = {trg};
  }

  RETURN_IF_ERROR(RemoveRedundantMap(chars_map));
#endif

  return util::OkStatus();
}

// static
// Overlays the NMT-specific rules onto `chars_map`, in place:
//   * a fixed list of whitespace-like code points is mapped to U+0020;
//   * a fixed list of control characters is mapped to the empty sequence
//     (i.e. deleted);
//   * any rule for U+FF5E (FULLWIDTH TILDE) is removed.
// Existing rules for the listed code points are overwritten.
//
// Returns an error when `chars_map` is null.
util::Status Builder::MergeNmtMap(Builder::CharsMap *chars_map) {
  CHECK_OR_RETURN(chars_map);

  // Other code points considered as whitespace.
  (*chars_map)[{0x0009}] = {0x20};  // TAB
  (*chars_map)[{0x000A}] = {0x20};  // LINE FEED
  (*chars_map)[{0x000C}] = {0x20};  // FORM FEED
  (*chars_map)[{0x000D}] = {0x20};  // CARRIAGE RETURN
  (*chars_map)[{0x1680}] = {0x20};  // OGHAM SPACE MARK
  (*chars_map)[{0x200B}] = {0x20};  // ZERO WIDTH SPACE
  (*chars_map)[{0x200E}] = {0x20};  // LEFT-TO-RIGHT MARK
  (*chars_map)[{0x200F}] = {0x20};  // RIGHT-TO-LEFT MARK
  (*chars_map)[{0x2028}] = {0x20};  // LINE SEPARATOR
  (*chars_map)[{0x2029}] = {0x20};  // PARAGRAPH SEPARATOR
  (*chars_map)[{0x2581}] = {0x20};  // LOWER ONE EIGHTH BLOCK
  (*chars_map)[{0xFEFF}] = {0x20};  // ZERO WIDTH NO-BREAK SPACE
  (*chars_map)[{0xFFFD}] = {0x20};  // REPLACEMENT CHARACTER
  (*chars_map)[{0x200C}] = {0x20};  // ZERO WIDTH NON-JOINER
  //  (*chars_map)[{0x200D}] = {0x20};  // ZERO WIDTH JOINER

  // Ascii Control characters (deleted). TAB, LF, FF and CR are handled as
  // whitespace above and therefore are not listed here.
  (*chars_map)[{0x0001}] = {};
  (*chars_map)[{0x0002}] = {};
  (*chars_map)[{0x0003}] = {};
  (*chars_map)[{0x0004}] = {};
  (*chars_map)[{0x0005}] = {};
  (*chars_map)[{0x0006}] = {};
  (*chars_map)[{0x0007}] = {};
  (*chars_map)[{0x0008}] = {};
  (*chars_map)[{0x000B}] = {};
  (*chars_map)[{0x000E}] = {};
  (*chars_map)[{0x000F}] = {};
  (*chars_map)[{0x0010}] = {};
  (*chars_map)[{0x0011}] = {};
  (*chars_map)[{0x0012}] = {};
  (*chars_map)[{0x0013}] = {};
  (*chars_map)[{0x0014}] = {};
  (*chars_map)[{0x0015}] = {};
  (*chars_map)[{0x0016}] = {};
  (*chars_map)[{0x0017}] = {};
  (*chars_map)[{0x0018}] = {};
  (*chars_map)[{0x0019}] = {};
  (*chars_map)[{0x001A}] = {};
  (*chars_map)[{0x001B}] = {};
  (*chars_map)[{0x001C}] = {};
  (*chars_map)[{0x001D}] = {};
  (*chars_map)[{0x001E}] = {};
  (*chars_map)[{0x001F}] = {};

  //  <control-007F>..<control-009F>
  // NOTE(review): only U+007F, U+008F and U+009F are mapped, not the whole
  // range named above. Left as is because changing it would alter the
  // generated rules; confirm whether the gap is intentional.
  (*chars_map)[{0x007F}] = {};
  (*chars_map)[{0x008F}] = {};
  (*chars_map)[{0x009F}] = {};

  // Do not normalize FULL_WIDTH TILDE, since FULL_WIDTH TILDE
  // and HALF_WIDTH TILDE are used differently in Japanese.
  (*chars_map).erase({0xFF5E});

  return util::OkStatus();
}

// static
// Builds the NFKC map and merges Unicode case folding into it. `chars_map`
// is only written when both steps succeed. Without ENABLE_NFKC_COMPILE this
// only logs kCompileError and returns OK.
util::Status Builder::BuildNFKC_CFMap(CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  CharsMap folded_rules;
  RETURN_IF_ERROR(BuildNFKCMap(&folded_rules));
  RETURN_IF_ERROR(MergeUnicodeCaseFoldMap(&folded_rules));
  *chars_map = std::move(folded_rules);
#else
  LOG(ERROR) << kCompileError;
#endif

  return util::OkStatus();
}

// static
// Builds the NMT variant of the NFKC map and merges Unicode case folding
// into it. `chars_map` is only written when both steps succeed. Without
// ENABLE_NFKC_COMPILE this only logs kCompileError and returns OK.
util::Status Builder::BuildNmtNFKC_CFMap(CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  CharsMap folded_rules;
  RETURN_IF_ERROR(BuildNmtNFKCMap(&folded_rules));
  RETURN_IF_ERROR(MergeUnicodeCaseFoldMap(&folded_rules));
  *chars_map = std::move(folded_rules);
#else
  LOG(ERROR) << kCompileError;
#endif

  return util::OkStatus();
}

// static
// Adds one rule `cp -> NFKD(cp)` to `chars_map` for every Unicode character
// whose NFKD form is non-empty and differs from the character itself.
// Entries already present in `chars_map` are kept unless overwritten.
// Without ENABLE_NFKC_COMPILE this only logs kCompileError and returns OK.
util::Status Builder::BuildNFKDMap(CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  constexpr int kLastCodePoint = 0x10FFFF;
  for (char32 cp = 1; cp <= kLastCodePoint; ++cp) {
    if (U_IS_UNICODE_CHAR(cp)) {
      const auto decomposed = ToNFKD({cp});
      // A character that decomposes to itself needs no rule.
      const bool is_identity = decomposed.size() == 1 && decomposed[0] == cp;
      if (!decomposed.empty() && !is_identity) {
        (*chars_map)[{cp}] = decomposed;
      }
    }
  }
#else
  LOG(ERROR) << kCompileError;
#endif
  return util::OkStatus();
}

// static
// Adds one rule `cp -> NFD(cp)` to `chars_map` for every Unicode character
// whose NFD form is non-empty and differs from the character itself.
// Entries already present in `chars_map` are kept unless overwritten.
// Without ENABLE_NFKC_COMPILE this only logs kCompileError and returns OK.
util::Status Builder::BuildNFDMap(CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  constexpr int kLastCodePoint = 0x10FFFF;
  for (char32 cp = 1; cp <= kLastCodePoint; ++cp) {
    if (U_IS_UNICODE_CHAR(cp)) {
      const auto decomposed = ToNFD({cp});
      // A character that decomposes to itself needs no rule.
      const bool is_identity = decomposed.size() == 1 && decomposed[0] == cp;
      if (!decomposed.empty() && !is_identity) {
        (*chars_map)[{cp}] = decomposed;
      }
    }
  }

#else
  LOG(ERROR) << kCompileError;
#endif
  return util::OkStatus();
}

// static
// Builds the NFKD map and merges Unicode case folding into it. `chars_map`
// is only written when both steps succeed. Without ENABLE_NFKC_COMPILE this
// only logs kCompileError and returns OK.
util::Status Builder::BuildNFKD_CFMap(CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  CharsMap folded_rules;
  RETURN_IF_ERROR(BuildNFKDMap(&folded_rules));
  RETURN_IF_ERROR(MergeUnicodeCaseFoldMap(&folded_rules));
  *chars_map = std::move(folded_rules);
#else
  LOG(ERROR) << kCompileError;
#endif
  return util::OkStatus();
}

// static
// Builds the NFC map and merges Unicode case folding into it. `chars_map` is
// only written when both steps succeed. Without ENABLE_NFKC_COMPILE this only
// logs kCompileError and returns OK.
util::Status Builder::BuildNFC_CFMap(CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  CharsMap nfc_map;
  // Start from the NFC rules; the NFKD builder was called here by mistake,
  // which produced a decomposed map under the NFC_CF name.
  RETURN_IF_ERROR(Builder::BuildNFCMap(&nfc_map));
  RETURN_IF_ERROR(Builder::MergeUnicodeCaseFoldMap(&nfc_map));
  *chars_map = std::move(nfc_map);
#else
  LOG(ERROR) << kCompileError;
#endif
  return util::OkStatus();
}

// static
// Builds the NFD map and merges Unicode case folding into it. `chars_map` is
// only written when both steps succeed. Without ENABLE_NFKC_COMPILE this only
// logs kCompileError and returns OK.
util::Status Builder::BuildNFD_CFMap(CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  CharsMap folded_rules;
  RETURN_IF_ERROR(BuildNFDMap(&folded_rules));
  RETURN_IF_ERROR(MergeUnicodeCaseFoldMap(&folded_rules));
  *chars_map = std::move(folded_rules);
#else
  LOG(ERROR) << kCompileError;
#endif
  return util::OkStatus();
}

// static
// Loads a normalization map from the TSV file `filename` into `chars_map`.
//
// Each line holds tab-separated fields: the first is the source sequence and
// the second the target sequence, both written as space-separated hex code
// points with an optional "U+" prefix. Further fields are ignored. A line
// with a single field is a deletion rule (empty target).
//
// `chars_map` is cleared before loading. Returns an error when `chars_map`
// is null, the file cannot be opened, or a line has an empty source.
util::Status Builder::LoadCharsMap(absl::string_view filename,
                                   CharsMap *chars_map) {
  // Stream the view itself: `filename.data()` is not guaranteed to be
  // NUL-terminated.
  LOG(INFO) << "Loading mapping file: " << filename;
  CHECK_OR_RETURN(chars_map);

  auto input = filesystem::NewReadableFile(filename);

  RETURN_IF_ERROR(input->status());

  // Parses one field of space-separated hex code points. Empty tokens
  // (e.g. from repeated spaces) are skipped.
  const auto parse_code_points = [](absl::string_view field) {
    std::vector<char32> code_points;
    for (auto token : absl::StrSplit(field, ' ')) {
      if (token.empty()) continue;
      absl::ConsumePrefix(&token, "U+");
      code_points.push_back(string_util::HexToInt<char32>(token));
    }
    return code_points;
  };

  std::string line;
  chars_map->clear();
  while (input->ReadLine(&line)) {
    std::vector<std::string> fields =
        absl::StrSplit(line, '\t', absl::AllowEmpty());
    // Report a malformed split as an error instead of aborting the process.
    CHECK_OR_RETURN(!fields.empty());
    if (fields.size() == 1) fields.push_back("");  // Deletion rule.
    const std::vector<char32> src = parse_code_points(fields[0]);
    std::vector<char32> trg = parse_code_points(fields[1]);
    CHECK_OR_RETURN(!src.empty());
    (*chars_map)[src] = std::move(trg);
  }

  return util::OkStatus();
}

// static
// Writes `chars_map` to the TSV file `filename`, one rule per line:
//
//   <source hex code points>\t<target hex code points>\t# <source> => <target>
//
// Code points are separated by spaces; the trailing comment shows both
// sequences as UTF-8 text. Backspace, vertical tab, form feed, line feed and
// carriage return characters in the line are replaced with spaces so that
// every rule stays on one line.
//
// Returns an error when the file cannot be opened for writing.
util::Status Builder::SaveCharsMap(absl::string_view filename,
                                   const Builder::CharsMap &chars_map) {
  auto output = filesystem::NewWritableFile(filename);
  RETURN_IF_ERROR(output->status());

  for (const auto &c : chars_map) {
    std::vector<std::string> src, trg;
    for (char32 v : c.first) {
      src.push_back(string_util::IntToHex(v));
    }
    for (char32 v : c.second) {
      trg.push_back(string_util::IntToHex(v));
    }
    std::string line = absl::StrJoin(src, " ") + "\t" +
                       absl::StrJoin(trg, " ") + "\t# " +
                       string_util::UnicodeTextToUTF8(c.first) + " => " +
                       string_util::UnicodeTextToUTF8(c.second);
    line = absl::StrReplaceAll(
        line,
        {{"\b", " "}, {"\v", " "}, {"\f", " "}, {"\n", " "}, {"\r", " "}});
    output->WriteLine(line);
  }

  return util::OkStatus();
}

// static
// Shrinks `chars_map` in place by dropping every multi-character rule whose
// result is already produced by applying the shorter rules that were kept.
//
// Returns an error when `chars_map` is null, when it has no rule with a
// non-empty key, or when the reduced map fails to reproduce the target of
// one of the original rules. `chars_map` is only replaced on success.
util::Status Builder::RemoveRedundantMap(CharsMap *chars_map) {
  CHECK_OR_RETURN(chars_map);

  // Single-character rules are always kept; also record the longest key.
  CharsMap reduced;
  size_t longest_key = 0;
  for (const auto &rule : *chars_map) {
    if (rule.first.size() > longest_key) longest_key = rule.first.size();
    if (rule.first.size() == 1) reduced.insert(rule);
  }
  CHECK_GT_OR_RETURN(longest_key, 0);

  // Visit the remaining rules by increasing key length. A rule is kept only
  // when the rules with strictly shorter keys do not already yield its target.
  for (size_t key_len = 2; key_len <= longest_key; ++key_len) {
    for (const auto &rule : *chars_map) {
      if (rule.first.size() != key_len) continue;
      if (rule.second == Normalize(reduced, rule.first, key_len - 1)) continue;
      reduced.insert(rule);
    }
  }

  // Sanity check: the reduced map must reproduce every original rule.
  for (const auto &rule : *chars_map) {
    CHECK_EQ_OR_RETURN(rule.second,
                       Normalize(reduced, rule.first, longest_key));
  }

  *chars_map = std::move(reduced);

  return util::OkStatus();
}
}  // namespace normalizer
}  // namespace sentencepiece