Spaces:

nvidia
/

Plan2Align-NV

Sleeping

Plan2Align-NV / laser /tools-external /sentencepiece-master /src /builder.cc

KuangDW

add laser tool

2aebc50 26 days ago

21.2 kB

	// Copyright 2016 Google Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "builder.h"

	#include <algorithm>
	#include <functional>
	#include <utility>

	#include "filesystem.h"
	#include "third_party/absl/strings/str_join.h"
	#include "third_party/absl/strings/str_replace.h"
	#include "third_party/absl/strings/str_split.h"
	#include "third_party/absl/strings/strip.h"

	#ifdef ENABLE_NFKC_COMPILE
	#include <unicode/errorcode.h>
	#include <unicode/locid.h>
	#include <unicode/normlzr.h>
	#include <unicode/numfmt.h>
	#include <unicode/rbnf.h>
	#include <unicode/utypes.h>
	#endif // ENABLE_NFKC_COMPILE

	#include <set>

	#include "normalization_rule.h"
	#include "normalizer.h"
	#include "third_party/darts_clone/darts.h"
	#include "util.h"

	namespace sentencepiece {
	namespace normalizer {
	namespace {

	// Largest code point visited when the builders below enumerate Unicode.
	constexpr int kMaxUnicode = 0x10FFFF;

	// Name of the default normalization rule.
	// NOTE(review): not referenced anywhere in the visible part of this file.
	static constexpr absl::string_view kDefaultNormalizerName = "nfkc";

	#ifndef ENABLE_NFKC_COMPILE
	// Message logged by the Build*Map functions when the library was built
	// without ICU support (ENABLE_NFKC_COMPILE undefined).
	static constexpr absl::string_view kCompileError =
	"NFK compile is not enabled. rebuild with -DSPM_ENABLE_NFKC_COMPILE=ON";
	#endif

	#ifdef ENABLE_NFKC_COMPILE
	// Runs ICU normalization in the given `mode` over `input`.
	//
	// `input` is converted to UTF-8, handed to icu::Normalizer::normalize, and
	// the result is converted back to a code point sequence. Aborts (CHECK) when
	// `input` encodes to an empty string or when ICU reports a failure.
	Builder::Chars UnicodeNormalize(UNormalizationMode mode,
	                                const Builder::Chars &input) {
	  const std::string input_utf8 = string_util::UnicodeTextToUTF8(input);
	  CHECK(!input_utf8.empty());

	  const icu::UnicodeString source =
	      icu::UnicodeString::fromUTF8(input_utf8.c_str());
	  icu::UnicodeString converted;
	  UErrorCode icu_status = U_ZERO_ERROR;
	  icu::Normalizer::normalize(source, mode, 0, converted, icu_status);
	  CHECK(U_SUCCESS(icu_status));

	  // Pre-size the output buffer at three bytes per UTF-16 unit.
	  std::string output_utf8;
	  output_utf8.reserve(converted.length() * 3);
	  converted.toUTF8String(output_utf8);
	  return string_util::UTF8ToUnicodeText(output_utf8);
	}

	// Returns `input` normalized with ICU's UNORM_NFKD mode.
	Builder::Chars ToNFKD(const Builder::Chars &input) {
	  const UNormalizationMode mode = UNORM_NFKD;
	  return UnicodeNormalize(mode, input);
	}

	// Returns `input` normalized with ICU's UNORM_NFKC mode.
	Builder::Chars ToNFKC(const Builder::Chars &input) {
	  const UNormalizationMode mode = UNORM_NFKC;
	  return UnicodeNormalize(mode, input);
	}

	// Returns `input` normalized with ICU's UNORM_NFC mode.
	Builder::Chars ToNFC(const Builder::Chars &input) {
	  const UNormalizationMode mode = UNORM_NFC;
	  return UnicodeNormalize(mode, input);
	}

	// Returns `input` normalized with ICU's UNORM_NFD mode.
	Builder::Chars ToNFD(const Builder::Chars &input) {
	  const UNormalizationMode mode = UNORM_NFD;
	  return UnicodeNormalize(mode, input);
	}

	// Enumerates every un-normalized sequence that decomposes to `nfkd`.
	//
	// `norm2orig` maps a normalized character to the set of original characters
	// that normalize to it. The result is the cross product of those sets, taken
	// position by position over `nfkd`. Aborts when `nfkd` is empty or when one
	// of its characters has no entry in `norm2orig`.
	std::vector<Builder::Chars> ExpandUnnormalized(
	    const Builder::Chars &nfkd,
	    const std::map<char32, std::set<char32>> &norm2orig) {
	  CHECK(!nfkd.empty());

	  // Start from a single empty sequence and extend it one position at a time.
	  std::vector<Builder::Chars> expansions(1);
	  for (const auto normalized_char : nfkd) {
	    const auto &sources = port::FindOrDie(norm2orig, normalized_char);
	    std::vector<Builder::Chars> extended;
	    extended.reserve(expansions.size() * sources.size());
	    for (const auto &prefix : expansions) {
	      for (const auto source : sources) {
	        Builder::Chars candidate = prefix;
	        candidate.push_back(source);
	        extended.push_back(std::move(candidate));
	      }
	    }
	    expansions.swap(extended);
	  }

	  CHECK_EQ(nfkd.size(), expansions[0].size());
	  return expansions;
	}
	#endif // ENABLE_NFKC_COMPILE

	// Applies the rewrite rules in `chars_map` to `src` and returns the result.
	//
	// At each position the longest key of at most `max_len` characters that is
	// present in `chars_map` wins; its replacement is appended and the matched
	// characters are skipped. When nothing matches, a single character is copied
	// through unchanged. `max_len` must be at least 1.
	Builder::Chars Normalize(const Builder::CharsMap &chars_map,
	                         const Builder::Chars &src, int max_len) {
	  CHECK_GE(max_len, 1);
	  Builder::Chars normalized;

	  size_t pos = 0;
	  while (pos < src.size()) {
	    const size_t window_end = std::min<size_t>(pos + max_len, src.size());

	    // Try the widest window first, shrinking it from the right until a rule
	    // matches or nothing is left.
	    Builder::Chars candidate(src.begin() + pos, src.begin() + window_end);
	    auto match = chars_map.end();
	    for (; !candidate.empty(); candidate.pop_back()) {
	      match = chars_map.find(candidate);
	      if (match != chars_map.end()) {
	        break;
	      }
	    }

	    if (match != chars_map.end()) {
	      normalized.insert(normalized.end(), match->second.begin(),
	                        match->second.end());
	      pos += match->first.size();
	    } else {
	      // No rule applies here: pass one character through.
	      normalized.push_back(src[pos]);
	      ++pos;
	    }
	  }

	  return normalized;
	}
	} // namespace

	// Serializes `chars_map` into the precompiled blob format and stores it in
	// `*output`.
	//
	// The blob is produced by Normalizer::EncodePrecompiledCharsMap from two
	// parts: a Darts double-array trie keyed by the UTF-8 form of each source
	// sequence, and a buffer holding every distinct target sequence as
	// NUL-terminated UTF-8. Each trie value is the byte offset of its target
	// inside that buffer.
	//
	// Returns an error when `output` is null, `chars_map` is empty, a key or
	// value is not structurally valid UTF-8, a key is empty, the trie cannot be
	// built, or some key has too many prefixes that are themselves keys.
	// static
	util::Status Builder::CompileCharsMap(const CharsMap &chars_map,
	                                      std::string *output) {
	  CHECK_OR_RETURN(output);
	  CHECK_OR_RETURN(!chars_map.empty());

	  LOG(INFO) << "Loading CharsMap of size=" << chars_map.size();

	  // Collect the distinct targets so identical ones are stored only once.
	  std::map<Chars, int> target_offset;
	  for (const auto &rule : chars_map) {
	    target_offset.emplace(rule.second, 0);
	  }

	  // Lay the targets out back to back and remember where each one starts.
	  std::string normalized;
	  for (auto &entry : target_offset) {
	    const std::string target_utf8 = string_util::UnicodeTextToUTF8(entry.first);
	    CHECK_OR_RETURN(string_util::IsStructurallyValid(target_utf8));
	    entry.second = normalized.size();
	    normalized.append(target_utf8);
	    normalized.push_back('\0');
	  }

	  // Trie entries: UTF-8 source key -> offset of its target.
	  std::vector<std::pair<std::string, int>> trie_entries;
	  trie_entries.reserve(chars_map.size());
	  for (const auto &rule : chars_map) {
	    const std::string source_utf8 = string_util::UnicodeTextToUTF8(rule.first);
	    CHECK_OR_RETURN(!source_utf8.empty());
	    CHECK_OR_RETURN(string_util::IsStructurallyValid(source_utf8));
	    trie_entries.emplace_back(source_utf8,
	                              port::FindOrDie(target_offset, rule.second));
	  }

	  // The keys are handed to the trie builder in sorted order.
	  std::sort(trie_entries.begin(), trie_entries.end());
	  std::vector<const char *> keys;
	  std::vector<int> values;
	  keys.reserve(trie_entries.size());
	  values.reserve(trie_entries.size());
	  for (const auto &entry : trie_entries) {
	    keys.push_back(entry.first.c_str());
	    values.push_back(entry.second);
	  }

	  Darts::DoubleArray trie;
	  CHECK_EQ_OR_RETURN(0, trie.build(keys.size(),
	                                   const_cast<char **>(keys.data()), nullptr,
	                                   values.data()))
	      << "cannot build double-array";

	  // Measure the largest number of prefix matches any key produces.
	  std::vector<Darts::DoubleArray::result_pair_type> prefix_hits(
	      2 * Normalizer::kMaxTrieResultsSize);
	  int max_nodes_size = 0;
	  for (size_t i = 0; i < keys.size(); ++i) {
	    const int num_nodes = trie.commonPrefixSearch(
	        keys[i], prefix_hits.data(), prefix_hits.size(), strlen(keys[i]));
	    if (num_nodes > max_nodes_size) {
	      max_nodes_size = num_nodes;
	    }
	  }
	  CHECK_LT_OR_RETURN(max_nodes_size, Normalizer::kMaxTrieResultsSize)
	      << "This charmaps contain many shared prefix. "
	      << "The number of shared prefix must be less than "
	      << Normalizer::kMaxTrieResultsSize;

	  absl::string_view trie_blob(static_cast<const char *>(trie.array()),
	                              trie.size() * trie.unit_size());
	  *output = Normalizer::EncodePrecompiledCharsMap(trie_blob, normalized);

	  LOG(INFO) << "Generated normalizer blob. size=" << output->size();

	  return util::OkStatus();
	}

	// Reconstructs a CharsMap from a precompiled `blob`.
	//
	// `blob` is split by Normalizer::DecodePrecompiledCharsMap into a trie part
	// and a buffer of normalized strings. The trie is then walked byte by byte;
	// every key that carries a value is converted back to code points and stored
	// in `*chars_map` together with the string found at the value's offset in
	// the normalized buffer. `*chars_map` is cleared first.
	//
	// Returns an error when `chars_map` is null or the blob cannot be decoded.
	// static
	util::Status Builder::DecompileCharsMap(absl::string_view blob,
	Builder::CharsMap *chars_map) {
	CHECK_OR_RETURN(chars_map);
	chars_map->clear();

	absl::string_view trie_blob, normalized;
	std::string buf;
	RETURN_IF_ERROR(Normalizer::DecodePrecompiledCharsMap(blob, &trie_blob,
	&normalized, &buf));

	// Point the trie at the decoded bytes. The size passed is the number of
	// units, i.e. the byte size divided by the unit size.
	Darts::DoubleArray trie;
	trie.set_array(const_cast<char *>(trie_blob.data()),
	trie_blob.size() / trie.unit_size());

	// `key` holds the bytes of the path currently being explored; it is shared
	// by all recursion levels and grows/shrinks by one byte per level.
	std::string key;
	// Declared as std::function so the lambda can refer to itself.
	std::function<void(size_t, size_t)> traverse;

	// Given a Trie node at `node_pos` and the key position at `key_position`,
	// Expands children nodes from `node_pos`.
	// When leaf nodes are found, stores them into `chars_map`.
	traverse = [&traverse, &key, &trie, &normalized, &chars_map](
	size_t node_pos, size_t key_pos) -> void {
	// Try every possible next byte.
	for (int c = 0; c <= 255; ++c) {
	key.push_back(static_cast<char>(c));
	size_t copied_node_pos = node_pos;
	size_t copied_key_pos = key_pos;
	// Note: `copied_(node|key)_pos` are non-const references.
	// They store the new positions after node traversal.
	const Darts::DoubleArray::result_type result = trie.traverse(
	key.data(), copied_node_pos, copied_key_pos, key.size());
	if (result >= -1) { // node exists.
	if (result >= 0) { // has a value after transition.
	// The value is an offset into `normalized`; constructing the view from
	// a bare pointer reads up to the next NUL byte.
	// NOTE(review): `result` is not range-checked against
	// `normalized.size()` here -- confirm the blob is trusted.
	const absl::string_view value = normalized.data() + result;
	Chars key_chars, value_chars;
	for (const auto c : string_util::UTF8ToUnicodeText(key))
	key_chars.push_back(c);
	for (const auto c : string_util::UTF8ToUnicodeText(value))
	value_chars.push_back(c);
	(*chars_map)[key_chars] = value_chars;
	}
	// Recursively traverse.
	traverse(copied_node_pos, copied_key_pos);
	}
	// Undo the byte appended above before trying the next one.
	key.pop_back();
	}
	};

	// Start from the root node with an empty key.
	traverse(0, 0);

	return util::OkStatus();
	}

	// Looks up the built-in precompiled normalization blob called `name` and
	// copies it into `*output`.
	//
	// The special name "identity" yields an empty blob. Any other name is
	// searched for in the kNormalizationRules_blob table.
	//
	// Returns an error when `output` is null, and a kNotFound status when no
	// rule with that name exists.
	// static
	util::Status Builder::GetPrecompiledCharsMap(absl::string_view name,
	                                             std::string *output) {
	  CHECK_OR_RETURN(output);

	  if (name == "identity") {
	    output->clear();
	    return util::OkStatus();
	  }

	  for (size_t i = 0; i < kNormalizationRules_size; ++i) {
	    const auto *blob = &kNormalizationRules_blob[i];
	    if (blob->name == name) {
	      output->assign(blob->data, blob->size);
	      return util::OkStatus();
	    }
	  }
	  return util::StatusBuilder(util::StatusCode::kNotFound, GTL_LOC)
	         << "No precompiled charsmap is found: " << name;
	}

	#ifdef ENABLE_NFKC_COMPILE
	namespace {
	// Builds a composition map (e.g. NFKC or NFC) and stores it in `*chars_map`.
	//
	// `composer` maps a sequence to its composed normal form and `decomposer`
	// to its decomposed normal form. The resulting map contains:
	//   * every single code point whose composed form differs from itself, and
	//   * every multi-character sequence that decomposes to the same string as
	//     some single code point, unless it is already equal to the composed
	//     form.
	// Rules that are implied by shorter rules are dropped via
	// Builder::RemoveRedundantMap before the result is handed back.
	//
	// Returns an error when `chars_map` is null or when RemoveRedundantMap
	// fails.
	util::Status BuildMapInternal(
	    Builder::CharsMap *chars_map,
	    std::function<Builder::Chars(const Builder::Chars &)> composer,
	    std::function<Builder::Chars(const Builder::Chars &)> decomposer) {
	  CHECK_OR_RETURN(chars_map);

	  // Set of fully decomposed multi-character sequences.
	  std::set<Builder::Chars> nfkd_decomposed;

	  // Fully normalized one character to unnormalized one character map.
	  std::map<char32, std::set<char32>> norm2orig;

	  Builder::CharsMap nfkc_map;  // The final mapping.

	  for (char32 cp = 1; cp <= kMaxUnicode; ++cp) {
	    if (!U_IS_UNICODE_CHAR(cp)) {
	      continue;
	    }
	    // Aggregates single character to fully composed characters.
	    const auto nfkc = composer({cp});
	    if (nfkc.size() >= 2 || (nfkc.size() == 1 && nfkc[0] != cp)) {
	      nfkc_map[{cp}] = nfkc;
	    }
	    const auto nfkd = decomposer({cp});
	    if (nfkd.size() == 1) {
	      // Aggregates reverse mapping from normalized to unnormalized character.
	      norm2orig[nfkd[0]].insert(cp);
	    } else {
	      // One character is decomposed into multiple characters.
	      nfkd_decomposed.insert(nfkd);
	    }
	  }

	  for (const auto &nfkd : nfkd_decomposed) {
	    const auto nfkc = composer(nfkd);
	    // This case is already covered by single-character mapping.
	    if (nfkc == nfkd) {
	      continue;
	    }
	    // Expand all possible sequences which are normalized into the same
	    // `nfkd`.
	    for (const auto &nfkd_orig : ExpandUnnormalized(nfkd, norm2orig)) {
	      if (nfkd_orig != nfkc) {
	        nfkc_map[nfkd_orig] = nfkc;
	      }
	    }
	  }

	  RETURN_IF_ERROR(Builder::RemoveRedundantMap(&nfkc_map));
	  *chars_map = std::move(nfkc_map);
	  return util::OkStatus();
	}
	} // namespace
	#endif // ENABLE_NFKC_COMPILE

	// Builds the NFKC normalization map into `*chars_map`.
	//
	// Requires ENABLE_NFKC_COMPILE. Without it an error is logged, `*chars_map`
	// is left untouched and OK is still returned. With it, any failure reported
	// by BuildMapInternal is propagated to the caller.
	// static
	util::Status Builder::BuildNFKCMap(CharsMap *chars_map) {
	#ifdef ENABLE_NFKC_COMPILE
	  LOG(INFO) << "Running BuildNFKCMap";
	  RETURN_IF_ERROR(BuildMapInternal(chars_map, ToNFKC, ToNFKD));
	#else
	  LOG(ERROR) << kCompileError;
	#endif

	  return util::OkStatus();
	}

	// Builds the NFC normalization map into `*chars_map`.
	//
	// Requires ENABLE_NFKC_COMPILE. Without it an error is logged, `*chars_map`
	// is left untouched and OK is still returned. With it, any failure reported
	// by BuildMapInternal is propagated to the caller.
	// static
	util::Status Builder::BuildNFCMap(CharsMap *chars_map) {
	#ifdef ENABLE_NFKC_COMPILE
	  LOG(INFO) << "Running BuildNFCMap";
	  RETURN_IF_ERROR(BuildMapInternal(chars_map, ToNFC, ToNFD));
	#else
	  LOG(ERROR) << kCompileError;
	#endif
	  return util::OkStatus();
	}

	// Builds the NFKC map extended with the NMT-specific rules from MergeNmtMap.
	//
	// The map is assembled in a local variable (NFKC, then the NMT overrides,
	// then redundancy removal) and only moved into `*chars_map` once every step
	// has succeeded. Without ENABLE_NFKC_COMPILE an error is logged and
	// `*chars_map` is left untouched.
	// static
	util::Status Builder::BuildNmtNFKCMap(CharsMap *chars_map) {
	#ifdef ENABLE_NFKC_COMPILE
	  LOG(INFO) << "Running BuildNmtNFKCMap";

	  CharsMap merged;
	  RETURN_IF_ERROR(Builder::BuildNFKCMap(&merged));
	  RETURN_IF_ERROR(Builder::MergeNmtMap(&merged));
	  RETURN_IF_ERROR(Builder::RemoveRedundantMap(&merged));

	  chars_map->swap(merged);
	#else
	  LOG(ERROR) << kCompileError;
	#endif

	  return util::OkStatus();
	}

	// Folds Unicode case into `*chars_map` using ICU's default case folding.
	//
	// Every character of every existing target is replaced by its case-folded
	// form. Then, for each Unicode character that has no single-character rule
	// yet and whose folded form differs from itself, a rule mapping it to the
	// folded character is added. Finally redundant rules are removed.
	//
	// Requires ENABLE_NFKC_COMPILE; without it an error is logged and the map is
	// left untouched. Returns an error when `chars_map` is null or when
	// RemoveRedundantMap fails.
	// static
	util::Status Builder::MergeUnicodeCaseFoldMap(Builder::CharsMap *chars_map) {
	#ifdef ENABLE_NFKC_COMPILE
	  CHECK_OR_RETURN(chars_map);

	  // Fold the targets in place. Only the mapped values are modified, so the
	  // map's ordering is unaffected.
	  for (auto &rule : *chars_map) {
	    for (char32 &target_char : rule.second) {
	      target_char = u_foldCase(target_char, U_FOLD_CASE_DEFAULT);
	    }
	  }

	  constexpr int kMaxUnicode = 0x10FFFF;
	  for (char32 cp = 1; cp <= kMaxUnicode; ++cp) {
	    if (!U_IS_UNICODE_CHAR(cp)) {
	      continue;
	    }
	    // Existing rules take precedence over plain case folding.
	    if (chars_map->find({cp}) != chars_map->end()) continue;
	    const char32 folded = u_foldCase(cp, U_FOLD_CASE_DEFAULT);
	    if (folded != cp) (*chars_map)[{cp}] = {folded};
	  }

	  RETURN_IF_ERROR(RemoveRedundantMap(chars_map));
	#else
	  LOG(ERROR) << kCompileError;
	#endif

	  return util::OkStatus();
	}

	// Overlays the NMT-specific rules on top of `*chars_map`.
	//
	// Three groups of edits are applied, overriding any existing rule for the
	// same single code point:
	//   1. a set of whitespace-like code points is mapped to U+0020,
	//   2. a set of control characters is mapped to the empty sequence,
	//   3. the rule for U+FF5E (FULL_WIDTH TILDE) is removed.
	// Always returns OK.
	// static
	util::Status Builder::MergeNmtMap(Builder::CharsMap *chars_map) {
	  // Other code points considered as whitespace.
	  static const char32 kMappedToSpace[] = {
	      0x0009,  // TAB
	      0x000A,  // LINE FEED
	      0x000C,  // FORM FEED
	      0x000D,  // CARRIAGE RETURN
	      0x1680,  // OGHAM SPACE MARK
	      0x200B,  // ZERO WIDTH SPACE
	      0x200E,  // LEFT-TO-RIGHT MARK
	      0x200F,  // RIGHT-TO-LEFT MARK
	      0x2028,  // LINE SEPARATOR
	      0x2029,  // PARAGRAPH SEPARATOR
	      0x2581,  // LOWER ONE EIGHT BLOCK
	      0xFEFF,  // ZERO WIDTH NO-BREAK
	      0xFFFD,  // REPLACEMENT CHARACTER
	      0x200C,  // ZERO WIDTH NON-JOINER
	      // 0x200D (ZERO WIDTH JOINER) is intentionally not listed.
	  };
	  for (const char32 cp : kMappedToSpace) {
	    (*chars_map)[{cp}] = {0x20};
	  }

	  // Ascii Control characters: U+0001..U+0008, U+000B and U+000E..U+001F.
	  // U+0009, U+000A, U+000C and U+000D are handled as whitespace above.
	  for (char32 cp = 0x0001; cp <= 0x0008; ++cp) {
	    (*chars_map)[{cp}] = {};
	  }
	  (*chars_map)[{0x000B}] = {};
	  for (char32 cp = 0x000E; cp <= 0x001F; ++cp) {
	    (*chars_map)[{cp}] = {};
	  }

	  // <control-007F>..<control-009F>
	  // NOTE(review): only these three code points are removed, not the whole
	  // range named above -- confirm whether that is intended before changing
	  // it, since doing so alters the generated normalization rules.
	  static const char32 kOtherControls[] = {0x007F, 0x008F, 0x009F};
	  for (const char32 cp : kOtherControls) {
	    (*chars_map)[{cp}] = {};
	  }

	  // Do not normalize FULL_WIDTH TILDE, since FULL_WIDTH TILDE
	  // and HALF_WIDTH TILDE are used differently in Japanese.
	  chars_map->erase({0xFF5E});

	  return util::OkStatus();
	}

	// Builds the NFKC map combined with Unicode case folding.
	//
	// The map is assembled locally and moved into `*chars_map` only after both
	// steps succeed. Without ENABLE_NFKC_COMPILE an error is logged and
	// `*chars_map` is left untouched.
	// static
	util::Status Builder::BuildNFKC_CFMap(CharsMap *chars_map) {
	#ifdef ENABLE_NFKC_COMPILE
	  CharsMap folded;
	  RETURN_IF_ERROR(BuildNFKCMap(&folded));
	  RETURN_IF_ERROR(MergeUnicodeCaseFoldMap(&folded));
	  chars_map->swap(folded);
	#else
	  LOG(ERROR) << kCompileError;
	#endif

	  return util::OkStatus();
	}

	// Builds the NMT-flavoured NFKC map combined with Unicode case folding.
	//
	// The map is assembled locally and moved into `*chars_map` only after both
	// steps succeed. Without ENABLE_NFKC_COMPILE an error is logged and
	// `*chars_map` is left untouched.
	// static
	util::Status Builder::BuildNmtNFKC_CFMap(CharsMap *chars_map) {
	#ifdef ENABLE_NFKC_COMPILE
	  CharsMap folded;
	  RETURN_IF_ERROR(BuildNmtNFKCMap(&folded));
	  RETURN_IF_ERROR(MergeUnicodeCaseFoldMap(&folded));
	  chars_map->swap(folded);
	#else
	  LOG(ERROR) << kCompileError;
	#endif

	  return util::OkStatus();
	}

	// Adds NFKD decomposition rules to `*chars_map`.
	//
	// For every Unicode character whose NFKD form differs from the character
	// itself, a rule mapping the character to that form is stored. Existing
	// entries of `*chars_map` are kept unless overwritten by such a rule.
	//
	// Requires ENABLE_NFKC_COMPILE; without it an error is logged and the map is
	// left untouched. Returns an error when `chars_map` is null.
	// static
	util::Status Builder::BuildNFKDMap(CharsMap *chars_map) {
	#ifdef ENABLE_NFKC_COMPILE
	  CHECK_OR_RETURN(chars_map);
	  constexpr int kMaxUnicode = 0x10FFFF;
	  for (char32 cp = 1; cp <= kMaxUnicode; ++cp) {
	    if (!U_IS_UNICODE_CHAR(cp)) {
	      continue;
	    }
	    const auto nfkd = ToNFKD({cp});
	    if (nfkd.size() >= 2 || (nfkd.size() == 1 && nfkd[0] != cp)) {
	      (*chars_map)[{cp}] = nfkd;
	    }
	  }
	#else
	  LOG(ERROR) << kCompileError;
	#endif
	  return util::OkStatus();
	}

	// Adds NFD decomposition rules to `*chars_map`.
	//
	// For every Unicode character whose NFD form differs from the character
	// itself, a rule mapping the character to that form is stored. Existing
	// entries of `*chars_map` are kept unless overwritten by such a rule.
	//
	// Requires ENABLE_NFKC_COMPILE; without it an error is logged and the map is
	// left untouched. Returns an error when `chars_map` is null.
	// static
	util::Status Builder::BuildNFDMap(CharsMap *chars_map) {
	#ifdef ENABLE_NFKC_COMPILE
	  CHECK_OR_RETURN(chars_map);
	  constexpr int kMaxUnicode = 0x10FFFF;
	  for (char32 cp = 1; cp <= kMaxUnicode; ++cp) {
	    if (!U_IS_UNICODE_CHAR(cp)) {
	      continue;
	    }
	    const auto nfd = ToNFD({cp});
	    if (nfd.size() >= 2 || (nfd.size() == 1 && nfd[0] != cp)) {
	      (*chars_map)[{cp}] = nfd;
	    }
	  }
	#else
	  LOG(ERROR) << kCompileError;
	#endif
	  return util::OkStatus();
	}

	// Builds the NFKD map combined with Unicode case folding.
	//
	// The map is assembled locally and moved into `*chars_map` only after both
	// steps succeed. Without ENABLE_NFKC_COMPILE an error is logged and
	// `*chars_map` is left untouched.
	// static
	util::Status Builder::BuildNFKD_CFMap(CharsMap *chars_map) {
	#ifdef ENABLE_NFKC_COMPILE
	  CharsMap folded;
	  RETURN_IF_ERROR(BuildNFKDMap(&folded));
	  RETURN_IF_ERROR(MergeUnicodeCaseFoldMap(&folded));
	  chars_map->swap(folded);
	#else
	  LOG(ERROR) << kCompileError;
	#endif
	  return util::OkStatus();
	}

	// Builds the NFC map combined with Unicode case folding.
	//
	// The map is assembled locally and moved into `*chars_map` only after both
	// steps succeed. Without ENABLE_NFKC_COMPILE an error is logged and
	// `*chars_map` is left untouched.
	// static
	util::Status Builder::BuildNFC_CFMap(CharsMap *chars_map) {
	#ifdef ENABLE_NFKC_COMPILE
	  CharsMap nfc_map;
	  // Start from the NFC map; the NFKD builder was previously called here by
	  // mistake, which produced the same result as BuildNFKD_CFMap.
	  RETURN_IF_ERROR(Builder::BuildNFCMap(&nfc_map));
	  RETURN_IF_ERROR(Builder::MergeUnicodeCaseFoldMap(&nfc_map));
	  *chars_map = std::move(nfc_map);
	#else
	  LOG(ERROR) << kCompileError;
	#endif
	  return util::OkStatus();
	}

	// Builds the NFD map combined with Unicode case folding.
	//
	// The map is assembled locally and moved into `*chars_map` only after both
	// steps succeed. Without ENABLE_NFKC_COMPILE an error is logged and
	// `*chars_map` is left untouched.
	// static
	util::Status Builder::BuildNFD_CFMap(CharsMap *chars_map) {
	#ifdef ENABLE_NFKC_COMPILE
	  CharsMap folded;
	  RETURN_IF_ERROR(BuildNFDMap(&folded));
	  RETURN_IF_ERROR(MergeUnicodeCaseFoldMap(&folded));
	  chars_map->swap(folded);
	#else
	  LOG(ERROR) << kCompileError;
	#endif
	  return util::OkStatus();
	}

	// Reads a tab-separated rule file into `*chars_map`, replacing its contents.
	//
	// Each line has the form "<source>\t<target>[\t<ignored>...]", where source
	// and target are space-separated hexadecimal code points, each optionally
	// prefixed with "U+". A line without a target column is a deletion rule
	// (empty target). Columns after the second are ignored.
	//
	// Returns an error when `chars_map` is null, the file cannot be opened, or
	// a line has an empty source.
	// static
	util::Status Builder::LoadCharsMap(absl::string_view filename,
	                                   CharsMap *chars_map) {
	  // Stream the view itself: string_view::data() is not guaranteed to be
	  // NUL-terminated, so printing it as a C string may read past the end.
	  LOG(INFO) << "Loading mapping file: " << filename;
	  CHECK_OR_RETURN(chars_map);

	  auto input = filesystem::NewReadableFile(filename);

	  RETURN_IF_ERROR(input->status());

	  // Parses one column: space-separated hex code points, "U+" prefix optional.
	  const auto parse_code_points =
	      [](const std::string &field) -> std::vector<char32> {
	    std::vector<char32> code_points;
	    for (auto s : absl::StrSplit(field, ' ')) {
	      if (s.empty()) continue;
	      absl::ConsumePrefix(&s, "U+");
	      code_points.push_back(string_util::HexToInt<char32>(s));
	    }
	    return code_points;
	  };

	  std::string line;
	  chars_map->clear();
	  while (input->ReadLine(&line)) {
	    std::vector<std::string> fields =
	        absl::StrSplit(line, '\t', absl::AllowEmpty());
	    CHECK_GE(fields.size(), 1);
	    if (fields.size() == 1) fields.push_back("");  // Deletion rule.
	    const std::vector<char32> src = parse_code_points(fields[0]);
	    const std::vector<char32> trg = parse_code_points(fields[1]);
	    CHECK_OR_RETURN(!src.empty());
	    (*chars_map)[src] = trg;
	  }

	  return util::OkStatus();
	}

	// Writes `chars_map` to `filename`, one rule per line.
	//
	// Each line is "<source>\t<target>\t# <source text> => <target text>", where
	// source and target are space-separated hexadecimal code points and the
	// trailing comment shows both sides as UTF-8 text. Backspace, vertical tab,
	// form feed, newline and carriage return characters in the line are replaced
	// by spaces so that a rule never spans several lines.
	//
	// Returns an error when the file cannot be opened for writing.
	// static
	util::Status Builder::SaveCharsMap(absl::string_view filename,
	                                   const Builder::CharsMap &chars_map) {
	  auto output = filesystem::NewWritableFile(filename);
	  RETURN_IF_ERROR(output->status());

	  for (const auto &c : chars_map) {
	    // Hexadecimal spelling of each code point on both sides of the rule.
	    std::vector<std::string> src, trg;
	    for (char32 v : c.first) {
	      src.push_back(string_util::IntToHex(v));
	    }
	    for (char32 v : c.second) {
	      trg.push_back(string_util::IntToHex(v));
	    }
	    std::string line = absl::StrJoin(src, " ") + "\t" +
	                       absl::StrJoin(trg, " ") + "\t# " +
	                       string_util::UnicodeTextToUTF8(c.first) + " => " +
	                       string_util::UnicodeTextToUTF8(c.second);
	    line = absl::StrReplaceAll(
	        line,
	        {{"\b", " "}, {"\v", " "}, {"\f", " "}, {"\n", " "}, {"\r", " "}});
	    output->WriteLine(line);
	  }

	  return util::OkStatus();
	}

	// Drops every rule from `*chars_map` that is already implied by shorter
	// rules.
	//
	// All single-character rules are kept. A longer rule is kept only when
	// applying the rules retained so far (with keys strictly shorter than it)
	// to its source does not already yield its target. After pruning, every
	// original rule is re-checked against the reduced map.
	//
	// Returns an error when `chars_map` is null, when the longest key has length
	// zero (which includes an empty map), or when the final check fails.
	// static
	util::Status Builder::RemoveRedundantMap(CharsMap *chars_map) {
	  CHECK_OR_RETURN(chars_map);

	  // Seed the reduced map with the single-character rules and find the
	  // longest key along the way.
	  CharsMap reduced;
	  size_t longest_key = 0;
	  for (const auto &rule : *chars_map) {
	    const size_t key_len = rule.first.size();
	    if (key_len > longest_key) {
	      longest_key = key_len;
	    }
	    if (key_len == 1) {
	      reduced.insert(rule);
	    }
	  }
	  CHECK_GT_OR_RETURN(longest_key, 0);

	  // Process keys by increasing length so that each rule is only tested
	  // against rules that are shorter than itself.
	  for (size_t len = 2; len <= longest_key; ++len) {
	    for (const auto &rule : *chars_map) {
	      if (rule.first.size() != len) {
	        continue;
	      }
	      if (rule.second == Normalize(reduced, rule.first, len - 1)) {
	        continue;  // Already produced by shorter rules.
	      }
	      reduced.insert(rule);
	    }
	  }

	  // Verify all characters in `chars_map` are normalized by `reduced`.
	  for (const auto &rule : *chars_map) {
	    CHECK_EQ_OR_RETURN(rule.second,
	                       Normalize(reduced, rule.first, longest_key));
	  }

	  chars_map->swap(reduced);

	  return util::OkStatus();
	}
	} // namespace normalizer
	} // namespace sentencepiece