Spaces:

nvidia
/

Plan2Align-NV

Sleeping

Plan2Align-NV / laser /tools-external /sentencepiece-master /src /builder.h

KuangDW

add laser tool

2aebc50 26 days ago

5.73 kB

	// Copyright 2016 Google Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#ifndef BUILDER_H_
	#define BUILDER_H_

	#include <map>
	#include <string>
	#include <vector>

	#include "common.h"
	#include "sentencepiece_model.pb.h"
	#include "sentencepiece_processor.h"
	#include "third_party/absl/strings/string_view.h"

	namespace sentencepiece {
	namespace normalizer {

	// Builder creates a text normalization rule from user-defined string
	// to string mappings. The normalization mapping is compiled into
	// a single and compact blob index which is stored into the model proto.
	// This class also provides pre-defined rules based on Unicode NFKC.
	// https://en.wikipedia.org/wiki/Unicode_equivalence#Normalization
	class Builder {
	 public:
	  // Builder only groups static functions: both the constructor and the
	  // destructor are deleted, so no instance can ever be created.
	  Builder() = delete;
	  ~Builder() = delete;

	  // Basic Unicode character sequence.
	  using Chars = std::vector<char32>;

	  // String-to-string mapping.
	  using CharsMap = std::map<Chars, Chars>;

	  // Compiles `chars_map` into a compact binary blob index written to
	  // `output`. This is the counterpart of DecompileCharsMap().
	  static util::Status CompileCharsMap(const CharsMap &chars_map,
	                                      std::string *output);

	  // Decompiles `blob` into `chars_map`.
	  static util::Status DecompileCharsMap(absl::string_view blob,
	                                        CharsMap *chars_map);

	  // Returns a pre-compiled binary index with `name`.
	  static util::Status GetPrecompiledCharsMap(absl::string_view name,
	                                             std::string *output);

	  // Makes a normalization mapping based on NFKC.
	  //
	  // Note that Normalizer/Builder classes do not support
	  // full NFKC normalization, since full NFKC normalization cannot
	  // be implemented with a simple longest matching string-to-string
	  // replacement. One unsupported normalization is multiple combining
	  // marks.
	  //
	  // Strings with multiple combining marks cannot correctly
	  // be normalized, because it needs to sort the combining marks
	  // with Canonical_Combining_Class (CCC).
	  // http://unicode.org/reports/tr15/#Multiple_Mark_Figure
	  //
	  // Example:
	  //   Original:   U+1E0B U+0323
	  //   Decomposed: U+0064 U+0307 U+0323
	  //   NFKD:       U+0064 U+0323 U+0307 (Combining characters are sorted by CCC)
	  //   NFKC:       U+1E0D U+0307 (U+0064 U+0323 => U+1E0D)
	  //
	  // To support the normalization above with a longest matching, we need to
	  // enumerate all possible permutations of combining marks in advance,
	  // which is not feasible. For example, suppose the case there are three
	  // combining marks X, Y and Z, which are sorted into one canonical order
	  // Z, Y, X with NFK(D|C). In this case, all permutations (XYZ, XZY, YXZ...)
	  // are normalized into ZYX. When we implement this normalization with
	  // a longest matching, we need to have 3! rules. XYZ=>ZYX, XZY=>ZYX..
	  // Since Unicode has more than 100 combining characters, it is not possible
	  // to expand all permutations.
	  //
	  // We will not implement the full NFKC in SentencePiece because
	  //  1) It is unusual to see decomposed Unicode characters in real text.
	  //  2) Providing a flexible, user-customizable, and self-contained
	  //     normalizer is the goal of SentencePiece.
	  //
	  // TODO(taku): Make NFC, NFD, and NFKD mapping if necessary.
	  static util::Status BuildNFKCMap(CharsMap *chars_map);

	  // Makes an NFKC-based mapping with NMT specific modifications around
	  // whitespaces.
	  static util::Status BuildNmtNFKCMap(CharsMap *chars_map);

	  // Merges Unicode case folding mapping into `chars_map`.
	  static util::Status MergeUnicodeCaseFoldMap(CharsMap *chars_map);

	  // Merges the NMT specific rules into `chars_map`.
	  // NOTE(review): presumably these are the whitespace-related modifications
	  // mentioned for BuildNmtNFKCMap() -- confirm against the implementation.
	  static util::Status MergeNmtMap(CharsMap *chars_map);

	  // Makes NFKC with Unicode case folding.
	  static util::Status BuildNFKC_CFMap(CharsMap *chars_map);

	  // Makes NMT NFKC with Unicode case folding.
	  static util::Status BuildNmtNFKC_CFMap(CharsMap *chars_map);

	  // Given NFKC maps, convert them to NFKD.
	  static util::Status BuildNFKDMap(CharsMap *chars_map);

	  // Builds NFC map.
	  static util::Status BuildNFCMap(CharsMap *chars_map);

	  // Builds NFD map.
	  static util::Status BuildNFDMap(CharsMap *chars_map);

	  // Makes NFKD with Unicode case folding.
	  static util::Status BuildNFKD_CFMap(CharsMap *chars_map);

	  // Makes NFC with Unicode case folding.
	  static util::Status BuildNFC_CFMap(CharsMap *chars_map);

	  // Makes NFD with Unicode case folding.
	  static util::Status BuildNFD_CFMap(CharsMap *chars_map);

	  // Loads the chars map saved in `filename` into `chars_map`.
	  // Format:
	  //   src_uchar1 src_uchar2 ... <tab> trg_uchar1 trg_uchar2...
	  //   (src|trg)_ucharX must be a hex of Unicode code point.
	  static util::Status LoadCharsMap(absl::string_view filename,
	                                   CharsMap *chars_map);

	  // Saves Chars map to `filename` as TSV.
	  static util::Status SaveCharsMap(absl::string_view filename,
	                                   const CharsMap &chars_map);

	  // Removes redundant rules from `chars_map`.
	  // When `chars_map` has "aa" => "bb" and "a" => "b", the first
	  // rule is not necessary since the second rule can cover the first rule.
	  static util::Status RemoveRedundantMap(CharsMap *chars_map);

	 private:
	  // Test hook for BuilderTest.RemoveRedundantMapTest (FRIEND_TEST is a macro
	  // defined outside this header).
	  FRIEND_TEST(BuilderTest, RemoveRedundantMapTest);
	};
	} // namespace normalizer
	} // namespace sentencepiece
	#endif // BUILDER_H_