Spaces:
Sleeping
Sleeping
File size: 5,734 Bytes
2aebc50 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef BUILDER_H_
#define BUILDER_H_
#include <map>
#include <string>
#include <vector>
#include "common.h"
#include "sentencepiece_model.pb.h"
#include "sentencepiece_processor.h"
#include "third_party/absl/strings/string_view.h"
namespace sentencepiece {
namespace normalizer {
// Builder creates a text normalization rule from user-defined string
// to string mappings. The normalization mapping is compiled into
// a single and compact blob index which is stored into the model proto.
// This class also provides pre-defined rules based on Unicode NFKC.
// https://en.wikipedia.org/wiki/Unicode_equivalence#Normalization
class Builder {
public:
// Builder is a collection of static helpers only (every member function
// below is static), so construction and destruction are disabled.
Builder() = delete;
~Builder() = delete;
// Basic Unicode character sequence.
using Chars = std::vector<char32>;
// String-to-string mapping.
using CharsMap = std::map<Chars, Chars>;
// Compiles `chars_map` into a binary blob index written to `output`.
// Counterpart of DecompileCharsMap().
static util::Status CompileCharsMap(const CharsMap &chars_map,
std::string *output);
// Decompiles `blob` into `chars_map`.
static util::Status DecompileCharsMap(absl::string_view blob,
CharsMap *chars_map);
// Returns a pre-compiled binary index with `name`.
static util::Status GetPrecompiledCharsMap(absl::string_view name,
std::string *output);
// Makes a normalization mapping based on NFKC.
//
// Note that Normalizer/Builder classes do not support
// full NFKC normalization, since full NFKC normalization cannot
// be implemented with a simple longest matching string-to-string
// replacement. One unsupported normalization is multiple combining
// marks.
//
// Strings with multiple combining marks cannot correctly
// be normalized, because it needs to sort the combining marks
// with Canonical_Combining_Class (CCC).
// http://unicode.org/reports/tr15/#Multiple_Mark_Figure
//
// Example:
// Original: U+1E0B U+0323
// Decomposed: U+0064 U+0307 U+0323
// NFKD: U+0064 U+0323 U+0307 (Combining characters are sorted by CCC)
// NFKC: U+1E0D U+0307 (U+0064 U+0323 => U+1E0D)
//
// To support the normalization above with a longest matching, we need to
// enumerate all possible permutations of combining marks in advance,
// which is not feasible. For example, suppose the case there are three
// combining marks X, Y and Z, which are sorted into one canonical order
// Z, Y, X with NFK(D|C). In this case, all permutations (XYZ, XZY, YXZ...)
// are normalized into ZYX. When we implement this normalization with
// a longest matching, we need to have 3! rules. XYZ=>ZYX, XZY=>ZYX..
// Since Unicode has more than 100 combining characters, it is not possible
// to expand all permutations.
//
// We will not implement the full NFKC in SentencePiece because
// 1) It is unusual to see decomposed Unicode characters in real text.
// 2) Providing a flexible, user-customizable, and self-contained
// normalizer is the goal of SentencePiece.
//
// TODO(taku): Make NFC, NFD, and NFKD mapping if necessary.
// NOTE(review): BuildNFCMap, BuildNFDMap and BuildNFKDMap are declared
// below, so the TODO above may be stale -- confirm and remove.
static util::Status BuildNFKCMap(CharsMap *chars_map);
// Makes an NFKC-based mapping with NMT specific modifications around
// whitespaces.
static util::Status BuildNmtNFKCMap(CharsMap *chars_map);
// Merges Unicode case folding mapping into `chars_map`.
static util::Status MergeUnicodeCaseFoldMap(CharsMap *chars_map);
// NOTE(review): undocumented in the original header. Presumably merges the
// NMT specific whitespace modifications (see BuildNmtNFKCMap) into
// `chars_map` -- confirm against the implementation.
static util::Status MergeNmtMap(Builder::CharsMap *chars_map);
// Makes NFKC with Unicode case folding.
static util::Status BuildNFKC_CFMap(CharsMap *chars_map);
// Makes NMT NFKC with Unicode case folding.
static util::Status BuildNmtNFKC_CFMap(CharsMap *chars_map);
// Given NFKC maps, convert them to NFKD.
static util::Status BuildNFKDMap(CharsMap *chars_map);
// Builds NFC map.
static util::Status BuildNFCMap(CharsMap *chars_map);
// Builds NFD map.
static util::Status BuildNFDMap(CharsMap *chars_map);
// Makes NFKD with Unicode case folding.
static util::Status BuildNFKD_CFMap(CharsMap *chars_map);
// Makes NFC with Unicode case folding.
static util::Status BuildNFC_CFMap(CharsMap *chars_map);
// Makes NFD with Unicode case folding.
static util::Status BuildNFD_CFMap(CharsMap *chars_map);
// Loads the Chars map saved in `filename` into `chars_map`.
// Format:
// src_uchar1 src_uchar2 ... <tab> trg_uchar1 trg_uchar2...
// (src|trg)_ucharX must be a hex of Unicode code point.
static util::Status LoadCharsMap(absl::string_view filename,
CharsMap *chars_map);
// Saves Chars map to `filename` as TSV.
static util::Status SaveCharsMap(absl::string_view filename,
const CharsMap &chars_map);
// Removes redundant rules from `chars_map`.
// When `chars_map` has "aa" => "bb" and "a" => "b", the first
// rule is not necessary since the second rule can cover the first rule.
static util::Status RemoveRedundantMap(CharsMap *chars_map);
private:
// Befriends the BuilderTest.RemoveRedundantMapTest test case (presumably
// the GoogleTest FRIEND_TEST macro; its definition is not visible here).
FRIEND_TEST(BuilderTest, RemoveRedundantMapTest);
};
} // namespace normalizer
} // namespace sentencepiece
#endif // BUILDER_H_
|