Spaces:
Sleeping
Sleeping
File size: 5,817 Bytes
2aebc50 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.!
#ifndef NORMALIZER_NORMALIZER_H_
#define NORMALIZER_NORMALIZER_H_
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "common.h"
#include "sentencepiece_model.pb.h"
#include "sentencepiece_processor.h"
#include "third_party/absl/strings/string_view.h"
#include "third_party/darts_clone/darts.h"
namespace sentencepiece {
namespace normalizer {
// Given a list of strings, finds the longest string which is a
// prefix of a query.
class PrefixMatcher {
public:
// Initializes the PrefixMatcher with `dic`.
explicit PrefixMatcher(const std::set<absl::string_view> &dic);
// Finds the longest string in dic, which is a prefix of `w`.
// Returns the UTF8 byte length of matched string.
// `found` is set if a prefix match exists.
// If no entry is found, consumes one Unicode character.
int PrefixMatch(absl::string_view w, bool *found = nullptr) const;
// Replaces entries in `w` with `out`.
std::string GlobalReplace(absl::string_view w, absl::string_view out) const;
private:
std::unique_ptr<Darts::DoubleArray> trie_;
};
// Normalizer implements a simple text normalizer with
// user-defined string-to-string rules and leftmost longest
// matching. The rules of Normalizer are built with
// Builder::CompileCharsMap() method. Pre-compiled rules are
// also available via Builder::GetPrecompiledCharsMap(<name>) method.
//
// The motivation of Normalizer is to make flexible, user-customizable
// and self-contained normalizer. All the logic of normalization is
// encoded in the model proto which allows us to define language/task
// dependent normalization rules without breaking the default rule.
class Normalizer {
public:
// Instantiates Normalizer with |spec|.
// |spec| should not be deleted until Normalizer is destroyed.
explicit Normalizer(const NormalizerSpec &spec);
Normalizer(const NormalizerSpec &spec, const TrainerSpec &trainer_Spec);
virtual ~Normalizer();
virtual void SetPrefixMatcher(const PrefixMatcher *matcher) {
matcher_ = matcher;
}
// Returns Status.
// Normalizes function is valid only when status is OK.
virtual util::Status status() const { return status_; }
// Normalizes a plain utf8 string into an internal representation for
// Sentencepiece model. |norm_to_orig| stores the byte-alignment from
// normalized string to the original input.
// This function can do the following normalizations:
// - Character normalization.
// (NFKC / full-width to half-width conversion etc).
// - Adds a prefix space.
// - Replaces a space with a meta symbol.
// - Removing heading, tailing and other redundant spaces.
virtual util::Status Normalize(absl::string_view input,
std::string *normalized,
std::vector<size_t> *norm_to_orig) const;
// Returns a normalized string without alignments.
// This function is used in sentencepiece training.
virtual std::string Normalize(absl::string_view input) const;
friend class Builder;
private:
FRIEND_TEST(NormalizerTest, EncodeDecodePrecompiledCharsMapTest);
void Init();
// Normalizes the prefix of |input| and returns the pair of
// normalized prefix and length we must consume after
// normalization.
// Here's the sample code for the full text normalization.
//
// string output;
// absl::string_view input = "...";
// while (!input.empty()) {
// const auto p = normalizer.NormalizePrefix(input);
// output.append(p.first.data(), p.first.size());
// input.remove_prefix(p.second);
// }
std::pair<absl::string_view, int> NormalizePrefix(
absl::string_view input) const;
// Encodes trie_blob and normalized string and return compiled blob.
static std::string EncodePrecompiledCharsMap(absl::string_view trie_blob,
absl::string_view normalized);
// Decodes blob into trie_blob and normalized string.
static util::Status DecodePrecompiledCharsMap(absl::string_view blob,
absl::string_view *trie_blob,
absl::string_view *normalized,
std::string *buffer = nullptr);
// Maximum size of the return value of Trie, which corresponds
// to the maximum size of shared common prefix in the chars map.
static constexpr int kMaxTrieResultsSize = 32;
// Internal trie for efficient longest matching.
std::unique_ptr<Darts::DoubleArray> trie_;
// "\0" delimitered output string.
// the value of |trie_| stores pointers to this string.
const char *normalized_ = nullptr;
// Spec for normalization.
const NormalizerSpec *spec_;
// Prefix matcher;
const PrefixMatcher *matcher_ = nullptr;
// Split hello world into "hello_" and "world_" instead of
// "_hello" and "_world".
const bool treat_whitespace_as_suffix_ = false;
#ifdef IS_BIG_ENDIAN
// Stores the blob for TRIE encoded in big-endian.
std::string precompiled_charsmap_buffer_;
#endif
// Normalizer's status.
util::Status status_;
};
} // namespace normalizer
} // namespace sentencepiece
#endif // NORMALIZER_NORMALIZER_H_
|