Spaces:
Sleeping
Sleeping
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
namespace sentencepiece { | |
namespace normalizer { | |
namespace { | |
// Space symbol | |
// Replacement char | |
NormalizerSpec MakeDefaultSpec() { | |
return SentencePieceTrainer::GetNormalizerSpec("nmt_nfkc"); | |
} | |
} // namespace | |
// Exercises Normalize() with the default "nmt_nfkc" spec: whitespace
// trimming/collapsing, the leading WS prefix, NFKC rewrites and removal of
// control characters.
TEST(NormalizerTest, NormalizeTest) {
  auto spec = MakeDefaultSpec();
  const Normalizer normalizer(spec);

  // Empty strings.
  EXPECT_EQ("", normalizer.Normalize(""));
  EXPECT_EQ("", normalizer.Normalize(" "));
  EXPECT_EQ("", normalizer.Normalize(" "));

  // Sentence with heading/tailing/redundant spaces.
  EXPECT_EQ(WS "ABC", normalizer.Normalize("ABC"));
  EXPECT_EQ(WS "ABC", normalizer.Normalize(" ABC "));
  EXPECT_EQ(WS "A" WS "B" WS "C", normalizer.Normalize(" A B C "));
  EXPECT_EQ(WS "ABC", normalizer.Normalize(" ABC "));
  EXPECT_EQ(WS "ABC", normalizer.Normalize(" ABC "));
  EXPECT_EQ(WS "ABC", normalizer.Normalize(" ABC"));
  EXPECT_EQ(WS "ABC", normalizer.Normalize(" ABC "));

  // NFKC char to char normalization.
  EXPECT_EQ(WS "123", normalizer.Normalize("①②③"));

  // NFKC char to multi-char normalization.
  EXPECT_EQ(WS "株式会社", normalizer.Normalize("㍿"));

  // Half width katakana, character composition happens.
  // The input is spelled with hex escapes (U+FF78 U+FF9E U+FF70 U+FF78 U+FF9E
  // U+FF99) so that it really is halfwidth: a fullwidth literal would be
  // indistinguishable from the expected output and no composition would be
  // exercised.
  EXPECT_EQ(WS "グーグル",
            normalizer.Normalize(" \xEF\xBD\xB8\xEF\xBE\x9E\xEF\xBD\xB0"
                                 "\xEF\xBD\xB8\xEF\xBE\x9E\xEF\xBE\x99 "));

  EXPECT_EQ(WS "I" WS "saw" WS "a" WS "girl",
            normalizer.Normalize(" I saw a girl "));

  // Remove control chars.
  EXPECT_EQ("", normalizer.Normalize(string_util::UnicodeCharToUTF8(0x7F)));
  EXPECT_EQ("", normalizer.Normalize(string_util::UnicodeCharToUTF8(0x8F)));
  EXPECT_EQ("", normalizer.Normalize(string_util::UnicodeCharToUTF8(0x9F)));
  EXPECT_EQ("", normalizer.Normalize(string_util::UnicodeCharToUTF8(0x0B)));
  for (char32 c = 0x10; c <= 0x1F; ++c) {
    EXPECT_EQ("", normalizer.Normalize(string_util::UnicodeCharToUTF8(c)));
  }
}
// With add_dummy_prefix disabled the output must not start with WS; only
// whitespace between words is escaped.
TEST(NormalizerTest, NormalizeWithoutDummyPrefixTest) {
  auto no_prefix_spec = MakeDefaultSpec();
  no_prefix_spec.set_add_dummy_prefix(false);
  const Normalizer normalizer(no_prefix_spec);

  struct Case {
    const char *expected;
    const char *input;
  };
  const Case kCases[] = {
      // Empty strings.
      {"", ""},
      {"", " "},
      {"", " "},
      // Sentence with heading/tailing/redundant spaces.
      {"ABC", "ABC"},
      {"ABC", " ABC "},
      {"A" WS "B" WS "C", " A B C "},
      {"ABC", " ABC "},
      {"ABC", " ABC "},
      {"ABC", " ABC"},
      {"ABC", " ABC "},
  };
  for (const Case &tc : kCases) {
    EXPECT_EQ(tc.expected, normalizer.Normalize(tc.input));
  }
}
// When the trainer spec asks for whitespace to be treated as a suffix, WS is
// expected after each word instead of in front of it.
TEST(NormalizerTest, NormalizeTreatWSAsSuffixTest) {
  TrainerSpec suffix_trainer_spec;
  suffix_trainer_spec.set_treat_whitespace_as_suffix(true);
  auto normalizer_spec = MakeDefaultSpec();
  const Normalizer normalizer(normalizer_spec, suffix_trainer_spec);

  // Inputs made only of whitespace (or nothing) normalize to nothing.
  for (const char *blank : {"", " ", " "}) {
    EXPECT_EQ("", normalizer.Normalize(blank));
  }

  // Words get a trailing WS regardless of surrounding spaces.
  struct Case {
    const char *expected;
    const char *input;
  };
  const Case kCases[] = {
      {"ABC" WS, "ABC"},
      {"ABC" WS, " ABC "},
      {"A" WS "B" WS "C" WS, " A B C "},
      {"ABC" WS, " ABC "},
  };
  for (const Case &tc : kCases) {
    EXPECT_EQ(tc.expected, normalizer.Normalize(tc.input));
  }
}
// With remove_extra_whitespaces disabled every input space must survive as
// its own WS, in addition to the WS added as dummy prefix.
TEST(NormalizerTest, NormalizeWithoutRemoveExtraWhitespacesTest) {
  auto spec = MakeDefaultSpec();
  spec.set_remove_extra_whitespaces(false);
  const Normalizer normalizer(spec);

  // Empty strings.
  EXPECT_EQ("", normalizer.Normalize(""));
  // Six spaces: the dummy prefix plus one WS per space gives seven WS. The
  // run is built with std::string so its length cannot be lost to whitespace
  // collapsing of the source text.
  EXPECT_EQ(WS WS WS WS WS WS WS, normalizer.Normalize(std::string(6, ' ')));
  EXPECT_EQ(WS WS, normalizer.Normalize(" "));

  // Sentence with heading/tailing/redundant spaces.
  EXPECT_EQ(WS "ABC", normalizer.Normalize("ABC"));
  EXPECT_EQ(WS WS "ABC" WS, normalizer.Normalize(" ABC "));
  // Two spaces before, between and after the letters: each pair yields
  // WS WS, and the leading pair is preceded by the dummy-prefix WS.
  EXPECT_EQ(WS WS WS "A" WS WS "B" WS WS "C" WS WS,
            normalizer.Normalize("  A  B  C  "));
}
// With escaping and the dummy prefix turned off, whitespace stays a plain
// space; surrounding spaces are still stripped.
TEST(NormalizerTest, NormalizeWithoutEscapeWhitespacesTest) {
  auto plain_space_spec = MakeDefaultSpec();
  plain_space_spec.set_add_dummy_prefix(false);
  plain_space_spec.set_remove_extra_whitespaces(true);
  plain_space_spec.set_escape_whitespaces(false);
  const Normalizer normalizer(plain_space_spec);

  struct Case {
    const char *expected;
    const char *input;
  };
  const Case kCases[] = {
      // Empty strings.
      {"", ""},
      {"", " "},
      {"", " "},
      // Sentence with heading/tailing/redundant spaces.
      {"ABC", "ABC"},
      {"ABC", " ABC "},
      {"A B C", " A B C "},
      {"A B C", "A B C"},
  };
  for (const Case &tc : kCases) {
    EXPECT_EQ(tc.expected, normalizer.Normalize(tc.input));
  }
}
// Checks how spaces that appear inside normalization-rule targets interact
// with add_dummy_prefix, remove_extra_whitespaces and escape_whitespaces.
TEST(NormalizeTest, NomalizeWithSpaceContainedRules) {
  Builder::CharsMap charsmap;

  // Registers a rule that rewrites the UTF-8 string `src` into `trg`.
  auto AddRule = [&](const std::string &src, const std::string &trg) {
    Builder::Chars src_chars, trg_chars;
    for (const char32 c : string_util::UTF8ToUnicodeText(src)) {
      src_chars.push_back(c);
    }
    for (const char32 c : string_util::UTF8ToUnicodeText(trg)) {
      trg_chars.push_back(c);
    }
    charsmap[src_chars] = trg_chars;
  };

  // Adds rules containing whitespaces.
  AddRule("a", " A");
  AddRule("b", "B");
  AddRule("c", "D E");
  AddRule("d", " F G ");

  NormalizerSpec spec;
  EXPECT_TRUE(
      Builder::CompileCharsMap(charsmap, spec.mutable_precompiled_charsmap())
          .ok());

  // Test default behavior
  {
    const Normalizer normalizer(spec);
    EXPECT_EQ(WS "A", normalizer.Normalize("a"));
    EXPECT_EQ(WS "B" WS "A", normalizer.Normalize("ba"));
    EXPECT_EQ(WS "D" WS "E", normalizer.Normalize("c"));
    EXPECT_EQ(WS "F" WS "G" WS "A", normalizer.Normalize("da"));
    EXPECT_EQ(WS "A" WS "F" WS "G", normalizer.Normalize("ad"));
    EXPECT_EQ(WS "A" WS "F" WS "G" WS "B", normalizer.Normalize("adb"));
  }

  // From here on whitespace is emitted as a plain space.
  spec.set_escape_whitespaces(false);

  // No dummy prefix, extra whitespace removed.
  {
    spec.set_add_dummy_prefix(false);
    spec.set_remove_extra_whitespaces(true);
    const Normalizer normalizer(spec);
    EXPECT_EQ("A", normalizer.Normalize("a"));
    EXPECT_EQ("B A", normalizer.Normalize("ba"));
    EXPECT_EQ("D E", normalizer.Normalize("c"));
    EXPECT_EQ("F G A", normalizer.Normalize("da"));
    EXPECT_EQ("A F G", normalizer.Normalize("ad"));
    EXPECT_EQ("A F G B", normalizer.Normalize("adb"));
  }

  // No dummy prefix, extra whitespace kept: the rule targets are expected to
  // be concatenated verbatim.
  {
    spec.set_add_dummy_prefix(false);
    spec.set_remove_extra_whitespaces(false);
    const Normalizer normalizer(spec);
    EXPECT_EQ(" A", normalizer.Normalize("a"));
    EXPECT_EQ("B A", normalizer.Normalize("ba"));
    EXPECT_EQ("D E", normalizer.Normalize("c"));
    // " F G " followed by " A": two spaces between G and A.
    EXPECT_EQ(" F G  A", normalizer.Normalize("da"));
    EXPECT_EQ(" A F G ", normalizer.Normalize("ad"));
    EXPECT_EQ(" A F G B", normalizer.Normalize("adb"));
  }

  // Dummy prefix added, extra whitespace removed.
  {
    spec.set_add_dummy_prefix(true);
    spec.set_remove_extra_whitespaces(true);
    const Normalizer normalizer(spec);
    EXPECT_EQ(" A", normalizer.Normalize("a"));
    EXPECT_EQ(" B A", normalizer.Normalize("ba"));
    EXPECT_EQ(" D E", normalizer.Normalize("c"));
    EXPECT_EQ(" F G A", normalizer.Normalize("da"));
    EXPECT_EQ(" A F G", normalizer.Normalize("ad"));
    EXPECT_EQ(" A F G B", normalizer.Normalize("adb"));
  }

  // Dummy prefix added, extra whitespace kept: one prefix space in front of
  // the verbatim concatenation, so targets that already start with a space
  // end up with two leading spaces.
  {
    spec.set_add_dummy_prefix(true);
    spec.set_remove_extra_whitespaces(false);
    const Normalizer normalizer(spec);
    EXPECT_EQ("  A", normalizer.Normalize("a"));
    EXPECT_EQ(" B A", normalizer.Normalize("ba"));
    EXPECT_EQ(" D E", normalizer.Normalize("c"));
    EXPECT_EQ("  F G  A", normalizer.Normalize("da"));
    EXPECT_EQ("  A F G ", normalizer.Normalize("ad"));
    EXPECT_EQ("  A F G B", normalizer.Normalize("adb"));
  }

  // Added several corner cases around spaces.
  struct SpacePattern {
    bool add_dummy_prefix;
    bool remove_extra_whitespaces;
    bool escape_whitespaces;
    const char *input;
    const char *expected;
  };

  // Every flag combination applied to a lone WS and to a lone space.
  // {true, false, false, " "} expects two spaces (prefix + input), mirroring
  // the WS WS expected by its escaped counterpart on the same row.
  constexpr SpacePattern kSpacePatternData[] = {
      {false, false, false, WS, WS},    {false, false, true, WS, WS},
      {false, true, false, WS, WS},     {false, true, true, WS, ""},
      {true, false, false, WS, " " WS}, {true, false, true, WS, WS WS},
      {true, true, false, WS, " " WS},  {true, true, true, WS, ""},
      {false, false, false, " ", " "},  {false, false, true, " ", WS},
      {false, true, false, " ", ""},    {false, true, true, " ", ""},
      {true, false, false, " ", "  "},  {true, false, true, " ", WS WS},
      {true, true, false, " ", ""},     {true, true, true, " ", ""}};

  for (const auto &c : kSpacePatternData) {
    spec.set_add_dummy_prefix(c.add_dummy_prefix);
    spec.set_remove_extra_whitespaces(c.remove_extra_whitespaces);
    spec.set_escape_whitespaces(c.escape_whitespaces);
    const Normalizer normalizer(spec);
    EXPECT_EQ(c.expected, normalizer.Normalize(c.input));
  }
}
// Each byte of a malformed UTF-8 sequence is expected to come back as one
// replacement character (RC), while valid neighbours are left alone.
TEST(NormalizerTest, NormalizeReplacementChar) {
  auto no_prefix_spec = MakeDefaultSpec();
  no_prefix_spec.set_add_dummy_prefix(false);
  const Normalizer normalizer(no_prefix_spec);

  struct Case {
    const char *expected;
    const char *input;
  };
  const Case kCases[] = {
      {"abc" RC "xy", "abc\x80xy"},
      {"abc" RC, "abc\xc3"},
      {"ab" RC RC "xy", "ab\xe3\x81xy"},
      {"a" RC RC RC "xy", "a\xf3\x81\x81xy"},
      {"ab" RC RC "xy", "ab\xc0\x82xy"},
  };
  for (const Case &tc : kCases) {
    EXPECT_EQ(tc.expected, normalizer.Normalize(tc.input));
  }
}
// Exercises the three-argument Normalize(), which also fills `n2i`. In every
// case below n2i holds, for each byte of the normalized output, the byte
// offset in the input that it is attributed to, followed by one trailing end
// offset.
TEST(NormalizerTest, NormalizeFullTest) {
  std::vector<size_t> n2i;
  std::string output;
  auto spec = MakeDefaultSpec();
  const Normalizer normalizer(spec);

  {
    const std::string input = "I saw a girl";
    EXPECT_TRUE(normalizer.Normalize(input, &output, &n2i).ok());
    EXPECT_EQ(WS "I" WS "saw" WS "a" WS "girl", output);
    const std::vector<size_t> expected = {0,  0,  0,   // WS (3byte)
                                          0,           // I
                                          1,  1,  1,   // WS
                                          2,  3,  4,   // saw
                                          5,  5,  5,   // WS
                                          6,           // a
                                          7,  7,  7,   // WS
                                          8,  9,  10, 11,  // girl
                                          12};
    EXPECT_EQ(expected, n2i);
  }

  {
    // Whitespace layout chosen to match the offsets asserted below: one
    // leading space, "I" at 1, three spaces at 2-4, "saw" at 5-7, one space
    // at 8, "a" at 9, seven spaces at 10-16, "girl" at 17-20 and one trailing
    // space. The runs are built with std::string so their lengths stay
    // explicit.
    const std::string input =
        " I" + std::string(3, ' ') + "saw a" + std::string(7, ' ') + "girl ";
    EXPECT_TRUE(normalizer.Normalize(input, &output, &n2i).ok());
    LOG(INFO) << output;
    EXPECT_EQ(WS "I" WS "saw" WS "a" WS "girl", output);
    const std::vector<size_t> expected = {1,  1,  1,   // WS (3byte)
                                          1,           // I
                                          2,  2,  2,   // WS
                                          5,  6,  7,   // saw
                                          8,  8,  8,   // WS
                                          9,           // a
                                          10, 10, 10,  // WS
                                          17, 18, 19, 20,  // girl
                                          21};
    EXPECT_EQ(expected, n2i);
  }

  {
    // Halfwidth katakana, written as hex escapes (U+FF78 U+FF9E U+FF70 U+FF78
    // U+FF9E U+FF99, three bytes each) so the literal cannot be confused with
    // its fullwidth form. The base letter plus voiced mark occupy six input
    // bytes, which is why the offsets below step 1 -> 7 and 10 -> 16.
    const std::string input =
        " \xEF\xBD\xB8\xEF\xBE\x9E\xEF\xBD\xB0"
        "\xEF\xBD\xB8\xEF\xBE\x9E\xEF\xBE\x99 ";
    EXPECT_TRUE(normalizer.Normalize(input, &output, &n2i).ok());
    EXPECT_EQ(WS "グーグル", output);
    const std::vector<size_t> expected = {1,  1,  1,   // WS (3byte)
                                          1,  1,  1,   // グ
                                          7,  7,  7,   // ー
                                          10, 10, 10,  // グ
                                          16, 16, 16,  // ル
                                          19};
    EXPECT_EQ(expected, n2i);
  }

  {
    const std::string input = "①②③";
    EXPECT_TRUE(normalizer.Normalize(input, &output, &n2i).ok());
    EXPECT_EQ(WS "123", output);
    const std::vector<size_t> expected = {0, 0, 0,  // WS (3byte)
                                          0,        // 1
                                          3,        // 2
                                          6,        // 3
                                          9};
    EXPECT_EQ(expected, n2i);
  }

  {
    const std::string input = "㍿";
    EXPECT_TRUE(normalizer.Normalize(input, &output, &n2i).ok());
    EXPECT_EQ(WS "株式会社", output);
    const std::vector<size_t> expected = {0, 0, 0,  // WS (3byte)
                                          0, 0, 0,  // 株
                                          0, 0, 0,  // 式
                                          0, 0, 0,  // 会
                                          0, 0, 0,  // 社
                                          3};
    // When "株式" is one piece, this has no alignment to the input.
    // Sentencepieces which includes the last character ("会社" or "社")
    // have the alignment to the input.
    EXPECT_EQ(expected, n2i);
  }
}
// Round-trips a (trie, normalized) pair through the precompiled-charsmap
// encoding and checks that an empty blob is rejected.
TEST(NormalizerTest, EncodeDecodePrecompiledCharsMapTest) {
  std::string scratch;
  absl::string_view decoded_trie, decoded_normalized;

  // Encoding followed by decoding yields the original two parts.
  const std::string encoded =
      Normalizer::EncodePrecompiledCharsMap("foo", "bar");
  const auto decode_status = Normalizer::DecodePrecompiledCharsMap(
      encoded, &decoded_trie, &decoded_normalized, &scratch);
  EXPECT_TRUE(decode_status.ok());
  EXPECT_EQ("foo", decoded_trie);
  EXPECT_EQ("bar", decoded_normalized);

  // An empty blob cannot be decoded.
  const auto empty_status = Normalizer::DecodePrecompiledCharsMap(
      "", &decoded_trie, &decoded_normalized, &scratch);
  EXPECT_FALSE(empty_status.ok());
}
// Checks Normalizer::status() for an empty spec, a spec carrying a bogus
// precompiled charsmap, and the default spec.
TEST(NormalizerTest, StatusTest) {
  // Builds a normalizer from `candidate` and reports whether it is healthy.
  const auto builds_ok = [](const NormalizerSpec &candidate) {
    const Normalizer normalizer(candidate);
    return normalizer.status().ok();
  };

  NormalizerSpec spec;
  EXPECT_TRUE(builds_ok(spec));  // fallback to identity.

  spec.set_precompiled_charsmap("x");
  EXPECT_FALSE(builds_ok(spec));

  spec = MakeDefaultSpec();
  EXPECT_TRUE(builds_ok(spec));
}
// PrefixMatch() is expected to return the byte length of the longest
// registered key that prefixes the input, or the byte length of the first
// character when nothing matches; GlobalReplace() substitutes every match.
TEST(NormalizerTest, PrefixMatcherTest) {
  const PrefixMatcher matcher({"abc", "ab", "xy", "京都"});

  struct Case {
    const char *input;
    int expected_length;
    bool expected_found;
  };
  const Case kCases[] = {
      {"test", 1, false},     {"abcd", 3, true},      {"abxy", 2, true},
      {"x", 1, false},        {"xyz", 2, true},       {"京都大学", 6, true},
      {"東京大学", 3, false},
  };

  bool matched;
  for (const Case &tc : kCases) {
    EXPECT_EQ(tc.expected_length, matcher.PrefixMatch(tc.input, &matched));
    EXPECT_EQ(tc.expected_found, matched);
  }

  EXPECT_EQ("", matcher.GlobalReplace("", ""));
  EXPECT_EQ("", matcher.GlobalReplace("abc", ""));
  EXPECT_EQ("--de-pqr", matcher.GlobalReplace("xyabcdeabpqr", "-"));
}
// A matcher built from no keys never reports a match: PrefixMatch() is
// expected to return the byte length of the first character and
// GlobalReplace() to leave its input untouched.
TEST(NormalizerTest, PrefixMatcherWithEmptyTest) {
  const PrefixMatcher matcher({});

  struct Case {
    const char *input;
    int expected_length;
  };
  const Case kCases[] = {
      {"test", 1}, {"abcd", 1},     {"abxy", 1},     {"x", 1},
      {"xyz", 1},  {"京都大学", 3}, {"東京大学", 3},
  };

  bool matched;
  for (const Case &tc : kCases) {
    EXPECT_EQ(tc.expected_length, matcher.PrefixMatch(tc.input, &matched));
    EXPECT_FALSE(matched);
  }

  EXPECT_EQ("", matcher.GlobalReplace("", ""));
  EXPECT_EQ("abc", matcher.GlobalReplace("abc", ""));
}
} // namespace normalizer | |
} // namespace sentencepiece | |