Spaces:
Sleeping
Sleeping
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
namespace sentencepiece { | |
namespace normalizer { | |
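// Space symbol.
// NOTE(review): the WS macro used by the tests below is not defined anywhere
// in this copy of the file; its definition presumably belonged right here.
// Confirm against the original source and restore it.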
// Checks which rules survive Builder::RemoveRedundantMap(): of the four rules
// registered below, only the "ab" rule is expected to be dropped.
TEST(BuilderTest, RemoveRedundantMapTest) {
  // Rules under test: a => A, b => B, ab => AB, abc => CBA.
  Builder::CharsMap rules;
  rules[{0x0061, 0x0062, 0x0063}] = {0x0043, 0x0042, 0x0041};
  rules[{0x0061, 0x0062}] = {0x0041, 0x0042};
  rules[{0x0062}] = {0x0042};
  rules[{0x0061}] = {0x0041};

  const auto status = Builder::RemoveRedundantMap(&rules);
  EXPECT_TRUE(status.ok());

  // True when a rule keyed by `source` is still present in the map.
  const auto has_rule = [&rules](const std::vector<char32>& source) {
    return rules.find(source) != rules.end();
  };

  EXPECT_EQ(3, rules.size());
  EXPECT_FALSE(has_rule({0x0061, 0x0062}));
  EXPECT_TRUE(has_rule({0x0061}));
  EXPECT_TRUE(has_rule({0x0062}));
  EXPECT_TRUE(has_rule({0x0061, 0x0062, 0x0063}));
}
// Expects a non-OK status from Builder::GetPrecompiledCharsMap() for the
// empty name and for the name "__UNKNOWN__".
TEST(BuilderTest, GetPrecompiledCharsMapWithInvalidNameTest) {
  std::string precompiled;
  for (const char* const invalid_name : {"", "__UNKNOWN__"}) {
    EXPECT_FALSE(
        Builder::GetPrecompiledCharsMap(invalid_name, &precompiled).ok());
  }
}
// Builds the NFKC map and expects an OK status and a non-empty result, then
// builds it a second time into the same map and expects OK again.
TEST(BuilderTest, BuildNFKCMapTest) {
  Builder::CharsMap nfkc_map;

  const auto first_status = Builder::BuildNFKCMap(&nfkc_map);
  EXPECT_TRUE(first_status.ok());
  EXPECT_FALSE(nfkc_map.empty());

  // NOTE(review): this repeated call looks like it may be the second arm of a
  // preprocessor conditional whose directives are missing from this copy --
  // confirm against the original source before relying on it.
  const auto second_status = Builder::BuildNFKCMap(&nfkc_map);
  EXPECT_TRUE(second_status.ok());
}
// Normalizes sample strings with each normalization rule name obtained from
// SentencePieceTrainer::GetNormalizerSpec() and checks the output.
//
// The non-ASCII literals are UTF-8 text: fullwidth Latin letters ("ＡＢＣ"),
// the parenthesized ideograph U+3231 ("㈱") and half-width katakana
// ("ｸﾞｰｸﾞﾙ"). They had been corrupted into mis-decoded characters, which made
// the expectations meaningless; they are restored here.
//
// Every expected string starts with WS because these specs are used without
// turning off the dummy prefix (compare the CompileCharsMap test, which calls
// set_add_dummy_prefix(false) and expects no leading WS).
TEST(BuilderTest, GetPrecompiledCharsMapTest) {
  {
    // "nmt_nfkc": compatibility characters are folded, case is kept.
    const NormalizerSpec spec =
        SentencePieceTrainer::GetNormalizerSpec("nmt_nfkc");
    const Normalizer normalizer(spec);
    EXPECT_EQ(WS "ABC", normalizer.Normalize("ＡＢＣ"));
    EXPECT_EQ(WS "(株)", normalizer.Normalize("㈱"));
    EXPECT_EQ(WS "グーグル", normalizer.Normalize("ｸﾞｰｸﾞﾙ"));
  }

  {
    // "nfkc_cf": output is additionally lower-cased.
    const NormalizerSpec spec =
        SentencePieceTrainer::GetNormalizerSpec("nfkc_cf");
    const Normalizer normalizer(spec);
    EXPECT_EQ(WS "abc", normalizer.Normalize("ＡＢＣ"));
    EXPECT_EQ(WS "abc", normalizer.Normalize("ABC"));
  }

  {
    // "nmt_nfkc_cf": same expectations as "nfkc_cf" for these inputs.
    const NormalizerSpec spec =
        SentencePieceTrainer::GetNormalizerSpec("nmt_nfkc_cf");
    const Normalizer normalizer(spec);
    EXPECT_EQ(WS "abc", normalizer.Normalize("ＡＢＣ"));
    EXPECT_EQ(WS "abc", normalizer.Normalize("ABC"));
  }

  {
    // "identity": no precompiled map, so the text comes back unchanged apart
    // from the WS prefix.
    const NormalizerSpec spec =
        SentencePieceTrainer::GetNormalizerSpec("identity");
    EXPECT_TRUE(spec.precompiled_charsmap().empty());
    const Normalizer normalizer(spec);
    EXPECT_EQ(WS "ＡＢＣ", normalizer.Normalize("ＡＢＣ"));
    EXPECT_EQ(WS "㈱", normalizer.Normalize("㈱"));
    EXPECT_EQ(WS "ｸﾞｰｸﾞﾙ", normalizer.Normalize("ｸﾞｰｸﾞﾙ"));
  }
}
TEST(BuilderTest, CompileCharsMap) { | |
Builder::CharsMap chars_map; | |
// Lowercase => Uppercase | |
for (char32 lc = static_cast<char32>('a'); lc <= static_cast<char32>('z'); | |
++lc) { | |
const char32 uc = lc + 'A' - 'a'; | |
chars_map[{lc}] = {uc}; | |
} | |
// γγγ => abc | |
chars_map[{0x3042, 0x3044, 0x3046}] = {0x0061, 0x0062, 0x0063}; | |
// γγ => remove | |
chars_map[{0x3048, 0x304A}] = {}; | |
NormalizerSpec spec; | |
EXPECT_TRUE( | |
Builder::CompileCharsMap(chars_map, spec.mutable_precompiled_charsmap()) | |
.ok()); | |
Builder::CharsMap decompiled_chars_map; | |
EXPECT_TRUE(Builder::DecompileCharsMap(spec.precompiled_charsmap(), | |
&decompiled_chars_map) | |
.ok()); | |
EXPECT_EQ(chars_map, decompiled_chars_map); | |
spec.set_add_dummy_prefix(false); | |
const Normalizer normalizer(spec); | |
EXPECT_EQ("ABC", normalizer.Normalize("abc")); | |
EXPECT_EQ("ABC", normalizer.Normalize("ABC")); | |
EXPECT_EQ("XY" WS "Z", normalizer.Normalize("xy z")); | |
EXPECT_EQ("γ", normalizer.Normalize("γ")); | |
EXPECT_EQ("abc", normalizer.Normalize("γγγ")); | |
EXPECT_EQ("abcγ", normalizer.Normalize("γγγγ")); | |
EXPECT_EQ("ABCabcD", normalizer.Normalize("abcγγγd")); | |
EXPECT_EQ("abcγ", normalizer.Normalize("γγγγγγ")); | |
} | |
static constexpr char kTestInputData[] = "nfkc.tsv"; | |
TEST(BuilderTest, LoadCharsMapTest) { | |
Builder::CharsMap chars_map; | |
ASSERT_TRUE( | |
Builder::LoadCharsMap( | |
util::JoinPath(::testing::SrcDir(), kTestInputData), | |
&chars_map) | |
.ok()); | |
std::string precompiled, expected; | |
ASSERT_TRUE(Builder::CompileCharsMap(chars_map, &precompiled).ok()); | |
// Round-trip. | |
Builder::CharsMap decompiled_chars_map; | |
ASSERT_TRUE( | |
Builder::DecompileCharsMap(precompiled, &decompiled_chars_map).ok()); | |
EXPECT_EQ(chars_map, decompiled_chars_map); | |
ASSERT_TRUE( | |
Builder::SaveCharsMap( | |
util::JoinPath(::testing::TempDir(), "output.tsv"), | |
chars_map) | |
.ok()); | |
Builder::CharsMap saved_chars_map; | |
ASSERT_TRUE( | |
Builder::LoadCharsMap( | |
util::JoinPath(::testing::TempDir(), "output.tsv"), | |
&saved_chars_map) | |
.ok()); | |
EXPECT_EQ(chars_map, saved_chars_map); | |
Builder::CharsMap nfkc_map; | |
ASSERT_TRUE(Builder::BuildNFKCMap(&nfkc_map).ok()); | |
ASSERT_TRUE(Builder::CompileCharsMap(nfkc_map, &expected).ok()); | |
} | |
// Writes a TSV file whose rows have a missing or empty target column, loads
// it, and checks that those rows become rules with an empty target. Then
// saves the map and loads it back, expecting an identical map.
TEST(BuilderTest, LoadCharsMapWithEmptyeTest) {
  const auto input_path = util::JoinPath(::testing::TempDir(), "test.tsv");
  {
    // The writer lives only inside this scope, so it is destroyed before the
    // file is read back below.
    auto writer = filesystem::NewWritableFile(input_path);
    writer->WriteLine("0061\t0041");         // source and target
    writer->WriteLine("0062");               // source only
    writer->WriteLine("0063\t\t#foo=>bar");  // empty target + third column
  }

  Builder::CharsMap loaded;
  EXPECT_TRUE(Builder::LoadCharsMap(input_path, &loaded).ok());

  const std::vector<char32> no_target;
  EXPECT_EQ(3, loaded.size());
  EXPECT_EQ(std::vector<char32>({0x0041}), loaded[{0x0061}]);
  EXPECT_EQ(no_target, loaded[{0x0062}]);
  EXPECT_EQ(no_target, loaded[{0x0063}]);

  // Save -> load round trip.
  const auto resaved_path =
      util::JoinPath(::testing::TempDir(), "test_out.tsv");
  EXPECT_TRUE(Builder::SaveCharsMap(resaved_path, loaded).ok());

  Builder::CharsMap reloaded;
  EXPECT_TRUE(Builder::LoadCharsMap(resaved_path, &reloaded).ok());
  EXPECT_EQ(loaded, reloaded);
}
// Registers 100 rules whose sources are "a", "aa", "aaa", ... (each one a
// prefix of the next) and expects Builder::CompileCharsMap() to fail.
TEST(BuilderTest, ContainsTooManySharedPrefixTest) {
  constexpr int kNumRules = 100;

  Builder::CharsMap rules;
  std::vector<char32> source;
  while (static_cast<int>(source.size()) < kNumRules) {
    // Extend the source by one more 'a' and register it as a new rule.
    source.push_back('a');
    rules[source] = {'b'};
  }

  std::string compiled;
  const auto status = Builder::CompileCharsMap(rules, &compiled);
  EXPECT_FALSE(status.ok());
}
} // namespace normalizer | |
} // namespace sentencepiece | |