Spaces:
Sleeping
Sleeping
// Copyright 2016 Google Inc. | |
// | |
// Licensed under the Apache License, Version 2.0 (the "License"); | |
// you may not use this file except in compliance with the License. | |
// You may obtain a copy of the License at | |
// | |
// http://www.apache.org/licenses/LICENSE-2.0 | |
// | |
// Unless required by applicable law or agreed to in writing, software | |
// distributed under the License is distributed on an "AS IS" BASIS, | |
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
// See the License for the specific language governing permissions and | |
// limitations under the License.! | |
namespace sentencepiece { | |
namespace pretokenizer { | |
class MockPretokenizer : public PretokenizerForTrainingInterface { | |
public: | |
MockPretokenizer() {} | |
~MockPretokenizer() {} | |
SentencePieceText Tokenize(absl::string_view text) const override { | |
return spt_; | |
} | |
util::Status status() const override { return util::OkStatus(); } | |
void SetOutput(const SentencePieceText &spt) { spt_ = spt; } | |
private: | |
SentencePieceText spt_; | |
}; | |
TEST(PretokenizerForTrainingTest, BaseTest) { | |
MockPretokenizer mock; | |
{ | |
SentencePieceText spt; | |
spt.set_text("I love sentencepiece"); | |
auto *p1 = spt.add_pieces(); | |
p1->set_surface("I"); | |
p1->set_begin(0); | |
p1->set_end(1); | |
auto *p2 = spt.add_pieces(); | |
p2->set_surface("love"); | |
p2->set_begin(2); | |
p2->set_end(6); | |
auto *p3 = spt.add_pieces(); | |
p3->set_surface("sentence"); | |
p3->set_begin(7); | |
p3->set_end(15); | |
auto *p4 = spt.add_pieces(); | |
p4->set_surface("piece"); | |
p4->set_begin(15); | |
p4->set_end(20); | |
mock.SetOutput(spt); | |
const auto expected = | |
absl::StrCat("I", TrainerInterface::kWSStr, "love", | |
TrainerInterface::kWSStr, "sentence||||piece"); | |
EXPECT_EQ(expected, | |
absl::StrJoin(mock.PreTokenize("I love sentencepiece"), "||||")); | |
} | |
{ | |
SentencePieceText spt; | |
spt.set_text("γγγ―γγ³γ§γ"); | |
auto *p1 = spt.add_pieces(); | |
p1->set_surface("γγ"); | |
p1->set_begin(0); | |
p1->set_end(6); | |
auto *p2 = spt.add_pieces(); | |
p2->set_surface("γ―"); | |
p2->set_begin(6); | |
p2->set_end(9); | |
auto *p3 = spt.add_pieces(); | |
p3->set_surface("γγ³"); | |
p3->set_begin(9); | |
p3->set_end(15); | |
auto *p4 = spt.add_pieces(); | |
p4->set_surface("γ§γ"); | |
p4->set_begin(15); | |
p4->set_end(21); | |
mock.SetOutput(spt); | |
const auto expected = "γγ||||γ―||||γγ³||||γ§γ"; | |
EXPECT_EQ(expected, | |
absl::StrJoin(mock.PreTokenize("γγγ―γγ³γ§γ"), "||||")); | |
} | |
} | |
} // namespace pretokenizer | |
} // namespace sentencepiece | |