// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
using sentencepiece::normalizer::Builder; | |
ABSL_FLAG(bool, output_precompiled_header, false, | |
"make normalization_rule.h file"); | |
namespace sentencepiece {
namespace {
// Serializes the second element of every pair in `data` as a comma-separated
// list of 64-bit hexadecimal literals (e.g. "0x0123456789ABCDEF, "), eight
// literals per line, suitable for use inside a C++ array initializer.
//
// Every blob starts on a fresh 64-bit word, and its last word is zero-padded
// when the blob size is not a multiple of sizeof(unsigned long long). Bytes
// are copied into each word in memory order, so the printed value depends on
// the byte order of the machine running this tool.
//
// For each blob, the index of its first word within the emitted list is
// appended to `*offset` (an empty blob still gets an entry). `offset` must
// not be null.
std::string ToHexUInt64Array(
    const std::vector<std::pair<std::string, std::string>> &data,
    std::vector<size_t> *offset) {
  std::stringstream os;
  // Fixed-width, zero-filled, upper-case hex without an automatic "0x" prefix;
  // the prefix is written explicitly below.
  os.setf(std::ios_base::hex, std::ios_base::basefield);
  os.setf(std::ios_base::uppercase);
  os.setf(std::ios_base::right);
  os.fill('0');
  os.unsetf(std::ios_base::showbase);

  size_t num = 0;  // Number of 64-bit words emitted so far.
  for (const auto &p : data) {
    const char *begin = p.second.data();
    const char *end = begin + p.second.size();
    offset->push_back(num);
    while (begin < end) {
      unsigned long long int n = 0;
      unsigned char *buf = reinterpret_cast<unsigned char *>(&n);
      const size_t size = std::min<size_t>(end - begin, sizeof(n));
      for (size_t i = 0; i < size; ++i) {
        buf[i] = static_cast<unsigned char>(begin[i]);
      }
      // Advance only by the bytes actually consumed: stepping a full word on
      // the final partial chunk would form a pointer beyond one-past-the-end
      // of the blob, which is undefined behaviour.
      begin += size;
      os << "0x" << std::setw(2 * sizeof(n)) << n << ", ";
      if (++num % 8 == 0) {
        os << "\n";
      }
    }
  }
  return os.str();
}
// Renders `data` as one or more adjacent C++ string literals built from
// "\xHH" escapes. Each literal holds at most kNumOfBytesOnOneLine bytes and
// sits on its own line; the result always ends with a newline.
//
// Empty input yields an empty literal (`""`), so the output is always a
// well-formed string-literal sequence.
std::string ToHexData(absl::string_view data) {
  constexpr char kHex[] = "0123456789ABCDEF";
  constexpr size_t kNumOfBytesOnOneLine = 20;

  std::stringstream os;
  // Open the first literal unconditionally so that the closing quote written
  // at the end is always balanced, even when `data` is empty.
  os << "\"";
  for (size_t i = 0; i < data.size(); ++i) {
    if (i > 0 && i % kNumOfBytesOnOneLine == 0) {
      // Close the full line and start a new literal on the next one.
      os << "\"\n\"";
    }
    // Go through unsigned char so bytes >= 0x80 index kHex correctly
    // regardless of whether plain char is signed.
    const unsigned char byte = static_cast<unsigned char>(data[i]);
    os << "\\x" << kHex[byte >> 4] << kHex[byte & 0x0F];
  }
  os << "\"\n";
  return os.str();
}
std::string MakeHeader( | |
const std::vector<std::pair<std::string, std::string>> &data) { | |
constexpr char kHeader[] = | |
R"(#ifndef NORMALIZATION_RULE_H_ | |
#define NORMALIZATION_RULE_H_ | |
#include <cstdio> | |
namespace sentencepiece { | |
namespace { | |
struct BinaryBlob { | |
const char *name; | |
size_t size; | |
const char *data; | |
}; | |
)"; | |
constexpr char kFooter[] = R"( | |
} // namespace | |
} // namespace sentencepiece | |
#endif // NORMALIZATION_RULE_H_ | |
)"; | |
std::stringstream os; | |
os << kHeader; | |
os << "#if defined(_WIN32) && !defined(__CYGWIN__)\n"; | |
os << "constexpr unsigned long long int kNormalizationRules_blob_uint64[] = " | |
"{\n"; | |
std::vector<size_t> offset; | |
os << ToHexUInt64Array(data, &offset); | |
CHECK_EQ(offset.size(), data.size()); | |
os << "};\n\n"; | |
os << "const BinaryBlob kNormalizationRules_blob[] = {\n"; | |
for (size_t i = 0; i < data.size(); ++i) { | |
os << "{ \"" << data[i].first << "\", " << data[i].second.size() << ", "; | |
os << "reinterpret_cast<const char *>(kNormalizationRules_blob_uint64 + " | |
<< offset[i] << ") },\n"; | |
} | |
os << "};\n"; | |
os << "#else\n"; | |
os << "constexpr BinaryBlob kNormalizationRules_blob[] = {\n"; | |
for (size_t i = 0; i < data.size(); ++i) { | |
os << "{ \"" << data[i].first << "\", " << data[i].second.size() << ", "; | |
os << ToHexData(data[i].second) << "},\n"; | |
} | |
os << "};\n"; | |
os << "#endif\n"; | |
os << "constexpr size_t kNormalizationRules_size = " << data.size() << ";\n"; | |
os << kFooter; | |
return os.str(); | |
} | |
}  // namespace
}  // namespace sentencepiece
int main(int argc, char **argv) { | |
sentencepiece::ScopedResourceDestructor cleaner; | |
sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); | |
const std::vector<std::pair< | |
std::string, | |
std::function<sentencepiece::util::Status(Builder::CharsMap *)>>> | |
kRuleList = {{"nfkc", Builder::BuildNFKCMap}, | |
{"nmt_nfkc", Builder::BuildNmtNFKCMap}, | |
{"nfkc_cf", Builder::BuildNFKC_CFMap}, | |
{"nmt_nfkc_cf", Builder::BuildNmtNFKC_CFMap}, | |
{"nfkd", Builder::BuildNFKDMap}, | |
{"nfc", Builder::BuildNFCMap}, | |
{"nfd", Builder::BuildNFDMap}, | |
{"nfkd_cf", Builder::BuildNFKD_CFMap}, | |
{"nfc_cf", Builder::BuildNFC_CFMap}, | |
{"nfd_cf", Builder::BuildNFD_CFMap}}; | |
std::vector<std::pair<std::string, std::string>> data; | |
for (const auto &p : kRuleList) { | |
Builder::CharsMap normalized_map; | |
CHECK_OK(p.second(&normalized_map)); | |
// Write Header. | |
std::string index; | |
CHECK_OK(Builder::CompileCharsMap(normalized_map, &index)); | |
// Write TSV file. | |
CHECK_OK(Builder::SaveCharsMap(p.first + ".tsv", normalized_map)); | |
// Do not make NFKD map as it is optionally created. | |
if (p.first == "nfkd" || p.first == "nfd" || p.first == "nfc" || | |
p.first == "nfkd_cf" || p.first == "nfd_cf" || p.first == "nfc_cf") { | |
continue; | |
} | |
data.emplace_back(p.first, index); | |
} | |
if (absl::GetFlag(FLAGS_output_precompiled_header)) { | |
constexpr char kPrecompiledHeaderFileName[] = "normalization_rule.h"; | |
auto output = | |
sentencepiece::filesystem::NewWritableFile(kPrecompiledHeaderFileName); | |
CHECK_OK(output->status()); | |
output->Write(sentencepiece::MakeHeader(data)); | |
} | |
return 0; | |
} | |