# Source: P2A-test-NV/laser/utils/src/remove_non_printing_char.py
# Uploaded by: KuangDW
# Commit: "Add laser2.spm using Git LFS" (05d3571)
#
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# Remove non-printable characters, as per:
# https://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python
#
# This is intended as a drop-in replacement for Moses' strip-non-printing-char.perl
import sys
import unicodedata
from typing import Callable
class _NonPrintingCharTable(dict):
    """Lazily filled ``str.translate`` table for one replacement string.

    ``str.translate`` looks characters up by code point via ``__getitem__``.
    The first time a code point is seen, ``__missing__`` classifies it and
    stores the answer, so later lookups are plain dict hits. This avoids
    scanning the whole Unicode range up front.
    """

    def __init__(self, replace_by: str) -> None:
        super().__init__()
        self._replace_by = replace_by

    def __missing__(self, codepoint: int):
        """Classify *codepoint*, cache the result, and return it.

        Returns the replacement string for non-printing characters, or the
        code point itself (which ``str.translate`` maps back to the same
        character) for everything else.
        """
        # same as \p{C} in perl
        # see https://www.unicode.org/reports/tr44/#General_Category_Values
        # unicodedata.category() only returns two-letter values; the "Other"
        # group (Cc, Cf, Cs, Co, Cn) is exactly the set starting with "C".
        if unicodedata.category(chr(codepoint)).startswith("C"):
            target = self._replace_by
        else:
            # Must not be None: str.translate treats None as "delete".
            target = codepoint
        self[codepoint] = target
        return target


def get_replacer(replace_by: str = " ") -> Callable[[str], str]:
    """Build a function that replaces non-printing characters in a string.

    A character counts as non-printing when its Unicode general category is
    one of Cc, Cf, Cs, Co or Cn.

    Args:
        replace_by: Text substituted for every non-printing character. May be
            empty (the character is dropped) or longer than one character.

    Returns:
        A function taking a string and returning a copy in which each
        non-printing character has been replaced by ``replace_by``.
    """
    non_printable_map = _NonPrintingCharTable(replace_by)

    def replace_non_printing_char(line) -> str:
        return line.translate(non_printable_map)

    return replace_non_printing_char
def test_remove():
    """Check get_replacer() on a sentence with hidden characters and on raw control characters."""
    replaceby_ = get_replacer("_")
    # The invisible characters are written as escapes so that editors,
    # formatters or copy/paste cannot silently drop or normalise them.
    # NOTE(review): tab / zero-width space / BOM are inferred from the three
    # "_" in the expected output and the Stack Overflow sample - confirm.
    hidden = "See what's hidden in your string…\tor be\u200bhind\ufeff"
    assert replaceby_(hidden) == "See what's hidden in your string…_or be_hind_"

    replacebyspace = get_replacer(" ")
    # \x00 and \x11 are each replaced, hence two leading spaces.
    assert replacebyspace("\x00\x11Hello\u200bWorld") == "  Hello World"