Spaces:
Sleeping
Sleeping
# Copyright (c) Facebook, Inc. and its affiliates. | |
# All rights reserved. | |
# | |
# This source code is licensed under the license found in the | |
# LICENSE file in the root directory of this source tree. | |
# Remove non-printable characters as per:
# https://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python | |
# | |
# This is meant to be a drop-in replacement for the Moses strip-non-printing-char.perl script.
import functools
import sys
import unicodedata
from typing import Callable, Tuple
@functools.lru_cache(maxsize=1)
def _non_printable_code_points() -> Tuple[int, ...]:
    """Return every code point whose general category is in Unicode group C."""
    # Same as \p{C} in perl.
    # See https://www.unicode.org/reports/tr44/#General_Category_Values
    # unicodedata.category() only returns two-letter codes, so the bare
    # group name "C" never matches and is not listed here.
    other_categories = frozenset({"Cc", "Cf", "Cs", "Co", "Cn"})
    return tuple(
        code_point
        for code_point in range(sys.maxunicode + 1)
        if unicodedata.category(chr(code_point)) in other_categories
    )


def get_replacer(replace_by: str = " ") -> Callable[[str], str]:
    """Build a function that replaces non-printing characters in a string.

    Args:
        replace_by: Text substituted for each non-printing character.

    Returns:
        A function taking a line and returning it with every character of
        Unicode general category Cc, Cf, Cs, Co or Cn replaced by
        ``replace_by``.
    """
    # The full code point scan does not depend on replace_by, so it is
    # cached in the helper and only the translation table is rebuilt here.
    non_printable_map = dict.fromkeys(_non_printable_code_points(), replace_by)

    def replace_non_printing_char(line: str) -> str:
        """Return ``line`` with non-printing characters replaced."""
        return line.translate(non_printable_map)

    return replace_non_printing_char
def test_remove():
    """Check that non-printing characters are replaced one-for-one."""
    replaceby_ = get_replacer("_")
    # Hidden characters are written as escapes so they stay visible and
    # survive editors and copy/paste: a tab, a zero-width space and a BOM.
    assert (
        replaceby_("See what's hidden in your string…\tor be\u200bhind\ufeff")
        == "See what's hidden in your string…_or be_hind_"
    )

    replacebyspace = get_replacer(" ")
    # \x00 and \x11 each become one space, hence two leading spaces.
    assert replacebyspace("\x00\x11Hello\u200bWorld") == "  Hello World"