File size: 1,270 Bytes
05d3571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Remove non-printable characters, as per:
#  https://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python
#
# This is meant to be a drop-in replacement for the Moses strip-non-printing-char.perl script.

import sys
import unicodedata
from typing import Callable


def get_replacer(replace_by: str = " ") -> Callable[[str], str]:
    """Build a function that replaces non-printing characters in a string.

    A character counts as non-printing when its Unicode general category
    belongs to the "Other" group (Cc, Cf, Cs, Co or Cn). This includes
    ordinary control characters such as tab and newline.

    The translation table is built eagerly by scanning every code point up
    to ``sys.maxunicode``, so build the replacer once and reuse it rather
    than calling ``get_replacer`` per line.

    Args:
        replace_by: Text substituted for each non-printing character.

    Returns:
        A function that takes a string and returns a copy in which every
        non-printing character has been replaced by ``replace_by``.
    """
    # Same as \p{C} in perl;
    # see https://www.unicode.org/reports/tr44/#General_Category_Values
    # unicodedata.category() reports two-letter subcategories, so the bare
    # major class "C" is not listed here.
    other_categories = {"Cc", "Cf", "Cs", "Co", "Cn"}

    # str.translate() expects a mapping keyed by integer code points.
    non_printable_map = {
        code_point: replace_by
        for code_point in range(sys.maxunicode + 1)
        if unicodedata.category(chr(code_point)) in other_categories
    }

    def replace_non_printing_char(line: str) -> str:
        return line.translate(non_printable_map)

    return replace_non_printing_char


def test_remove():
    """Check that non-printing characters are replaced and other text is kept."""
    with_underscore = get_replacer("_")

    # The invisible characters are written as escapes so they stay readable
    # and cannot be silently stripped by editors or tooling:
    # a tab, U+200B ZERO WIDTH SPACE, and a trailing U+FEFF.
    hidden = "See what's hidden in your string…\tor be\u200bhind\ufeff"
    assert with_underscore(hidden) == "See what's hidden in your string…_or be_hind_"

    with_space = get_replacer(" ")

    assert with_space("\x00\x11Hello\u200bWorld") == "  Hello World"