File size: 2,083 Bytes
d66c48f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import re
from utils.g2p.japanese import japanese_to_ipa
from utils.g2p.mandarin import chinese_to_ipa
from utils.g2p.english import english_to_ipa
from utils.g2p.french import french_to_ipa
from utils.g2p.korean import korean_to_ipa
from utils.g2p.german import german_to_ipa

# One regex per supported language tag. Each matches a span enclosed by a
# pair of identical tags (e.g. "[EN]...[EN]") and captures the enclosed text
# in group 1.
patterns = [
    rf"\[{tag}\](.*?)\[{tag}\]" for tag in ("EN", "ZH", "JA", "FR", "KR", "DE")
]


def cje_cleaners(text):
    """Convert every language-tagged span in ``text`` and join the results.

    Spans are located with the module-level ``patterns``, visited in the
    order they start in ``text``, and each one (tags included) is passed to
    ``clean_one``. Text that is not inside a matched span does not appear in
    the result.

    Returns the concatenation of the per-span results, or ``""`` when no
    span matches.
    """
    hits = [hit for regex in patterns for hit in re.finditer(regex, text)]
    # Hits were gathered pattern by pattern; put them back in text order.
    in_order = sorted(hits, key=lambda hit: hit.start())
    return "".join(clean_one(hit.group(0)) for hit in in_order)


def clean_one(text):
    """Replace tagged spans in ``text`` with converter output and end it
    with punctuation.

    For each tag, in the order ZH, JA, EN, FR, KR, DE, every
    ``[TAG]...[TAG]`` span is replaced by the result of the corresponding
    ``*_to_ipa`` converter applied to the enclosed text, followed by a single
    space. Trailing whitespace is then removed, and a ``.`` is appended
    unless the text is empty or already ends with one of ``. , ! ? - … ~``.
    """
    # (marker to look for, span regex, replacement callback), in the order
    # the substitutions are applied.
    rules = (
        ("[ZH]", r"\[ZH\](.*?)\[ZH\]", lambda m: chinese_to_ipa(m.group(1)) + " "),
        ("[JA]", r"\[JA\](.*?)\[JA\]", lambda m: japanese_to_ipa(m.group(1)) + " "),
        ("[EN]", r"\[EN\](.*?)\[EN\]", lambda m: english_to_ipa(m.group(1)) + " "),
        ("[FR]", r"\[FR\](.*?)\[FR\]", lambda m: french_to_ipa(m.group(1)) + " "),
        ("[KR]", r"\[KR\](.*?)\[KR\]", lambda m: korean_to_ipa(m.group(1)) + " "),
        ("[DE]", r"\[DE\](.*?)\[DE\]", lambda m: german_to_ipa(m.group(1)) + " "),
    )
    for marker, span_regex, to_phones in rules:
        # Skip the regex pass entirely when the tag does not occur.
        if marker in text:
            text = re.sub(span_regex, to_phones, text)

    trimmed = re.sub(r"\s+$", "", text)
    # Append a final "." only when the last character is not already one of
    # the accepted closing punctuation marks.
    return re.sub(r"([^\.,!\?\-…~])$", r"\1.", trimmed)