File size: 5,997 Bytes
d66c48f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import re
import jieba
import cn2an

"""

    Text clean time

"""
# List of (compiled Latin-letter pattern, bopomofo spelling) pairs.
# Each pattern matches one letter case-insensitively; the spelling is the
# letter's name written in bopomofo (U+3105-U+3129) followed by tone marks
# (U+02C9 first tone, U+02CA second, U+02C7 third, U+02CB fourth).
# The replacement strings must stay real bopomofo characters: the later
# bopomofo -> IPA table can only translate those code points.
_latin_to_bopomofo = [
    (re.compile(letter, re.IGNORECASE), spelling)
    for letter, spelling in [
        ("a", "ㄟˉ"),
        ("b", "ㄅㄧˋ"),
        ("c", "ㄙㄧˉ"),
        ("d", "ㄉㄧˋ"),
        ("e", "ㄧˋ"),
        ("f", "ㄝˊㄈㄨˋ"),
        ("g", "ㄐㄧˋ"),
        ("h", "ㄝˇㄑㄩˋ"),
        ("i", "ㄞˋ"),
        ("j", "ㄐㄟˋ"),
        ("k", "ㄎㄟˋ"),
        ("l", "ㄝˊㄛˋ"),
        ("m", "ㄝˊㄇㄨˋ"),
        ("n", "ㄣˉ"),
        ("o", "ㄡˉ"),
        ("p", "ㄆㄧˉ"),
        ("q", "ㄎㄧㄡˉ"),
        ("r", "ㄚˋ"),
        ("s", "ㄝˊㄙˋ"),
        ("t", "ㄊㄧˋ"),
        ("u", "ㄧㄡˉ"),
        ("v", "ㄨㄧˉ"),
        ("w", "ㄉㄚˋㄅㄨˋㄌㄧㄡˋ"),
        ("x", "ㄝˉㄎㄨˋㄙˋ"),
        ("y", "ㄨㄞˋ"),
        ("z", "ㄗㄟˋ"),
    ]
]

# List of (compiled bopomofo pattern, IPA replacement) pairs.
# bopomofo_to_ipa applies the pairs one after another in list order, so the
# multi-character finals come first and the single symbols that are their
# prefixes or suffixes come later. Tone marks are rewritten last, as arrows
# followed by the "|" separator.
# Keys must be real bopomofo letters (U+3105-U+3129) or tone marks
# (U+02C9, U+02CA, U+02C7, U+02CB, U+02D9); each key appears exactly once.
_bopomofo_to_ipa = [
    (re.compile(symbol), ipa)
    for symbol, ipa in [
        ("ㄅㄛ", "p⁼wo"),
        ("ㄆㄛ", "pʰwo"),
        ("ㄇㄛ", "mwo"),
        ("ㄈㄛ", "fwo"),
        ("ㄧㄢ", "|jɛn"),
        ("ㄩㄢ", "|ɥæn"),
        ("ㄧㄣ", "|in"),
        ("ㄩㄣ", "|ɥn"),
        ("ㄧㄥ", "|iŋ"),
        ("ㄨㄥ", "|ʊŋ"),
        ("ㄩㄥ", "|jʊŋ"),
        # Add
        ("ㄧㄚ", "|ia"),
        ("ㄧㄝ", "|iɛ"),
        ("ㄧㄠ", "|iɑʊ"),
        ("ㄧㄡ", "|ioʊ"),
        ("ㄧㄤ", "|iɑŋ"),
        ("ㄨㄚ", "|ua"),
        ("ㄨㄛ", "|uo"),
        ("ㄨㄞ", "|uaɪ"),
        ("ㄨㄟ", "|ueɪ"),
        ("ㄨㄢ", "|uan"),
        ("ㄨㄣ", "|uən"),
        ("ㄨㄤ", "|uɑŋ"),
        ("ㄩㄝ", "|ɥɛ"),
        # End
        ("ㄅ", "p⁼"),
        ("ㄆ", "pʰ"),
        ("ㄇ", "m"),
        ("ㄈ", "f"),
        ("ㄉ", "t⁼"),
        ("ㄊ", "tʰ"),
        ("ㄋ", "n"),
        ("ㄌ", "l"),
        ("ㄍ", "k⁼"),
        ("ㄎ", "kʰ"),
        ("ㄏ", "x"),
        ("ㄐ", "tʃ⁼"),
        ("ㄑ", "tʃʰ"),
        ("ㄒ", "ʃ"),
        ("ㄓ", "ts`⁼"),
        ("ㄔ", "ts`ʰ"),
        ("ㄕ", "s`"),
        ("ㄖ", "ɹ`"),
        ("ㄗ", "ts⁼"),
        ("ㄘ", "tsʰ"),
        ("ㄙ", "|s"),
        ("ㄚ", "|a"),
        ("ㄛ", "|o"),
        ("ㄜ", "|ə"),
        ("ㄝ", "|ɛ"),
        ("ㄞ", "|aɪ"),
        ("ㄟ", "|eɪ"),
        ("ㄠ", "|ɑʊ"),
        ("ㄡ", "|oʊ"),
        ("ㄢ", "|an"),
        ("ㄣ", "|ən"),
        ("ㄤ", "|ɑŋ"),
        ("ㄥ", "|əŋ"),
        ("ㄦ", "əɹ"),
        ("ㄧ", "|i"),
        ("ㄨ", "|u"),
        ("ㄩ", "|ɥ"),
        ("ˉ", "→|"),
        ("ˊ", "↑|"),
        ("ˇ", "↓↑|"),
        ("ˋ", "↓|"),
        ("˙", "|"),
    ]
]


# Convert numbers to Chinese pronunciation
def number_to_chinese(text):
    """Return ``text`` with its numbers rewritten as Chinese numerals.

    The whole string is handed to ``cn2an.transform`` in ``"an2cn"`` mode,
    which locates the numbers itself; no manual regex scan is needed.

    Args:
        text: Input sentence, possibly containing Arabic numerals.

    Returns:
        The string returned by ``cn2an.transform``.
    """
    return cn2an.transform(text, "an2cn")


# Single-character punctuation folded to the form the rest of the pipeline
# keeps. Written as escapes so the mapping cannot be damaged by a lossy
# re-encoding of this file.
_PUNCTUATION_TABLE = str.maketrans(
    {
        "\uff0c": ",",  # fullwidth comma
        "\u3002": ".",  # ideographic full stop
        "\uff01": "!",  # fullwidth exclamation mark
        "\uff1f": "?",  # fullwidth question mark
        "\uff1b": ";",  # fullwidth semicolon
        "\uff1a": ":",  # fullwidth colon
        "\u3001": ",",  # ideographic comma
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark
        "\u22ef": "\u2026",  # midline horizontal ellipsis -> horizontal ellipsis
    }
)

# Three-character spellings of an ellipsis, all folded to U+2026.
_ELLIPSIS_SPELLINGS = (
    "\u00b7\u00b7\u00b7",  # three middle dots
    "\u30fb\u30fb\u30fb",  # three katakana middle dots
    "...",  # three ASCII full stops (also produced from U+3002 above)
)


def normalization(text):
    """Reduce ``text`` to CJK ideographs plus a small ASCII punctuation set.

    Steps, in order:
      1. Map full-width / CJK punctuation to ``, . ! ? ; : '`` and fold the
         various ellipsis spellings to U+2026.
      2. Remove all whitespace.
      3. Drop every character that is not a CJK ideograph
         (U+4E00-U+9FFF), ``_``, or one of the kept punctuation marks.
         Latin letters and digits are removed by this step.

    Args:
        text: Input string.

    Returns:
        The normalized string (possibly empty).
    """
    text = text.translate(_PUNCTUATION_TABLE)
    for spelling in _ELLIPSIS_SPELLINGS:
        text = text.replace(spelling, "\u2026")
    text = re.sub(r"\s+", "", text)
    text = re.sub(r"[^\u4e00-\u9fff\s_,\.\?!;:\'\u2026]", "", text)
    # Whitespace is already gone at this point; kept so punctuation stays
    # tight even if the steps above are ever reordered.
    text = re.sub(r"\s*([,\.\?!;:\'\u2026])\s*", r"\1", text)
    return text


# Word Segmentation, and convert Chinese pronunciation to pinyin (bopomofo)
def chinese_to_bopomofo(text):
    """Segment ``text`` with jieba and spell each Chinese word in bopomofo.

    Words that contain no CJK ideograph (U+4E00-U+9FFF), such as
    punctuation, are copied to the output unchanged and without a
    separator. Chinese words are converted syllable by syllable with
    ``pypinyin.lazy_pinyin`` in ``BOPOMOFO`` style; syllables are joined
    with ``"|"``, and a ``"|"`` is also placed before the word unless the
    output is still empty.

    Args:
        text: Normalized sentence.

    Returns:
        The bopomofo string with ``"|"`` between syllables.
    """
    from pypinyin import lazy_pinyin, BOPOMOFO

    result = ""
    for word in jieba.lcut(text, cut_all=False):
        if not re.search("[\u4e00-\u9fff]", word):
            result += word
            continue
        # A syllable that still ends in a bopomofo letter carries no tone
        # mark; give it an explicit U+02C9 (first tone) so every syllable
        # ends with a tone symbol. Presumably pypinyin leaves the first tone
        # unmarked in this style -- confirm against its documentation.
        syllables = [
            re.sub(r"([\u3105-\u3129])$", "\\1\u02c9", syllable)
            for syllable in lazy_pinyin(word, BOPOMOFO)
        ]
        if result != "":
            result += "|"
        result += "|".join(syllables)
    return result


# Convert latin pronunciation to pinyin (bopomofo)
def latin_to_bopomofo(text):
    """Replace every Latin letter in ``text`` with its bopomofo spelling.

    The (pattern, spelling) pairs of ``_latin_to_bopomofo`` are applied one
    after another, each to the result of the previous substitution.

    Args:
        text: Input string.

    Returns:
        The string after all substitutions.
    """
    converted = text
    for letter_pattern, spelling in _latin_to_bopomofo:
        converted = letter_pattern.sub(spelling, converted)
    return converted


# Convert pinyin (bopomofo) to IPA
def bopomofo_to_ipa(text):
    """Rewrite the bopomofo symbols and tone marks in ``text`` as IPA.

    The (pattern, ipa) pairs of ``_bopomofo_to_ipa`` are applied strictly in
    list order, each to the output of the previous one, so earlier
    (longer) patterns take precedence over later single-symbol ones.

    Args:
        text: Bopomofo string.

    Returns:
        The string after all substitutions.
    """
    converted = text
    for symbol_pattern, ipa in _bopomofo_to_ipa:
        converted = symbol_pattern.sub(ipa, converted)
    return converted


def _chinese_to_ipa(text):
    """Convert one Chinese sentence to a ``"|"``-separated IPA string.

    Pipeline: strip -> numbers to Chinese numerals -> normalization ->
    bopomofo -> Latin letters to bopomofo -> IPA, followed by regex clean-up
    of the IPA string.

    NOTE(review): ``normalization`` removes everything except CJK
    ideographs, ``_`` and punctuation, so ``latin_to_bopomofo`` looks like a
    no-op at this point in the pipeline -- confirm whether that is intended.

    Args:
        text: A single sentence.

    Returns:
        The IPA transcription without a leading or trailing ``"|"``.
    """
    text = number_to_chinese(text.strip())
    text = normalization(text)
    text = chinese_to_bopomofo(text)
    text = latin_to_bopomofo(text)
    text = bopomofo_to_ipa(text)
    # A retroflex initial (s`, ɹ`, optionally with ⁼ or ʰ) directly followed
    # by a tone arrow or the end of the string has no vowel: insert ɹ.
    text = re.sub(r"([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)", r"\1ɹ\2", text)
    # Same for a plain s (optionally with ⁼ or ʰ).
    text = re.sub(r"([s][⁼ʰ]?)([→↓↑ ]+|$)", r"\1ɹ\2", text)
    # Drop a leading "|" and every character outside the allowed alphabet
    # (word characters, whitespace, punctuation, "|", tone arrows, ⁼ ʰ `).
    text = re.sub(r"^\||[^\w\s_,\.\?!;:\'…\|→↓↑⁼ʰ`]", "", text)
    # Make each punctuation mark its own "|"-delimited token ...
    text = re.sub(r"([,\.\?!;:\'…])", r"|\1|", text)
    # ... then collapse the runs of "|" this creates and trim the tail.
    text = re.sub(r"\|+", "|", text)
    text = text.rstrip("|")
    return text


# Convert Chinese to IPA
def chinese_to_ipa(text, text_tokenizer):
    """Convert a sentence, or an iterable of sentences, to IPA.

    Args:
        text: Either a single ``str`` (including ``str`` subclasses) or an
            iterable of strings.
        text_tokenizer: Unused; kept so existing callers keep working.

    Returns:
        A single IPA string when ``text`` is a string, otherwise a list with
        one IPA string per input sentence.
    """
    # isinstance (rather than an exact type comparison) keeps str subclasses
    # from being iterated character by character.
    if isinstance(text, str):
        return _chinese_to_ipa(text)
    return [_chinese_to_ipa(sentence) for sentence in text]