File size: 3,942 Bytes
d66c48f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
"""https://github.com/bootphon/phonemizer"""

import re

# from g2pkk import G2p
# from jamo import hangul_to_jamo

# Hangul spellings for whole English words and common acronyms.
_ENGLISH_WORDS = {
    "KOREA": "코리아",
    "IDOL": "아이돌",
    "IT": "아이티",
    "IQ": "아이큐",
    "UP": "업",
    "DOWN": "다운",
    "PC": "피씨",
    "CCTV": "씨씨티비",
    "SNS": "에스엔에스",
    "AI": "에이아이",
    "CEO": "씨이오",
}

# Hangul spellings for the names of the individual Latin capital letters.
_LETTER_NAMES = {
    "A": "에이",
    "B": "비",
    "C": "씨",
    "D": "디",
    "E": "이",
    "F": "에프",
    "G": "지",
    "H": "에이치",
    "I": "아이",
    "J": "제이",
    "K": "케이",
    "L": "엘",
    "M": "엠",
    "N": "엔",
    "O": "오",
    "P": "피",
    "Q": "큐",
    "R": "알",
    "S": "에스",
    "T": "티",
    "U": "유",
    "V": "브이",
    "W": "더블유",
    "X": "엑스",
    "Y": "와이",
    "Z": "제트",
}

# Upper-case English token -> Hangul spelling. Every key is upper-case, so
# only tokens written in capitals are found by an exact-match lookup.
english_dictionary = {**_ENGLISH_WORDS, **_LETTER_NAMES}

# List of (jamo, ipa) pairs: (need to update)
_jamo_to_ipa = [
    (re.compile("%s" % x[0]), x[1])
    for x in [
        ("ㅏ", "ɐ"),
        ("ㅑ", "jɐ"),
        ("ㅓ", "ʌ"),
        ("ㅕ", "jʌ"),
        ("ㅗ", "o"),
        ("ㅛ", "jo"),
        ("ᅮ", "u"),
        ("ㅠ", "ju"),
        ("ᅳ", "ɯ"),
        ("ㅣ", "i"),
        ("ㅔ", "e"),
        ("ㅐ", "ɛ"),
        ("ㅖ", "je"),
        ("ㅒ", "jɛ"),  # lost
        ("ㅚ", "we"),
        ("ㅟ", "wi"),
        ("ㅢ", "ɯj"),
        ("ㅘ", "wɐ"),
        ("ㅙ", "wɛ"),  # lost
        ("ㅝ", "wʌ"),
        ("ㅞ", "wɛ"),  # lost
        ("ㄱ", "q"),  # 'ɡ' or 'k'
        ("ㄴ", "n"),
        ("ㄷ", "t"),  # d
        ("ㄹ", "ɫ"),  # 'ᄅ' is 'r', 'ᆯ' is 'ɫ'
        ("ㅁ", "m"),
        ("ㅂ", "p"),
        ("ㅅ", "s"),  # 'ᄉ'is 't', 'ᆺ'is 's'
        ("ㅇ", "ŋ"),  # 'ᄋ' is None, 'ᆼ' is 'ŋ'
        ("ㅈ", "tɕ"),
        ("ㅊ", "tɕʰ"),  # tʃh
        ("ㅋ", "kʰ"),  # kh
        ("ㅌ", "tʰ"),  # th
        ("ㅍ", "pʰ"),  # ph
        ("ㅎ", "h"),
        ("ㄲ", "k*"),  # q
        ("ㄸ", "t*"),  # t
        ("ㅃ", "p*"),  # p
        ("ㅆ", "s*"),  # 'ᄊ' is 's', 'ᆻ' is 't'
        ("ㅉ", "tɕ*"),  # tɕ ?
    ]
]

_special_map = [
    (re.compile("%s" % x[0]), x[1])
    for x in [
        ("ʃ", "ɕ"),
        ("tɕh", "tɕʰ"),
        ("kh", "kʰ"),
        ("th", "tʰ"),
        ("ph", "pʰ"),
    ]
]


def normalize(text):
    """Clean raw input text before phonemization.

    Steps, in order: strip surrounding whitespace, delete every character
    matched by the Han character class below (CJK ideographs, radicals and
    related symbols), replace known English tokens via
    ``normalize_english``, and finally lower-case whatever remains.

    Args:
        text: The input string.

    Returns:
        The normalized string.
    """
    without_han = re.sub(
        "[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text.strip()
    )
    # The dictionary lookup runs before lower-casing because its keys are
    # upper-case.
    return normalize_english(without_han).lower()


def normalize_english(text):
    """Replace English tokens listed in ``english_dictionary`` with Hangul.

    Every maximal run of ASCII letters is looked up exactly as written
    (case-sensitive). Runs that have no entry are left unchanged.

    Args:
        text: The input string.

    Returns:
        The string with known tokens substituted.
    """
    return re.sub(
        "([A-Za-z]+)",
        lambda match: english_dictionary.get(match.group(), match.group()),
        text,
    )


def jamo_to_ipa(text):
    """Convert jamo to IPA.

    Each element of ``text`` is run through every (pattern, replacement)
    pair of ``_jamo_to_ipa`` in table order, and the results are
    concatenated.

    Args:
        text: An iterable of strings (a string, or a list of jamo).

    Returns:
        The concatenated IPA string.
    """
    pieces = []
    for symbol in text:
        for pattern, ipa in _jamo_to_ipa:
            symbol = pattern.sub(ipa, symbol)
        pieces.append(symbol)
    return "".join(pieces)


def special_map(text):
    """Apply every ``_special_map`` substitution to ``text``, in table order.

    Args:
        text: The IPA string to post-process.

    Returns:
        The string after all substitutions have been applied.
    """
    result = text
    for pattern, replacement in _special_map:
        result = pattern.sub(replacement, result)
    return result


def korean_to_ipa(text):
    """Convert Korean text to IPA with phonemizer's espeak backend.

    The text is cleaned with ``normalize``, phonemized with language
    ``"ko"``, and the result is post-processed with ``special_map``.

    Args:
        text: The input string.

    Returns:
        The value returned by ``special_map`` for the phonemizer output.
    """
    cleaned = normalize(text)

    # espeak-ng. phonemizer is imported here rather than at module level, so
    # importing this module does not itself require phonemizer.
    from phonemizer import phonemize
    from phonemizer.separator import Separator

    # Words are separated by a space and syllables by "|"; no phone separator.
    separator = Separator(phone=None, word=" ", syllable="|")
    phonemes = phonemize(
        cleaned,
        language="ko",
        backend="espeak",
        separator=separator,
        strip=True,
        preserve_punctuation=True,
        njobs=4,
    )

    # A disabled alternative pipeline went through g2pkk's G2p, then
    # jamo.hangul_to_jamo (e.g. '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']),
    # then jamo_to_ipa.
    return special_map(phonemes)