naonauno's picture
Upload 855 files
d66c48f verified
"""https://github.com/bootphon/phonemizer"""
import re
# from g2pkk import G2p
# from jamo import hangul_to_jamo
english_dictionary = {
"KOREA": "코리아",
"IDOL": "아이돌",
"IT": "아이티",
"IQ": "아이큐",
"UP": "업",
"DOWN": "다운",
"PC": "피씨",
"CCTV": "씨씨티비",
"SNS": "에스엔에스",
"AI": "에이아이",
"CEO": "씨이오",
"A": "에이",
"B": "비",
"C": "씨",
"D": "디",
"E": "이",
"F": "에프",
"G": "지",
"H": "에이치",
"I": "아이",
"J": "제이",
"K": "케이",
"L": "엘",
"M": "엠",
"N": "엔",
"O": "오",
"P": "피",
"Q": "큐",
"R": "알",
"S": "에스",
"T": "티",
"U": "유",
"V": "브이",
"W": "더블유",
"X": "엑스",
"Y": "와이",
"Z": "제트",
}
# List of (jamo, ipa) pairs: (need to update)
_jamo_to_ipa = [
(re.compile("%s" % x[0]), x[1])
for x in [
("ㅏ", "ɐ"),
("ㅑ", "jɐ"),
("ㅓ", "ʌ"),
("ㅕ", "jʌ"),
("ㅗ", "o"),
("ㅛ", "jo"),
("ᅮ", "u"),
("ㅠ", "ju"),
("ᅳ", "ɯ"),
("ㅣ", "i"),
("ㅔ", "e"),
("ㅐ", "ɛ"),
("ㅖ", "je"),
("ㅒ", "jɛ"), # lost
("ㅚ", "we"),
("ㅟ", "wi"),
("ㅢ", "ɯj"),
("ㅘ", "wɐ"),
("ㅙ", "wɛ"), # lost
("ㅝ", "wʌ"),
("ㅞ", "wɛ"), # lost
("ㄱ", "q"), # 'ɡ' or 'k'
("ㄴ", "n"),
("ㄷ", "t"), # d
("ㄹ", "ɫ"), # 'ᄅ' is 'r', 'ᆯ' is 'ɫ'
("ㅁ", "m"),
("ㅂ", "p"),
("ㅅ", "s"), # 'ᄉ'is 't', 'ᆺ'is 's'
("ㅇ", "ŋ"), # 'ᄋ' is None, 'ᆼ' is 'ŋ'
("ㅈ", "tɕ"),
("ㅊ", "tɕʰ"), # tʃh
("ㅋ", "kʰ"), # kh
("ㅌ", "tʰ"), # th
("ㅍ", "pʰ"), # ph
("ㅎ", "h"),
("ㄲ", "k*"), # q
("ㄸ", "t*"), # t
("ㅃ", "p*"), # p
("ㅆ", "s*"), # 'ᄊ' is 's', 'ᆻ' is 't'
("ㅉ", "tɕ*"), # tɕ ?
]
]
_special_map = [
(re.compile("%s" % x[0]), x[1])
for x in [
("ʃ", "ɕ"),
("tɕh", "tɕʰ"),
("kh", "kʰ"),
("th", "tʰ"),
("ph", "pʰ"),
]
]
def normalize(text):
text = text.strip()
text = re.sub(
"[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text
)
text = normalize_english(text)
text = text.lower()
return text
def normalize_english(text):
def fn(m):
word = m.group()
if word in english_dictionary:
return english_dictionary.get(word)
return word
text = re.sub("([A-Za-z]+)", fn, text)
return text
# Convert jamo to IPA
def jamo_to_ipa(text):
res = ""
for t in text:
for regex, replacement in _jamo_to_ipa:
t = re.sub(regex, replacement, t)
res += t
return res
# special map
def special_map(text):
for regex, replacement in _special_map:
text = re.sub(regex, replacement, text)
return text
def korean_to_ipa(text):
text = normalize(text)
# espeak-ng
from phonemizer import phonemize
from phonemizer.separator import Separator
ipa = phonemize(
text,
language="ko",
backend="espeak",
separator=Separator(phone=None, word=" ", syllable="|"),
strip=True,
preserve_punctuation=True,
njobs=4,
)
ipa = special_map(ipa)
# # hangul charactier
# g2p = G2p()
# text = g2p(text)
# text = list(hangul_to_jamo(text)) # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
# ipa = jamo_to_ipa(text)
return ipa