Spaces:

naonauno
/

dialogs2-factory

Paused

File size: 3,942 Bytes

d66c48f

"""https://github.com/bootphon/phonemizer"""

import re

# from g2pkk import G2p
# from jamo import hangul_to_jamo

english_dictionary = {
    "KOREA": "코리아",
    "IDOL": "아이돌",
    "IT": "아이티",
    "IQ": "아이큐",
    "UP": "업",
    "DOWN": "다운",
    "PC": "피씨",
    "CCTV": "씨씨티비",
    "SNS": "에스엔에스",
    "AI": "에이아이",
    "CEO": "씨이오",
    "A": "에이",
    "B": "비",
    "C": "씨",
    "D": "디",
    "E": "이",
    "F": "에프",
    "G": "지",
    "H": "에이치",
    "I": "아이",
    "J": "제이",
    "K": "케이",
    "L": "엘",
    "M": "엠",
    "N": "엔",
    "O": "오",
    "P": "피",
    "Q": "큐",
    "R": "알",
    "S": "에스",
    "T": "티",
    "U": "유",
    "V": "브이",
    "W": "더블유",
    "X": "엑스",
    "Y": "와이",
    "Z": "제트",
}

# List of (jamo, ipa) pairs: (need to update)
_jamo_to_ipa = [
    (re.compile("%s" % x[0]), x[1])
    for x in [
        ("ㅏ", "ɐ"),
        ("ㅑ", "jɐ"),
        ("ㅓ", "ʌ"),
        ("ㅕ", "jʌ"),
        ("ㅗ", "o"),
        ("ㅛ", "jo"),
        ("ᅮ", "u"),
        ("ㅠ", "ju"),
        ("ᅳ", "ɯ"),
        ("ㅣ", "i"),
        ("ㅔ", "e"),
        ("ㅐ", "ɛ"),
        ("ㅖ", "je"),
        ("ㅒ", "jɛ"),  # lost
        ("ㅚ", "we"),
        ("ㅟ", "wi"),
        ("ㅢ", "ɯj"),
        ("ㅘ", "wɐ"),
        ("ㅙ", "wɛ"),  # lost
        ("ㅝ", "wʌ"),
        ("ㅞ", "wɛ"),  # lost
        ("ㄱ", "q"),  # 'ɡ' or 'k'
        ("ㄴ", "n"),
        ("ㄷ", "t"),  # d
        ("ㄹ", "ɫ"),  # 'ᄅ' is 'r', 'ᆯ' is 'ɫ'
        ("ㅁ", "m"),
        ("ㅂ", "p"),
        ("ㅅ", "s"),  # 'ᄉ'is 't', 'ᆺ'is 's'
        ("ㅇ", "ŋ"),  # 'ᄋ' is None, 'ᆼ' is 'ŋ'
        ("ㅈ", "tɕ"),
        ("ㅊ", "tɕʰ"),  # tʃh
        ("ㅋ", "kʰ"),  # kh
        ("ㅌ", "tʰ"),  # th
        ("ㅍ", "pʰ"),  # ph
        ("ㅎ", "h"),
        ("ㄲ", "k*"),  # q
        ("ㄸ", "t*"),  # t
        ("ㅃ", "p*"),  # p
        ("ㅆ", "s*"),  # 'ᄊ' is 's', 'ᆻ' is 't'
        ("ㅉ", "tɕ*"),  # tɕ ?
    ]
]

_special_map = [
    (re.compile("%s" % x[0]), x[1])
    for x in [
        ("ʃ", "ɕ"),
        ("tɕh", "tɕʰ"),
        ("kh", "kʰ"),
        ("th", "tʰ"),
        ("ph", "pʰ"),
    ]
]


def normalize(text):
    text = text.strip()
    text = re.sub(
        "[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text
    )
    text = normalize_english(text)
    text = text.lower()
    return text


def normalize_english(text):
    def fn(m):
        word = m.group()
        if word in english_dictionary:
            return english_dictionary.get(word)
        return word

    text = re.sub("([A-Za-z]+)", fn, text)
    return text


# Convert jamo to IPA
def jamo_to_ipa(text):
    res = ""
    for t in text:
        for regex, replacement in _jamo_to_ipa:
            t = re.sub(regex, replacement, t)
        res += t
    return res


# special map
def special_map(text):
    for regex, replacement in _special_map:
        text = re.sub(regex, replacement, text)
    return text


def korean_to_ipa(text):
    text = normalize(text)

    # espeak-ng
    from phonemizer import phonemize
    from phonemizer.separator import Separator

    ipa = phonemize(
        text,
        language="ko",
        backend="espeak",
        separator=Separator(phone=None, word=" ", syllable="|"),
        strip=True,
        preserve_punctuation=True,
        njobs=4,
    )
    ipa = special_map(ipa)
    # # hangul charactier
    # g2p = G2p()
    # text = g2p(text)
    # text = list(hangul_to_jamo(text))  # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
    # ipa = jamo_to_ipa(text)
    return ipa