Spaces:

naonauno
/

dialogs2-factory

Paused

App Files Files Community

dialogs2-factory / Amphion /models /tts /debatts /utils /g2p /korean.py

naonauno

Upload 855 files

d66c48f verified 4 months ago

raw

history blame contribute delete

3.94 kB

	"""https://github.com/bootphon/phonemizer"""

	import re

	# from g2pkk import G2p
	# from jamo import hangul_to_jamo

	# Lookup table from upper-case English words, acronyms and single letters to
	# their Hangul spellings. normalize_english() consults it with an exact-match
	# lookup on each run of ASCII letters, so only the upper-case spellings listed
	# here are replaced; any other word is left as it is.
	english_dictionary = {
	# Whole words and common acronyms.
	"KOREA": "코리아",
	"IDOL": "아이돌",
	"IT": "아이티",
	"IQ": "아이큐",
	"UP": "업",
	"DOWN": "다운",
	"PC": "피씨",
	"CCTV": "씨씨티비",
	"SNS": "에스엔에스",
	"AI": "에이아이",
	"CEO": "씨이오",
	# Single letters A-Z (matched only when the letter stands alone as a word).
	"A": "에이",
	"B": "비",
	"C": "씨",
	"D": "디",
	"E": "이",
	"F": "에프",
	"G": "지",
	"H": "에이치",
	"I": "아이",
	"J": "제이",
	"K": "케이",
	"L": "엘",
	"M": "엠",
	"N": "엔",
	"O": "오",
	"P": "피",
	"Q": "큐",
	"R": "알",
	"S": "에스",
	"T": "티",
	"U": "유",
	"V": "브이",
	"W": "더블유",
	"X": "엑스",
	"Y": "와이",
	"Z": "제트",
	}

	# Ordered (compiled jamo pattern, IPA replacement) pairs used by jamo_to_ipa().
	# The original author marked this table as still needing an update.
	# NOTE(review): the keys for "u" and "ɯ" appear to be conjoining jamo
	# (U+116E, U+1173) while every other key is a compatibility jamo (U+31xx);
	# confirm which form the caller actually feeds in.
	_jamo_to_ipa = [
	    (re.compile(jamo), ipa)
	    for jamo, ipa in (
	        # vowels
	        ("ㅏ", "ɐ"),
	        ("ㅑ", "jɐ"),
	        ("ㅓ", "ʌ"),
	        ("ㅕ", "jʌ"),
	        ("ㅗ", "o"),
	        ("ㅛ", "jo"),
	        ("ᅮ", "u"),
	        ("ㅠ", "ju"),
	        ("ᅳ", "ɯ"),
	        ("ㅣ", "i"),
	        ("ㅔ", "e"),
	        ("ㅐ", "ɛ"),
	        ("ㅖ", "je"),
	        ("ㅒ", "jɛ"),  # lost
	        ("ㅚ", "we"),
	        ("ㅟ", "wi"),
	        ("ㅢ", "ɯj"),
	        ("ㅘ", "wɐ"),
	        ("ㅙ", "wɛ"),  # lost
	        ("ㅝ", "wʌ"),
	        ("ㅞ", "wɛ"),  # lost
	        # plain consonants
	        ("ㄱ", "q"),  # 'ɡ' or 'k'
	        ("ㄴ", "n"),
	        ("ㄷ", "t"),  # d
	        ("ㄹ", "ɫ"),  # 'ᄅ' is 'r', 'ᆯ' is 'ɫ'
	        ("ㅁ", "m"),
	        ("ㅂ", "p"),
	        ("ㅅ", "s"),  # 'ᄉ' is 't', 'ᆺ' is 's'
	        ("ㅇ", "ŋ"),  # 'ᄋ' is None, 'ᆼ' is 'ŋ'
	        ("ㅈ", "tɕ"),
	        # aspirated consonants
	        ("ㅊ", "tɕʰ"),  # tʃh
	        ("ㅋ", "kʰ"),  # kh
	        ("ㅌ", "tʰ"),  # th
	        ("ㅍ", "pʰ"),  # ph
	        ("ㅎ", "h"),
	        # doubled consonants, marked with a trailing "*"
	        ("ㄲ", "k*"),  # q
	        ("ㄸ", "t*"),  # t
	        ("ㅃ", "p*"),  # p
	        ("ㅆ", "s*"),  # 'ᄊ' is 's', 'ᆻ' is 't'
	        ("ㅉ", "tɕ*"),  # tɕ ?
	    )
	]

	# Ordered (compiled pattern, replacement) pairs applied by special_map():
	# "ʃ" becomes "ɕ", and an ASCII "h" written after tɕ / k / t / p becomes the
	# superscript aspiration mark "ʰ".
	_special_map = [
	    (re.compile(source), target)
	    for source, target in (
	        ("ʃ", "ɕ"),
	        ("tɕh", "tɕʰ"),
	        ("kh", "kʰ"),
	        ("th", "tʰ"),
	        ("ph", "pʰ"),
	    )
	]


	# Han-script characters removed by normalize(). The ranges are spelled as
	# \u escapes rather than literal characters: the last three ranges end on CJK
	# compatibility ideographs, which Unicode normalisation of the source file
	# (or a copy/paste) silently rewrites to unified ideographs, changing the
	# ranges without any visible difference.
	_HAN_CHARACTERS = re.compile(
	    "["
	    "\u2e80-\u2e99\u2e9b-\u2ef3"  # CJK radicals supplement
	    "\u2f00-\u2fd5"  # Kangxi radicals
	    "\u3005\u3007"  # iteration mark, ideographic zero
	    "\u3021-\u3029\u3038-\u303a\u303b"  # Hangzhou numerals, vertical iteration mark
	    "\u3400-\u4db5"  # CJK unified ideographs extension A
	    "\u4e00-\u9fc3"  # CJK unified ideographs
	    "\uf900-\ufa2d\ufa30-\ufa6a\ufa70-\ufad9"  # CJK compatibility ideographs
	    "]"
	)


	def normalize(text):
	    """Clean raw text before it is phonemized.

	    Steps, in order: strip surrounding whitespace, delete every Han-script
	    character, replace dictionary English words through
	    ``normalize_english`` and finally lower-case what is left. The
	    dictionary step runs before lower-casing because its keys are
	    upper-case.

	    Args:
	        text: Input string.

	    Returns:
	        The normalized string.
	    """
	    without_han = _HAN_CHARACTERS.sub("", text.strip())
	    return normalize_english(without_han).lower()


	def normalize_english(text):
	    """Replace English words listed in ``english_dictionary`` with Hangul.

	    Each maximal run of ASCII letters is looked up exactly as written;
	    runs that are not dictionary keys are left unchanged.

	    Args:
	        text: Input string.

	    Returns:
	        The string with known words substituted.
	    """

	    def transliterate(match):
	        token = match.group()
	        # Fall back to the token itself when the dictionary has no entry.
	        return english_dictionary.get(token, token)

	    return re.sub("([A-Za-z]+)", transliterate, text)


	# Jamo -> IPA conversion
	def jamo_to_ipa(text):
	    """Map each element of ``text`` through the ``_jamo_to_ipa`` table.

	    Args:
	        text: Iterable of strings (a plain string is walked character by
	            character).

	    Returns:
	        One string: the converted elements concatenated in order.
	    """

	    def convert(symbol):
	        # Every table entry is tried in order on the current symbol.
	        for pattern, ipa in _jamo_to_ipa:
	            symbol = pattern.sub(ipa, symbol)
	        return symbol

	    return "".join(convert(symbol) for symbol in text)


	# Post-processing fixes for phonemizer output
	def special_map(text):
	    """Apply every ``_special_map`` substitution to ``text``, in table order.

	    Args:
	        text: IPA string to clean up.

	    Returns:
	        The string after all substitutions.
	    """
	    fixed = text
	    for pattern, target in _special_map:
	        fixed = pattern.sub(target, fixed)
	    return fixed


	def korean_to_ipa(text):
	    """Convert Korean text to IPA with phonemizer's espeak backend.

	    The text is first cleaned by ``normalize``, then phonemized with
	    language "ko", and the result is passed through ``special_map``.

	    Args:
	        text: Raw input text.

	    Returns:
	        The value returned by ``phonemize`` after ``special_map`` has been
	        applied (presumably a single string for a string input; confirm
	        against the phonemizer documentation).
	    """
	    text = normalize(text)

	    # espeak-ng; phonemizer is imported at call time, so it is only loaded
	    # when this function actually runs.
	    from phonemizer import phonemize
	    from phonemizer.separator import Separator

	    ipa = phonemize(
	        text,
	        language="ko",
	        backend="espeak",
	        # "\\|" is backslash + pipe: the same two characters the former "\|"
	        # literal produced, without the invalid escape sequence that newer
	        # Python versions warn about.
	        separator=Separator(phone=None, word=" ", syllable="\\|"),
	        strip=True,
	        preserve_punctuation=True,
	        njobs=4,
	    )
	    ipa = special_map(ipa)
	    # Disabled alternative pipeline, kept for reference (Hangul characters):
	    # g2p = G2p()
	    # text = g2p(text)
	    # text = list(hangul_to_jamo(text)) # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
	    # ipa = jamo_to_ipa(text)
	    return ipa