# NOTE: web-page header residue ("Spaces: Paused") was removed from the top of this file.
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import re

import cn2an
import jieba
""" | |
Text clean time | |
""" | |
# List of (Latin alphabet, bopomofo) pairs: | |
_latin_to_bopomofo = [ | |
(re.compile("%s" % x[0], re.IGNORECASE), x[1]) | |
for x in [ | |
("a", "γΛ"), | |
("b", "γ γ§Λ"), | |
("c", "γγ§Λ"), | |
("d", "γγ§Λ"), | |
("e", "γ§Λ"), | |
("f", "γΛγγ¨Λ"), | |
("g", "γγ§Λ"), | |
("h", "γΛγγ©Λ"), | |
("i", "γΛ"), | |
("j", "γγΛ"), | |
("k", "γγΛ"), | |
("l", "γΛγΛ"), | |
("m", "γΛγγ¨Λ"), | |
("n", "γ£Λ"), | |
("o", "γ‘Λ"), | |
("p", "γγ§Λ"), | |
("q", "γγ§γ‘Λ"), | |
("r", "γΛ"), | |
("s", "γΛγΛ"), | |
("t", "γγ§Λ"), | |
("u", "γ§γ‘Λ"), | |
("v", "γ¨γ§Λ"), | |
("w", "γγΛγ γ¨Λγγ§γ‘Λ"), | |
("x", "γΛγγ¨ΛγΛ"), | |
("y", "γ¨γΛ"), | |
("z", "γγΛ"), | |
] | |
] | |
# List of (bopomofo, ipa) pairs: | |
_bopomofo_to_ipa = [ | |
(re.compile("%s" % x[0]), x[1]) | |
for x in [ | |
("γ γ", "pβΌwo"), | |
("γγ", "pΚ°wo"), | |
("γγ", "mwo"), | |
("γγ", "fwo"), | |
("γ§γ’", "|jΙn"), | |
("γ©γ’", "|Ι₯Γ¦n"), | |
("γ§γ£", "|in"), | |
("γ©γ£", "|Ι₯n"), | |
("γ§γ₯", "|iΕ"), | |
("γ¨γ₯", "|ΚΕ"), | |
("γ©γ₯", "|jΚΕ"), | |
# Add | |
("γ§γ", "|ia"), | |
("γ§γ", "|iΙ"), | |
("γ§γ ", "|iΙΚ"), | |
("γ§γ‘", "|ioΚ"), | |
("γ§γ€", "|iΙΕ"), | |
("γ¨γ", "|ua"), | |
("γ¨γ", "|uo"), | |
("γ¨γ", "|uaΙͺ"), | |
("γ¨γ", "|ueΙͺ"), | |
("γ¨γ’", "|uan"), | |
("γ¨γ£", "|uΙn"), | |
("γ¨γ€", "|uΙΕ"), | |
("γ©γ", "|Ι₯Ι"), | |
# End | |
("γ ", "pβΌ"), | |
("γ", "pΚ°"), | |
("γ", "m"), | |
("γ", "f"), | |
("γ", "tβΌ"), | |
("γ", "tΚ°"), | |
("γ", "n"), | |
("γ", "l"), | |
("γ", "kβΌ"), | |
("γ", "kΚ°"), | |
("γ", "x"), | |
("γ", "tΚβΌ"), | |
("γ", "tΚΚ°"), | |
("γ", "Κ"), | |
("γ", "ts`βΌ"), | |
("γ", "ts`Κ°"), | |
("γ", "s`"), | |
("γ", "ΙΉ`"), | |
("γ", "tsβΌ"), | |
("γ", "tsΚ°"), | |
("γ", "|s"), | |
("γ", "|a"), | |
("γ", "|o"), | |
("γ", "|Ι"), | |
("γ", "|Ι"), | |
("γ", "|aΙͺ"), | |
("γ", "|eΙͺ"), | |
("γ ", "|ΙΚ"), | |
("γ‘", "|oΚ"), | |
("γ’", "|an"), | |
("γ£", "|Ιn"), | |
("γ€", "|ΙΕ"), | |
("γ₯", "|ΙΕ"), | |
("γ¦", "ΙΙΉ"), | |
("γ§", "|i"), | |
("γ¨", "|u"), | |
("γ©", "|Ι₯"), | |
("Λ", "β|"), | |
("Λ", "β|"), | |
("Λ", "ββ|"), | |
("Λ", "β|"), | |
("Λ", "|"), | |
] | |
] | |
# Convert numbers to Chinese pronunciation
def number_to_chinese(text):
    """Return *text* with its numerals rewritten as Chinese numerals.

    The whole string is handed to ``cn2an.transform`` in ``"an2cn"`` mode,
    so locating the numbers inside the sentence is left to that library.
    """
    return cn2an.transform(text, "an2cn")
def normalization(text):
    """Normalize punctuation and strip unsupported characters.

    Steps, in order:
      1. Map full-width / typographic punctuation to ASCII equivalents and
         collapse the various ellipsis spellings to a single "…".
      2. Remove all whitespace.
      3. Drop every character that is not a CJK unified ideograph
         (U+4E00-U+9FFF), an underscore, or one of ``, . ? ! ; : ' …``.
         Latin letters and digits are removed by this step.

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    # NOTE(review): the non-ASCII source characters were restored from
    # mis-encoded text; each one is inferred from its ASCII target.
    # Order matters: "。" -> "." must run before "..." -> "…" so that
    # "。。。" also becomes an ellipsis.
    replacements = (
        ("，", ","),
        ("。", "."),
        ("！", "!"),
        ("？", "?"),
        ("；", ";"),
        ("：", ":"),
        ("、", ","),
        ("‘", "'"),
        ("’", "'"),
        ("⋯", "…"),
        ("···", "…"),
        ("・・・", "…"),
        ("...", "…"),
    )
    for source, target in replacements:
        text = text.replace(source, target)

    text = re.sub(r"\s+", "", text)
    text = re.sub(r"[^\u4e00-\u9fff\s_,\.\?!;:\'…]", "", text)
    # Whitespace is already gone at this point, so this substitution leaves
    # the text unchanged; it is kept as a safeguard.
    text = re.sub(r"\s*([,\.\?!;:\'…])\s*", r"\1", text)
    return text
# Word Segmentation, and convert Chinese pronunciation to pinyin (bopomofo)
def chinese_to_bopomofo(text):
    """Segment *text* into words and spell the Chinese words in bopomofo.

    Words are produced by ``jieba.lcut``. A word containing no character in
    U+4E00-U+9FFF (e.g. punctuation) is appended unchanged. A Chinese word is
    converted with ``pypinyin.lazy_pinyin`` in ``BOPOMOFO`` style; its
    syllables are joined with "|", and a "|" is also inserted before the word
    when the output built so far is non-empty.

    Args:
        text: Normalized input string.

    Returns:
        The bopomofo string with "|" between syllables.
    """
    from pypinyin import lazy_pinyin, BOPOMOFO

    words = jieba.lcut(text, cut_all=False)
    text = ""
    for word in words:
        if not re.search("[\u4e00-\u9fff]", word):
            text += word
            continue
        # A syllable that ends in a bopomofo letter carries no tone mark;
        # give it an explicit "ˉ" so every syllable ends with a tone symbol.
        # NOTE(review): "ˉ" was restored from mis-encoded text -- confirm.
        bopomofos = [
            re.sub(r"([\u3105-\u3129])$", r"\1ˉ", syllable)
            for syllable in lazy_pinyin(word, BOPOMOFO)
        ]
        if text != "":
            text += "|"
        text += "|".join(bopomofos)
    return text
# Convert latin pronunciation to pinyin (bopomofo)
def latin_to_bopomofo(text):
    """Apply every ``_latin_to_bopomofo`` substitution to *text*, in order.

    Returns the text after all (pattern, replacement) pairs have been applied.
    """
    for pattern, spelling in _latin_to_bopomofo:
        text = pattern.sub(spelling, text)
    return text
# Convert pinyin (bopomofo) to IPA
def bopomofo_to_ipa(text):
    """Apply every ``_bopomofo_to_ipa`` substitution to *text*, in order.

    The table order is significant because later patterns are applied to the
    output of earlier ones. Returns the fully substituted text.
    """
    for pattern, ipa in _bopomofo_to_ipa:
        text = pattern.sub(ipa, text)
    return text
def _chinese_to_ipa(text):
    """Convert one Chinese string to a "|"-separated IPA string.

    Pipeline: strip -> numerals to Chinese -> punctuation normalization ->
    bopomofo -> Latin letters to bopomofo -> IPA, followed by regex clean-up
    of the separators.

    Args:
        text: A single input string.

    Returns:
        The IPA transcription, with no leading or trailing "|" and no
        repeated "|".
    """
    # NOTE(review): the non-ASCII characters in the patterns below (arrows,
    # "⁼", "ʰ", "ɹ", "…") were restored from mis-encoded text; verify against
    # the upstream source.
    text = number_to_chinese(text.strip())
    text = normalization(text)
    text = chinese_to_bopomofo(text)
    text = latin_to_bopomofo(text)
    text = bopomofo_to_ipa(text)
    # Insert "ɹ" after s` / ɹ` (optionally followed by ⁼ or ʰ) when the next
    # thing is a run of tone arrows/spaces or the end of the string.
    text = re.sub("([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)", r"\1ɹ\2", text)
    # Same insertion for a plain "s" (optionally followed by ⁼ or ʰ).
    text = re.sub("([s][⁼ʰ]?)([→↓↑ ]+|$)", r"\1ɹ\2", text)
    # Drop a leading "|" and any character outside the allowed set.
    text = re.sub(r"^\||[^\w\s_,\.\?!;:\'…\|→↓↑⁼ʰ`]", "", text)
    # Wrap each punctuation mark in separators.
    text = re.sub(r"([,\.\?!;:\'…])", r"|\1|", text)
    # Collapse runs of separators and remove trailing ones.
    text = re.sub(r"\|+", "|", text)
    text = text.rstrip("|")
    return text
# Convert Chinese to IPA
def chinese_to_ipa(text, text_tokenizer):
    """Convert a Chinese string, or an iterable of strings, to IPA.

    Args:
        text: A single ``str``, or an iterable of strings.
        text_tokenizer: Not used by this function; kept in the signature for
            existing callers.

    Returns:
        The IPA string when *text* is a ``str``; otherwise a list with one
        IPA string per item of *text*.
    """
    # isinstance (rather than an exact type comparison) also routes str
    # subclasses to the single-string path instead of iterating them
    # character by character.
    if isinstance(text, str):
        return _chinese_to_ipa(text)
    return [_chinese_to_ipa(item) for item in text]