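# NOTE(review): the following appears to be hosting-page residue (uploader
# caption, commit message, commit id), not part of the module:
#   naonauno's picture / Upload 855 files / d66c48f verified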
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import re
import jieba
import cn2an
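"""
Text cleaning for Mandarin Chinese: number expansion, punctuation
normalization, and conversion to bopomofo and IPA.
"""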
# List of (Latin alphabet, bopomofo) pairs.
#
# Each entry is ``(compiled_pattern, replacement)``: the pattern matches one
# Latin letter case-insensitively and the replacement is a bopomofo string
# (letters U+3105..U+3129 followed by tone marks) for that letter.
# latin_to_bopomofo() applies the entries in list order.
#
# NOTE: the replacement strings must be real bopomofo code points. If they show
# up as Greek-looking text, the file was decoded with the wrong encoding and
# the table is unusable.
_latin_to_bopomofo = [
    (re.compile(letter, re.IGNORECASE), bopomofo)
    for letter, bopomofo in [
        ("a", "ㄟˉ"),
        ("b", "ㄅㄧˋ"),
        ("c", "ㄙㄧˉ"),
        ("d", "ㄉㄧˋ"),
        ("e", "ㄧˋ"),
        ("f", "ㄝˊㄈㄨˋ"),
        ("g", "ㄐㄧˋ"),
        ("h", "ㄝˇㄑㄩˋ"),
        ("i", "ㄞˋ"),
        ("j", "ㄐㄟˋ"),
        ("k", "ㄎㄟˋ"),
        ("l", "ㄝˊㄛˋ"),
        ("m", "ㄝˊㄇㄨˋ"),
        ("n", "ㄣˉ"),
        ("o", "ㄡˉ"),
        ("p", "ㄆㄧˉ"),
        ("q", "ㄎㄧㄡˉ"),
        ("r", "ㄚˋ"),
        ("s", "ㄝˊㄙˋ"),
        ("t", "ㄊㄧˋ"),
        ("u", "ㄧㄡˉ"),
        ("v", "ㄨㄧˉ"),
        ("w", "ㄉㄚˋㄅㄨˋㄌㄧㄡˋ"),
        ("x", "ㄝˉㄎㄨˋㄙˋ"),
        ("y", "ㄨㄞˋ"),
        ("z", "ㄗㄟˋ"),
    ]
]
# List of (bopomofo, ipa) pairs.
#
# Each entry is ``(compiled_pattern, replacement)``. bopomofo_to_ipa() applies
# the entries in list order, so multi-symbol sequences are listed before the
# single symbols they contain. Many replacements carry a "|" separator, and
# the tone marks at the end of the table are rewritten to arrow symbols
# followed by "|".
#
# NOTE: every pattern must be made of real bopomofo / tone-mark code points and
# every replacement of real IPA characters. If they show up as Greek-looking
# text, the file was decoded with the wrong encoding and nothing will match.
_bopomofo_to_ipa = [
    (re.compile(symbol), ipa)
    for symbol, ipa in [
        ("ㄅㄛ", "p⁼wo"),
        ("ㄆㄛ", "pʰwo"),
        ("ㄇㄛ", "mwo"),
        ("ㄈㄛ", "fwo"),
        ("ㄧㄢ", "|jɛn"),
        ("ㄩㄢ", "|ɥæn"),
        ("ㄧㄣ", "|in"),
        ("ㄩㄣ", "|ɥn"),
        ("ㄧㄥ", "|iŋ"),
        ("ㄨㄥ", "|ʊŋ"),
        ("ㄩㄥ", "|jʊŋ"),
        # Additional two-symbol sequences (start of added entries)
        ("ㄧㄚ", "|ia"),
        ("ㄧㄝ", "|iɛ"),
        ("ㄧㄠ", "|iɑʊ"),
        ("ㄧㄡ", "|ioʊ"),
        ("ㄧㄤ", "|iɑŋ"),
        ("ㄨㄚ", "|ua"),
        ("ㄨㄛ", "|uo"),
        ("ㄨㄞ", "|uaɪ"),
        ("ㄨㄟ", "|ueɪ"),
        ("ㄨㄢ", "|uan"),
        ("ㄨㄣ", "|uən"),
        ("ㄨㄤ", "|uɑŋ"),
        ("ㄩㄝ", "|ɥɛ"),
        # End of added entries; single symbols follow
        ("ㄅ", "p⁼"),
        ("ㄆ", "pʰ"),
        ("ㄇ", "m"),
        ("ㄈ", "f"),
        ("ㄉ", "t⁼"),
        ("ㄊ", "tʰ"),
        ("ㄋ", "n"),
        ("ㄌ", "l"),
        ("ㄍ", "k⁼"),
        ("ㄎ", "kʰ"),
        ("ㄏ", "x"),
        ("ㄐ", "tʃ⁼"),
        ("ㄑ", "tʃʰ"),
        ("ㄒ", "ʃ"),
        ("ㄓ", "ts`⁼"),
        ("ㄔ", "ts`ʰ"),
        ("ㄕ", "s`"),
        ("ㄖ", "ɹ`"),
        ("ㄗ", "ts⁼"),
        ("ㄘ", "tsʰ"),
        ("ㄙ", "|s"),
        ("ㄚ", "|a"),
        ("ㄛ", "|o"),
        ("ㄜ", "|ə"),
        ("ㄝ", "|ɛ"),
        ("ㄞ", "|aɪ"),
        ("ㄟ", "|eɪ"),
        ("ㄠ", "|ɑʊ"),
        ("ㄡ", "|oʊ"),
        ("ㄢ", "|an"),
        ("ㄣ", "|ən"),
        ("ㄤ", "|ɑŋ"),
        ("ㄥ", "|əŋ"),
        ("ㄦ", "əɹ"),
        ("ㄧ", "|i"),
        ("ㄨ", "|u"),
        ("ㄩ", "|ɥ"),
        # Tone marks
        ("ˉ", "→|"),
        ("ˊ", "↑|"),
        ("ˇ", "↓↑|"),
        ("ˋ", "↓|"),
        ("˙", "|"),
    ]
]
# Convert numbers to Chinese pronunciation
def number_to_chinese(text):
    """Return ``text`` after ``cn2an.transform`` in ``"an2cn"`` mode.

    Args:
        text: Input string that may contain numbers.

    Returns:
        The string produced by ``cn2an.transform(text, "an2cn")``.
    """
    return cn2an.transform(text, "an2cn")
# Single-character punctuation mapped to its ASCII form (or to U+2026).
# The keys are written as escapes on purpose: fullwidth punctuation looks like
# ASCII, and a width-folding or re-encoding tool can silently turn a literal
# key into a no-op such as ``replace(",", ",")``.
_PUNCTUATION_TABLE = str.maketrans(
    {
        "\uff0c": ",",  # fullwidth comma
        "\u3002": ".",  # ideographic full stop
        "\uff01": "!",  # fullwidth exclamation mark
        "\uff1f": "?",  # fullwidth question mark
        "\uff1b": ";",  # fullwidth semicolon
        "\uff1a": ":",  # fullwidth colon
        "\u3001": ",",  # ideographic comma
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark
        "\u22ef": "\u2026",  # midline horizontal ellipsis
    }
)


def normalization(text):
    """Normalize punctuation and strip everything that is not kept downstream.

    Steps, in order:
      1. Map Chinese/fullwidth punctuation and curly quotes to ASCII.
      2. Collapse three-dot sequences (middle dots, katakana middle dots,
         ASCII full stops) into a single ellipsis character (U+2026).
      3. Remove all whitespace.
      4. Remove every character that is not a CJK ideograph in
         U+4E00..U+9FFF, an underscore, or one of ``, . ? ! ; : '`` and the
         ellipsis. Latin letters and digits are removed by this step.

    Args:
        text: Input string.

    Returns:
        The normalized string (possibly empty).
    """
    text = text.translate(_PUNCTUATION_TABLE)
    # Run after the single-character mapping so that three ideographic full
    # stops (already turned into "...") also become one ellipsis.
    for dots in ("\u00b7\u00b7\u00b7", "\u30fb\u30fb\u30fb", "..."):
        text = text.replace(dots, "\u2026")
    text = re.sub(r"\s+", "", text)
    text = re.sub(r"[^\u4e00-\u9fff\s_,\.\?!;:\'\u2026]", "", text)
    # Whitespace is already gone at this point; kept so the punctuation
    # spacing rule stays in place if the steps above are ever reordered.
    text = re.sub(r"\s*([,\.\?!;:\'\u2026])\s*", r"\1", text)
    return text
# Word segmentation, then conversion of Chinese words to bopomofo
def chinese_to_bopomofo(text):
    """Segment ``text`` with jieba and convert Chinese words to bopomofo.

    Words that contain no character in U+4E00..U+9FFF (punctuation, for
    example) are copied to the output unchanged and without a separator.
    For every other word, each syllable returned by ``lazy_pinyin`` in
    ``BOPOMOFO`` style is kept; a syllable that ends in a bopomofo letter
    (U+3105..U+3129) rather than a tone mark gets an explicit first-tone
    mark (U+02C9) appended. Syllables are joined with "|", and a "|" is
    inserted before a Chinese word whenever the output is already non-empty.

    Args:
        text: Normalized input string.

    Returns:
        The "|"-separated bopomofo string.
    """
    # Imported lazily, as in the original, so pypinyin is only required when
    # this function is actually called.
    from pypinyin import lazy_pinyin, BOPOMOFO

    first_tone = "\u02c9"  # MODIFIER LETTER MACRON; matched later by the IPA table
    result = ""
    for word in jieba.lcut(text, cut_all=False):
        if not re.search("[\u4e00-\u9fff]", word):
            result += word
            continue
        syllables = [
            re.sub(r"([\u3105-\u3129])$", r"\1" + first_tone, syllable)
            for syllable in lazy_pinyin(word, BOPOMOFO)
        ]
        if result:
            result += "|"
        result += "|".join(syllables)
    return result
# Convert Latin letters to bopomofo
def latin_to_bopomofo(text):
    """Apply every ``_latin_to_bopomofo`` substitution to ``text``, in order.

    Args:
        text: Input string.

    Returns:
        The string after all substitutions have been applied.
    """
    converted = text
    for pattern, bopomofo in _latin_to_bopomofo:
        converted = pattern.sub(bopomofo, converted)
    return converted
# Convert bopomofo to IPA
def bopomofo_to_ipa(text):
    """Apply every ``_bopomofo_to_ipa`` substitution to ``text``, in order.

    The table order matters: each substitution sees the output of the
    previous one.

    Args:
        text: Input string containing bopomofo symbols.

    Returns:
        The string after all substitutions have been applied.
    """
    converted = text
    for pattern, ipa in _bopomofo_to_ipa:
        converted = pattern.sub(ipa, converted)
    return converted
def _chinese_to_ipa(text):
    """Convert one Chinese string to a "|"-separated IPA string.

    Pipeline: strip -> number_to_chinese -> normalization ->
    chinese_to_bopomofo -> latin_to_bopomofo -> bopomofo_to_ipa, followed by
    the regex clean-up below.

    Args:
        text: A single input string.

    Returns:
        The IPA string, with no leading or trailing "|".
    """
    text = number_to_chinese(text.strip())
    text = normalization(text)
    text = chinese_to_bopomofo(text)
    text = latin_to_bopomofo(text)
    text = bopomofo_to_ipa(text)
    # A backtick-marked "s" or "ɹ" (optionally followed by ⁼ or ʰ) that is
    # directly followed by tone arrows/spaces or the end of the string has no
    # vowel after it: insert "ɹ" there.
    text = re.sub("([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)", r"\1ɹ\2", text)
    # Same for a plain "s" (optionally followed by ⁼ or ʰ).
    text = re.sub("([s][⁼ʰ]?)([→↓↑ ]+|$)", r"\1ɹ\2", text)
    # Drop a leading "|" and every character outside the allowed set (word
    # characters, whitespace, the kept punctuation, "|", tone arrows, ⁼, ʰ
    # and the backtick).
    text = re.sub(r"^\||[^\w\s_,\.\?!;:\'…\|→↓↑⁼ʰ`]", "", text)
    # Surround each punctuation mark with "|" separators ...
    text = re.sub(r"([,\.\?!;:\'…])", r"|\1|", text)
    # ... then collapse the resulting runs of "|" and trim the tail.
    text = re.sub(r"\|+", "|", text)
    text = text.rstrip("|")
    return text
# Convert Chinese to IPA
def chinese_to_ipa(text, text_tokenizer):
    """Convert a string, or an iterable of strings, from Chinese to IPA.

    Args:
        text: Either a single ``str`` or an iterable of ``str``.
        text_tokenizer: Not used by this function; kept so existing callers
            that pass it keep working.

    Returns:
        A single IPA string when ``text`` is a ``str``; otherwise a list with
        one IPA string per item of ``text``, in the same order.
    """
    # isinstance (not ``type(...) ==``) so that str subclasses are converted
    # as one string instead of being iterated character by character.
    if isinstance(text, str):
        return _chinese_to_ipa(text)
    return [_chinese_to_ipa(item) for item in text]