Spaces:
Paused
Paused
File size: 2,083 Bytes
d66c48f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import re
from utils.g2p.japanese import japanese_to_ipa
from utils.g2p.mandarin import chinese_to_ipa
from utils.g2p.english import english_to_ipa
from utils.g2p.french import french_to_ipa
from utils.g2p.korean import korean_to_ipa
from utils.g2p.german import german_to_ipa
# One regex per language tag: a span opened and closed by the same tag
# (e.g. ``[EN]...[EN]``), capturing the enclosed text non-greedily.
patterns = [
    rf"\[{tag}\](.*?)\[{tag}\]" for tag in ("EN", "ZH", "JA", "FR", "KR", "DE")
]
def cje_cleaners(text):
    """Clean every language-tagged segment of ``text`` and concatenate the results.

    A segment is any span matched by one of the module-level ``patterns``
    (a pair of identical tags such as ``[EN]...[EN]`` plus the text between
    them). Segments are handled in the order in which they start in ``text``;
    each one is passed, tags included, to ``clean_one``. Anything in ``text``
    that is not inside a matched segment does not appear in the result.

    Args:
        text: Input string containing zero or more tagged segments.

    Returns:
        The cleaned segments joined with no separator, or an empty string
        when nothing matches.
    """
    # Gather the hits of every pattern first; they are interleaved afterwards
    # by sorting on the start offset (the sort is stable, so hits that start
    # at the same offset keep their pattern order).
    hits = [hit for regex in patterns for hit in re.finditer(regex, text)]
    ordered = sorted(hits, key=lambda hit: hit.start())
    return "".join(clean_one(hit.group(0)) for hit in ordered)
def clean_one(text):
    """Convert the tagged spans in ``text`` and normalise how the string ends.

    Each span of the form ``[XX]...[XX]`` (XX in ZH, JA, EN, FR, KR, DE) is
    replaced by the result of the matching ``*_to_ipa`` converter applied to
    the enclosed text, followed by a single space. Afterwards trailing
    whitespace is removed and, if the string is non-empty and does not already
    end with one of ``. , ! ? - … ~``, a ``.`` is appended.

    Args:
        text: String that may contain tagged spans.

    Returns:
        The converted string with its ending normalised.
    """
    # (marker, span regex, replacement callback), applied in this order.
    # The callbacks are lambdas so that a converter is only looked up and
    # called when one of its spans actually matches.
    rules = (
        ("[ZH]", r"\[ZH\](.*?)\[ZH\]", lambda m: chinese_to_ipa(m.group(1)) + " "),
        ("[JA]", r"\[JA\](.*?)\[JA\]", lambda m: japanese_to_ipa(m.group(1)) + " "),
        ("[EN]", r"\[EN\](.*?)\[EN\]", lambda m: english_to_ipa(m.group(1)) + " "),
        ("[FR]", r"\[FR\](.*?)\[FR\]", lambda m: french_to_ipa(m.group(1)) + " "),
        ("[KR]", r"\[KR\](.*?)\[KR\]", lambda m: korean_to_ipa(m.group(1)) + " "),
        ("[DE]", r"\[DE\](.*?)\[DE\]", lambda m: german_to_ipa(m.group(1)) + " "),
    )
    for marker, span_regex, to_phones in rules:
        # Skip the substitution entirely when the tag never occurs.
        if marker in text:
            text = re.sub(span_regex, to_phones, text)

    # Drop trailing whitespace, then make sure the string ends in punctuation.
    trimmed = re.sub(r"\s+$", "", text)
    return re.sub(r"([^\.,!\?\-…~])$", r"\1.", trimmed)
|