Spaces:
Paused
Paused
# Copyright (c) 2024 Amphion. | |
# | |
# This source code is licensed under the MIT license found in the | |
# LICENSE file in the root directory of this source tree. | |
import re | |
from utils.g2p.japanese import japanese_to_ipa | |
from utils.g2p.mandarin import chinese_to_ipa | |
from utils.g2p.english import english_to_ipa | |
from utils.g2p.french import french_to_ipa | |
from utils.g2p.korean import korean_to_ipa | |
from utils.g2p.german import german_to_ipa | |
patterns = [ | |
r"\[EN\](.*?)\[EN\]", | |
r"\[ZH\](.*?)\[ZH\]", | |
r"\[JA\](.*?)\[JA\]", | |
r"\[FR\](.*?)\[FR\]", | |
r"\[KR\](.*?)\[KR\]", | |
r"\[DE\](.*?)\[DE\]", | |
] | |
def cje_cleaners(text): | |
matches = [] | |
for pattern in patterns: | |
matches.extend(re.finditer(pattern, text)) | |
matches.sort(key=lambda x: x.start()) # Sort matches by their start positions | |
outputs = "" | |
for match in matches: | |
text_segment = text[match.start() : match.end()] | |
phone = clean_one(text_segment) | |
outputs += phone | |
return outputs | |
def clean_one(text): | |
if text.find("[ZH]") != -1: | |
text = re.sub( | |
r"\[ZH\](.*?)\[ZH\]", lambda x: chinese_to_ipa(x.group(1)) + " ", text | |
) | |
if text.find("[JA]") != -1: | |
text = re.sub( | |
r"\[JA\](.*?)\[JA\]", lambda x: japanese_to_ipa(x.group(1)) + " ", text | |
) | |
if text.find("[EN]") != -1: | |
text = re.sub( | |
r"\[EN\](.*?)\[EN\]", lambda x: english_to_ipa(x.group(1)) + " ", text | |
) | |
if text.find("[FR]") != -1: | |
text = re.sub( | |
r"\[FR\](.*?)\[FR\]", lambda x: french_to_ipa(x.group(1)) + " ", text | |
) | |
if text.find("[KR]") != -1: | |
text = re.sub( | |
r"\[KR\](.*?)\[KR\]", lambda x: korean_to_ipa(x.group(1)) + " ", text | |
) | |
if text.find("[DE]") != -1: | |
text = re.sub( | |
r"\[DE\](.*?)\[DE\]", lambda x: german_to_ipa(x.group(1)) + " ", text | |
) | |
text = re.sub(r"\s+$", "", text) | |
text = re.sub(r"([^\.,!\?\-…~])$", r"\1.", text) | |
return text | |