naonauno's picture
Upload 855 files
d66c48f verified
"""https://github.com/bootphon/phonemizer"""
import re
from phonemizer import phonemize
from phonemizer.separator import Separator
# French abbreviations that are written with a trailing period
# (e.g. "M." -> "monsieur"). Keys are literal text, NOT regular expressions;
# they are escaped before compilation so that "." only matches a real period.
_DOTTED_ABBREVIATIONS = [
    ("M", "monsieur"),
    ("Mlle", "mademoiselle"),
    ("Mlles", "mesdemoiselles"),
    ("Mme", "Madame"),
    ("Mmes", "Mesdames"),
    ("N.B", "nota bene"),
    ("p.c.q", "parce que"),
    ("Pr", "professeur"),
    ("qqch", "quelque chose"),
    ("rdv", "rendez-vous"),
    ("max", "maximum"),
    ("min", "minimum"),
    ("no", "numéro"),
    ("adr", "adresse"),
    ("dr", "docteur"),
    ("st", "saint"),
    ("co", "compagnie"),
    ("jr", "junior"),
    ("sgt", "sergent"),
    ("capt", "capitaine"),
    ("col", "colonel"),
    ("av", "avenue"),
    ("av. J.-C", "avant Jésus-Christ"),
    ("apr. J.-C", "après Jésus-Christ"),
    ("art", "article"),
    ("boul", "boulevard"),
    # Straight apostrophe on purpose: the typographic one is stripped by the
    # punctuation table later in the pipeline, which would leave "cest".
    ("c.-à-d", "c'est-à-dire"),
    ("etc", "et cetera"),
    ("ex", "exemple"),
    ("excl", "exclusivement"),
]

# Titles that are also commonly written without the trailing period.
# These are matched case-sensitively and as whole words.
_UNDOTTED_TITLES = [
    ("Mlle", "mademoiselle"),
    ("Mlles", "mesdemoiselles"),
    ("Mme", "Madame"),
    ("Mmes", "Mesdames"),
]


def _longest_first(pairs):
    """Return *pairs* ordered so that longer abbreviations are tried first.

    Without this, a short abbreviation that is a prefix of a longer one
    ("av" vs. "av. J.-C") would consume the text before the longer form
    gets a chance to match.
    """
    return sorted(pairs, key=lambda pair: len(pair[0]), reverse=True)


# List of (compiled regular expression, replacement) pairs for abbreviations
# in French. Dotted forms are applied first (case-insensitively) so that the
# period is consumed together with the abbreviation; the undotted titles
# follow and require a word boundary on both sides so "Mlle" does not match
# inside "Mlles".
_abbreviations = [
    (re.compile(r"\b%s\." % re.escape(abbr), re.IGNORECASE), expansion)
    for abbr, expansion in _longest_first(_DOTTED_ABBREVIATIONS)
] + [
    (re.compile(r"\b%s\b" % re.escape(abbr)), expansion)
    for abbr, expansion in _longest_first(_UNDOTTED_TITLES)
]
# Punctuation normalisation table consumed by replace_punctuation().
# Keys are literal substrings (they are regex-escaped by the consumer);
# values are what each occurrence is replaced with. An empty value deletes
# the symbol.
rep_map = {
    # Clause separators collapse to a comma (ASCII and fullwidth forms).
    ":": ",",
    ";": ",",
    ",": ",",
    ":": ",",
    ";": ",",
    ",": ",",
    # Sentence terminators.
    "。": ".",
    "!": "!",
    "?": "?",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": ".",
    "…": ".",
    "$": ".",
    # Quotation marks are dropped.
    "“": "",
    "”": "",
    "‘": "",
    # The typographic apostrophe marks elision in French ("c’est", "l’ami");
    # deleting it would glue the two words together, so it is normalised to
    # the ASCII apostrophe instead.
    "’": "'",
    # Brackets of every flavour are dropped.
    "(": "",
    ")": "",
    "(": "",
    ")": "",
    "《": "",
    "》": "",
    "【": "",
    "】": "",
    "[": "",
    "]": "",
    "—": "",
    "~": "-",
    "~": "-",
    "「": "",
    "」": "",
    # Inverted Spanish marks are dropped.
    "¿": "",
    "¡": "",
}
# Ordered (compiled pattern, replacement) pairs that special_map() applies
# one after another. Order matters: each substitution sees the output of
# the previous one.
_special_map = [
    (re.compile(symbol), substitute)
    for symbol, substitute in (
        ("ø", "ɸ"),
        ("ː", ":"),
        # The three stress-mark insertions below carry the original note
        # "to avoid incorrect connect".
        ("j", "jˈ"),
        ("n", "ˈn"),
        ("w", "wˈ"),
        # Nasal vowels: the tilde is written as a separate ASCII "~".
        ("ã", "a~"),
        ("ɑ̃", "ɑ~"),
        ("ɔ̃", "ɔ~"),
        ("ɛ̃", "ɛ~"),
        ("œ̃", "œ~"),
    )
]
def collapse_whitespace(text):
    """Squeeze every run of whitespace to one space and trim both ends."""
    # str.split() without a separator splits on whitespace runs and drops
    # the empty leading/trailing fields, so re-joining with a single space
    # both collapses and strips in one step.
    return " ".join(text.split())
def remove_punctuation_at_begin(text):
    """Drop any leading run of ',', '.', '!' or '?' characters from *text*."""
    leading_marks = ",.!?"
    return text.lstrip(leading_marks)
def remove_aux_symbols(text):
    """Delete angle brackets, parentheses, square brackets, double quotes
    and French guillemets from *text*."""
    # A translation table with only a "delete" set removes each listed
    # character wherever it occurs.
    deletions = str.maketrans("", "", '<>()[]"«»')
    return text.translate(deletions)
def replace_symbols(text):
    """Rewrite a few symbols: ';' and ':' become ',', '-' becomes a space,
    and '&' becomes ' et '."""
    # None of the replacement strings contains a character that is itself a
    # key, so a single translate pass gives the same result as applying the
    # four replacements one after another.
    substitutions = str.maketrans(
        {
            ";": ",",
            "-": " ",
            ":": ",",
            "&": " et ",
        }
    )
    return text.translate(substitutions)
def expand_abbreviations(text):
    """Return *text* with every entry of ``_abbreviations`` applied in order.

    Each entry is a (compiled pattern, replacement) pair; later entries see
    the output of earlier ones.
    """
    expanded = text
    for pattern, expansion in _abbreviations:
        expanded = pattern.sub(expansion, expanded)
    return expanded
def replace_punctuation(text):
    """Replace every occurrence of a ``rep_map`` key in *text* by its value.

    The keys are escaped and joined into one alternation, so the whole
    table is applied in a single left-to-right pass over the text.
    """
    alternation = "|".join(map(re.escape, rep_map))

    def _mapped(match):
        # The matched substring is always one of the table's keys.
        return rep_map[match.group()]

    return re.sub(alternation, _mapped, text)
def text_normalize(text):
    """Run the full normalisation pipeline on *text* and return the result.

    Stages, in order: abbreviation expansion, punctuation mapping, symbol
    replacement, auxiliary-symbol removal, leading-punctuation removal and
    whitespace collapsing. Finally a period is appended unless the text
    already ends with one of ``. , ! ? - …``.
    """
    stages = (
        expand_abbreviations,
        replace_punctuation,
        replace_symbols,
        remove_aux_symbols,
        remove_punctuation_at_begin,
        collapse_whitespace,
    )
    for stage in stages:
        text = stage(text)
    # Only a final character that is not already closing punctuation gets a
    # period added after it; empty text is left empty.
    return re.sub(r"([^\.,!\?\-…])$", r"\1.", text)
# Post-processing of phoneme strings
def special_map(text):
    """Apply each (pattern, replacement) pair of ``_special_map`` to *text*,
    in table order, and return the rewritten string."""
    mapped = text
    for pattern, substitute in _special_map:
        mapped = pattern.sub(substitute, mapped)
    return mapped
def french_to_ipa(text):
    """Normalise French *text* and convert it to a phoneme string.

    The text goes through ``text_normalize``, is phonemized with the espeak
    backend for ``fr-fr`` (punctuation preserved, words separated by a
    space, syllables by ``|``), and the result is post-processed with
    ``special_map``.
    """
    normalized = text_normalize(text).strip()
    separator = Separator(phone=None, word=" ", syllable="|")
    phonemes = phonemize(
        normalized,
        language="fr-fr",
        backend="espeak",
        separator=separator,
        strip=True,
        preserve_punctuation=True,
        njobs=4,
    )
    # Drop the literal "(en)" and "(fr)" tags from the phonemizer output.
    for tag in ("(en)", "(fr)"):
        phonemes = phonemes.replace(tag, "")
    return special_map(phonemes)