File size: 5,107 Bytes
7934b29 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
from argparse import ArgumentParser
from typing import Dict
from syllabify import syllabify
"""
Usage:
cd NeMo/scripts && python dataset_processing/g2p/convert_cmu_arpabet_to_ipa.py
"""
def parse_args(argv=None):
    """
    Parses the command-line arguments of the ARPABET to IPA conversion script.

    Args:
        argv: optional list of argument strings to parse; when None, argparse falls back to sys.argv[1:]

    Returns:
        argparse.Namespace with `cmu_arpabet`, `ipa_out` and `mapping` attributes
    """
    # The first positional parameter of ArgumentParser is `prog`, so the text has to be passed as
    # `description`; otherwise it replaces the program name in the usage line.
    parser = ArgumentParser(description="ARPABET to IPA conversion script")
    parser.add_argument(
        '--cmu_arpabet',
        help="Path to CMU ARPABET dictionary file",
        type=str,
        default="tts_dataset_files/cmudict-0.7b_nv22.10",
    )
    parser.add_argument("--ipa_out", help="Path to save IPA version of the dictionary", type=str, required=True)
    parser.add_argument(
        "--mapping",
        help="ARPABET to IPA phoneme mapping file",
        type=str,
        default="tts_dataset_files/cmudict-arpabet_to_ipa_nv22.10.tsv",
    )
    return parser.parse_args(argv)
def convert_arp_to_ipa(arp_to_ipa_dict: Dict[str, str], arp_input: str, remove_space: bool = False) -> str:
    """
    Transcribes an ARPABET pronunciation into IPA, one syllable at a time.

    Args:
        arp_to_ipa_dict: mapping from ARPABET phonemes to IPA phonemes
        arp_input: whitespace-separated ARPABET phonemes
        remove_space: when True, every space is removed from the result

    Returns:
        IPA form of the input; each syllable is prefixed with its stress mark, if it has one

    Raises:
        ValueError: if a phoneme ends with a digit other than 0, 1 or 2,
            or if a phoneme is missing from arp_to_ipa_dict
    """
    stress_marks = {"0": "", "1": "ˈ", "2": "ˌ"}
    ipa_syllables = []

    # syllabify groups the phonemes into syllables, each one a tuple of phoneme lists,
    # e.g. syllabify(["HH", "AH0", "L", "OW1"]) -> [(['HH'], ['AH0'], []), (['L'], ['OW1'], [])]
    for syllable in syllabify(arp_input.split()):
        stress_mark = ""
        ipa_phones = []

        # chaining the lists flattens the syllable and drops its empty parts,
        # e.g. (['HH'], ['AH0'], []) -> 'HH', 'AH0'
        for arp_phone in itertools.chain.from_iterable(syllable):
            last_char = arp_phone[-1]
            ends_with_digit = last_char.isdigit()

            # a trailing digit encodes the stress of the syllable, e.g. `AH0`
            if ends_with_digit:
                if last_char not in stress_marks:
                    raise ValueError(f"{last_char} unknown")
                stress_mark = stress_marks[last_char]

            # look the phoneme up without its stress digit unless the mapping lists it verbatim
            if arp_phone not in arp_to_ipa_dict and ends_with_digit:
                arp_phone = arp_phone[:-1]

            if arp_phone not in arp_to_ipa_dict:
                raise ValueError(f"|{arp_phone}| phoneme not found in |{arp_input}|")
            ipa_phones.append(arp_to_ipa_dict[arp_phone])

        ipa_syllables.append(stress_mark + " ".join(ipa_phones).strip())

    ipa_word = " ".join(ipa_syllables).strip()
    return ipa_word.replace(" ", "") if remove_space else ipa_word
def _get_arpabet_to_ipa_mapping(arp_ipa_map_file: str) -> Dict[str, str]:
"""
arp_ipa_map_file: Arpabet to IPA phonemes mapping
"""
arp_to_ipa = {}
with open(arp_ipa_map_file, "r", encoding="utf-8") as f:
for line in f:
arp, ipa = line.strip().split("\t")
arp_to_ipa[arp] = ipa
return arp_to_ipa
def convert_cmu_arpabet_to_ipa(arp_ipa_map_file: str, arp_dict_file: str, output_ipa_file: str):
    """
    Converts CMU ARPABET-based dictionary to IPA.

    Lines that start with ";;;" are copied to the output unchanged. For every other non-blank line,
    anything after " #" is dropped, the remainder is split into the graphemes and their ARPABET
    pronunciation, and the pronunciation is written out in IPA form without spaces.
    Blank lines are skipped.

    Args:
        arp_ipa_map_file: ARPABET to IPA phoneme mapping file
        arp_dict_file: path to ARPABET version of CMU dictionary
        output_ipa_file: path to output IPA version of CMU dictionary

    Raises:
        ValueError: if a dictionary entry has no pronunciation, or if its pronunciation
            cannot be converted to IPA
    """
    arp_to_ipa_dict = _get_arpabet_to_ipa_mapping(arp_ipa_map_file)

    with open(arp_dict_file, "r", encoding="utf-8") as f_arp, open(output_ipa_file, "w", encoding="utf-8") as f_ipa:
        for line_num, line in enumerate(f_arp, start=1):
            if line.startswith(";;;"):
                f_ipa.write(line)
                continue

            # Drop the comment that may follow the mapping entry in the dictionary file.
            entry = line.split(" #")[0].strip()
            if not entry:
                continue

            # Split only once, at the first run of whitespace: the pronunciation itself consists of
            # space-separated phonemes, so it must stay in one piece.
            fields = entry.split(maxsplit=1)
            if len(fields) != 2:
                raise ValueError(f"Line {line_num} of {arp_dict_file} has no pronunciation: |{entry}|")
            graphemes, phonemes = fields

            ipa_form = convert_arp_to_ipa(arp_to_ipa_dict, phonemes, remove_space=True)
            f_ipa.write(f"{graphemes} {ipa_form}\n")

    print(f"IPA version of {os.path.abspath(arp_dict_file)} saved in {os.path.abspath(output_ipa_file)}")
# Script entry point: read the command-line options and run the conversion.
if __name__ == "__main__":
    cli_args = parse_args()
    convert_cmu_arpabet_to_ipa(
        arp_ipa_map_file=cli_args.mapping,
        arp_dict_file=cli_args.cmu_arpabet,
        output_ipa_file=cli_args.ipa_out,
    )
|