File size: 6,440 Bytes
7934b29 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
import random
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
# Checks -
# (Recommendation) Please normalize the text for each language (avoid numbers, special characters, punctuation)
# Please ensure that the audio_filepaths are absolute locations
# Command-line interface of the script. The parsed namespace is stored in the
# module-level ``args`` variable, which ``main()`` reads.
parser = argparse.ArgumentParser(description='Create synthetic code-switching data manifest from monolingual data')
parser.add_argument("--manifest_language1", default=None, type=str, help='Manifest file for language 1', required=True)
parser.add_argument("--manifest_language2", default=None, type=str, help='Manifest file for language 2', required=True)
parser.add_argument(
    "--manifest_save_path", default=None, type=str, help='Path to save created CS intermediate manifest', required=True
)
parser.add_argument(
    "--id_language1", default=None, type=str, help='Identifier for language 1, eg: en, es, hi', required=True
)
parser.add_argument(
    "--id_language2", default=None, type=str, help='Identifier for language 2, eg: en, es, hi', required=True
)
# Durations are parsed as floats so fractional values (e.g. 0.5 hrs, 17.5 sec)
# are accepted; whole-number inputs keep working exactly as before.
parser.add_argument("--max_sample_duration_sec", default=19, type=float, help='Maximum duration of sample (sec)')
parser.add_argument("--min_sample_duration_sec", default=16, type=float, help='Minimum duration of sample (sec)')
parser.add_argument("--dataset_size_required_hrs", default=1, type=float, help='Duration of dataset required (hrs)')
args = parser.parse_args()
def create_cs_manifest(
    data_lang_0: list,
    data_lang_1: list,
    lid_lang_0: str,
    lid_lang_1: str,
    max_sample_duration_sec: int,
    min_sample_duration_sec: int,
    data_requirement_hrs: int,
):
    """
    Build a synthetic code-switching manifest by randomly chaining monolingual utterances.

    Each generated sample is assembled by repeatedly picking one of the two languages at
    random, then a random entry of that language, and appending it as long as the sample
    stays within ``max_sample_duration_sec``. A sample is complete once its duration reaches
    ``min_sample_duration_sec``. Samples are generated until their summed duration reaches
    ``data_requirement_hrs`` hours.

    Every manifest entry must provide the keys ``'duration'``, ``'text'`` and
    ``'audio_filepath'``.

    Args:
        data_lang_0: Manifest entries from first language
        data_lang_1: Manifest entries from second language
        lid_lang_0: Language ID marker for first language
        lid_lang_1: Language ID marker for second language
        max_sample_duration_sec: Maximum permissible duration of generated CS sample in sec
        min_sample_duration_sec: Minimum permissible duration of generated CS sample in sec
        data_requirement_hrs: Required size of generated corpus

    Returns:
        Created synthetic CS manifest as list. Each element is a dict with the keys
        ``'lang_ids'``, ``'texts'``, ``'paths'``, ``'durations'`` (parallel lists, one item
        per chained utterance), ``'total_duration'`` and ``'uid'`` (running sample index).

    Raises:
        ValueError: If the inputs cannot produce the requested corpus (an empty manifest,
            a non-positive or inconsistent duration range, or utterance durations that can
            never be combined into the ``[min, max]`` duration window).
    """
    target_duration_sec = data_requirement_hrs * 3600
    if target_duration_sec <= 0:
        # Nothing requested: matches the original behaviour of never entering the loop.
        return []

    # Fail fast on inputs that would otherwise crash obscurely or never terminate.
    if not data_lang_0 or not data_lang_1:
        raise ValueError('Both monolingual manifests must contain at least one entry')
    if min_sample_duration_sec <= 0:
        raise ValueError('min_sample_duration_sec must be greater than 0')
    if min_sample_duration_sec > max_sample_duration_sec:
        raise ValueError('min_sample_duration_sec must not exceed max_sample_duration_sec')

    all_durations = [entry['duration'] for entry in data_lang_0]
    all_durations.extend(entry['duration'] for entry in data_lang_1)
    if not any(0 < duration <= max_sample_duration_sec for duration in all_durations):
        raise ValueError('No utterance has a positive duration that fits within max_sample_duration_sec')
    shortest_duration = min(all_durations)

    # A partially built sample can get stuck (too short to finish, no utterance fits).
    # Such samples are discarded and rebuilt; this bounds the number of rebuilds in a row.
    max_consecutive_restarts = 1000
    consecutive_restarts = 0

    total_duration = 0
    constructed_data = []
    sample_id = 0
    num_samples_lang0 = len(data_lang_0)
    num_samples_lang1 = len(data_lang_1)

    while total_duration < target_duration_sec:
        created_sample_duration_sec = 0
        created_sample_dict = {'lang_ids': [], 'texts': [], 'paths': [], 'durations': []}
        stuck = False

        while created_sample_duration_sec < min_sample_duration_sec:
            if created_sample_duration_sec + shortest_duration > max_sample_duration_sec:
                # Even the shortest utterance would overshoot: retrying draws cannot help.
                stuck = True
                break

            lang_selection = random.randint(0, 1)
            if lang_selection == 0:
                index = random.randint(0, num_samples_lang0 - 1)
                sample = data_lang_0[index]
                lang_id = lid_lang_0
            else:
                index = random.randint(0, num_samples_lang1 - 1)
                sample = data_lang_1[index]
                lang_id = lid_lang_1

            if (created_sample_duration_sec + sample['duration']) > max_sample_duration_sec:
                # This utterance would overshoot the maximum; draw another one.
                continue

            created_sample_duration_sec += sample['duration']
            created_sample_dict['lang_ids'].append(lang_id)
            created_sample_dict['texts'].append(sample['text'])
            created_sample_dict['paths'].append(sample['audio_filepath'])
            created_sample_dict['durations'].append(sample['duration'])

        if stuck:
            consecutive_restarts += 1
            if consecutive_restarts >= max_consecutive_restarts:
                raise ValueError(
                    'Could not assemble a sample with duration between min_sample_duration_sec and '
                    'max_sample_duration_sec from the given utterances'
                )
            continue
        consecutive_restarts = 0

        created_sample_dict['total_duration'] = created_sample_duration_sec
        # adding a uid which will be used to save the generated audio file later
        created_sample_dict['uid'] = sample_id
        sample_id += 1

        constructed_data.append(created_sample_dict)
        total_duration += created_sample_duration_sec

    return constructed_data
def main():
    """
    Validate the command-line arguments, create the synthetic CS manifest and save it.

    Configuration is read from the module-level ``args`` namespace. The two monolingual
    manifests are loaded with ``read_manifest``, combined by ``create_cs_manifest`` and the
    result is stored at ``args.manifest_save_path`` with ``write_manifest``.

    Raises:
        SystemExit: With exit status 1 when any sanity check on the arguments fails.
    """
    manifest0 = args.manifest_language1
    manifest1 = args.manifest_language2
    lid0 = args.id_language1
    lid1 = args.id_language2
    min_sample_duration = args.min_sample_duration_sec
    max_sample_duration = args.max_sample_duration_sec
    dataset_requirement = args.dataset_size_required_hrs
    manifest_save_path = args.manifest_save_path

    # Sanity Checks: (failure condition, message) pairs, evaluated in order.
    sanity_checks = [
        ((manifest0 is None) or (not os.path.exists(manifest0)), 'Manifest for language 1 is incorrect'),
        ((manifest1 is None) or (not os.path.exists(manifest1)), 'Manifest for language 2 is incorrect'),
        (lid0 is None, 'Please provide correct language code for language 1'),
        (lid1 is None, 'Please provide correct language code for language 2'),
        (manifest_save_path is None, 'Please provide correct manifest save path'),
        (min_sample_duration >= max_sample_duration, 'Please ensure max_sample_duration > min_sample_duration'),
    ]
    for failed, message in sanity_checks:
        if failed:
            logging.error(message)
            # Abort here: a bare ``exit`` expression would not stop execution.
            raise SystemExit(1)

    # Reading data
    logging.info('Reading manifests')
    data_language0 = read_manifest(manifest0)
    data_language1 = read_manifest(manifest1)

    # Creating the CS data Manifest
    logging.info('Creating CS manifest')
    constructed_data = create_cs_manifest(
        data_language0, data_language1, lid0, lid1, max_sample_duration, min_sample_duration, dataset_requirement
    )

    # Saving Manifest
    logging.info('saving manifest')
    write_manifest(manifest_save_path, constructed_data)

    print("Synthetic CS manifest saved at :", manifest_save_path)

    logging.info('Done!')
# Run the manifest creation only when this file is executed as a script.
if __name__ == "__main__":
    main()
|