NeMo / scripts /speech_recognition /code_switching /code_switching_manifest_creation.py
camenduru's picture
thanks to NVIDIA ❤
7934b29
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
import random
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
# Checks -
# (Recommendation) Please normalize the text for each language (avoid numbers, special characters, punctuation)
# Please ensure that the audio_fielpaths are absolute locations
parser = argparse.ArgumentParser(description='Create synthetic code-switching data manifest from monolingual data')
parser.add_argument("--manifest_language1", default=None, type=str, help='Manifest file for language 1', required=True)
parser.add_argument("--manifest_language2", default=None, type=str, help='Manifest file for language 2', required=True)
parser.add_argument(
"--manifest_save_path", default=None, type=str, help='Path to save created CS indermediate manifest', required=True
)
parser.add_argument(
"--id_language1", default=None, type=str, help='Identifier for language 1, eg: en, es, hi', required=True
)
parser.add_argument(
"--id_language2", default=None, type=str, help='Identifier for language 2, eg: en, es, hi', required=True
)
parser.add_argument("--max_sample_duration_sec", default=19, type=int, help='Maximum duration of sample (sec)')
parser.add_argument("--min_sample_duration_sec", default=16, type=int, help='Minimum duration of sample (sec)')
parser.add_argument("--dataset_size_required_hrs", default=1, type=int, help='Duration of dataset required (hrs)')
args = parser.parse_args()
def create_cs_manifest(
data_lang_0: list,
data_lang_1: list,
lid_lang_0: str,
lid_lang_1: str,
max_sample_duration_sec: int,
min_sample_duration_sec: int,
data_requirement_hrs: int,
):
"""
Args:
data_lang_0: Manifest entries from first langauge
data_lang_1: Manifest entries from second langauge
lid_lang_0: Language ID marker for first langauge
lid_lang_1: Language ID marker for second langauge
max_sample_duration_sec: Maximum permissible duration of generated CS sample in sec
min_sample_duration_sec: Minimum permissible duration of generated CS sample in sec
data_requirement_hrs: Required size of generated corpus
Returns:
Created synthetic CS manifest as list
"""
total_duration = 0
constructed_data = []
sample_id = 0
num_samples_lang0 = len(data_lang_0)
num_samples_lang1 = len(data_lang_1)
while total_duration < (data_requirement_hrs * 3600):
created_sample_duration_sec = 0
created_sample_dict = {}
created_sample_dict['lang_ids'] = []
created_sample_dict['texts'] = []
created_sample_dict['paths'] = []
created_sample_dict['durations'] = []
while created_sample_duration_sec < min_sample_duration_sec:
lang_selection = random.randint(0, 1)
if lang_selection == 0:
index = random.randint(0, num_samples_lang0 - 1)
sample = data_lang_0[index]
lang_id = lid_lang_0
else:
index = random.randint(0, num_samples_lang1 - 1)
sample = data_lang_1[index]
lang_id = lid_lang_1
if (created_sample_duration_sec + sample['duration']) > max_sample_duration_sec:
continue
else:
created_sample_duration_sec += sample['duration']
created_sample_dict['lang_ids'].append(lang_id)
created_sample_dict['texts'].append(sample['text'])
created_sample_dict['paths'].append(sample['audio_filepath'])
created_sample_dict['durations'].append(sample['duration'])
created_sample_dict['total_duration'] = created_sample_duration_sec
# adding a uid which will be used to save the generated audio file later
created_sample_dict['uid'] = sample_id
sample_id += 1
constructed_data.append(created_sample_dict)
total_duration += created_sample_duration_sec
return constructed_data
def main():
manifest0 = args.manifest_language1
manifest1 = args.manifest_language2
lid0 = args.id_language1
lid1 = args.id_language2
min_sample_duration = args.min_sample_duration_sec
max_sample_duration = args.max_sample_duration_sec
dataset_requirement = args.dataset_size_required_hrs
manifest_save_path = args.manifest_save_path
# Sanity Checks
if (manifest0 is None) or (not os.path.exists(manifest0)):
logging.error('Manifest for language 1 is incorrect')
exit
if (manifest1 is None) or (not os.path.exists(manifest1)):
logging.error('Manifest for language 2 is incorrect')
exit
if lid0 is None:
logging.error('Please provide correct language code for language 1')
exit
if lid1 is None:
logging.error('Please provide correct language code for language 2')
exit
if manifest_save_path is None:
logging.error('Please provide correct manifest save path')
exit
if min_sample_duration >= max_sample_duration:
logging.error('Please ensure max_sample_duration > min_sample_duration')
exit
# Reading data
logging.info('Reading manifests')
data_language0 = read_manifest(manifest0)
data_language1 = read_manifest(manifest1)
# Creating the CS data Manifest
logging.info('Creating CS manifest')
constructed_data = create_cs_manifest(
data_language0, data_language1, lid0, lid1, max_sample_duration, min_sample_duration, dataset_requirement
)
# Saving Manifest
logging.info('saving manifest')
write_manifest(manifest_save_path, constructed_data)
print("Synthetic CS manifest saved at :", manifest_save_path)
logging.info('Done!')
if __name__ == "__main__":
main()