File size: 6,440 Bytes
7934b29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
import os
import random
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest

# Checks -
# (Recommendation) Please normalize the text for each language (avoid numbers, special characters, punctuation)
# Please ensure that every audio_filepath in the manifests is an absolute path


# Command-line interface of the script.
# NOTE: the arguments are parsed at module level (i.e. also at import time); main() reads the
# resulting module-level `args` namespace.
parser = argparse.ArgumentParser(description='Create synthetic code-switching data manifest from monolingual data')

# Required inputs: argparse itself rejects the command line when any of these is missing,
# so no explicit default is needed.
parser.add_argument("--manifest_language1", type=str, help='Manifest file for language 1', required=True)
parser.add_argument("--manifest_language2", type=str, help='Manifest file for language 2', required=True)
parser.add_argument(
    "--manifest_save_path", type=str, help='Path to save created CS indermediate manifest', required=True
)
parser.add_argument("--id_language1", type=str, help='Identifier for language 1, eg: en, es, hi', required=True)
parser.add_argument("--id_language2", type=str, help='Identifier for language 2, eg: en, es, hi', required=True)

# Optional tuning knobs. They are parsed as floats so that fractional values (e.g. a 0.5 hour
# dataset or a 17.5 second limit) are accepted; whole-number inputs keep working as before and
# the integer defaults are unchanged.
parser.add_argument("--max_sample_duration_sec", default=19, type=float, help='Maximum duration of sample (sec)')
parser.add_argument("--min_sample_duration_sec", default=16, type=float, help='Minimum duration of sample (sec)')
parser.add_argument("--dataset_size_required_hrs", default=1, type=float, help='Duration of dataset required (hrs)')

args = parser.parse_args()


def create_cs_manifest(
    data_lang_0: list,
    data_lang_1: list,
    lid_lang_0: str,
    lid_lang_1: str,
    max_sample_duration_sec: int,
    min_sample_duration_sec: int,
    data_requirement_hrs: int,
):
    """
    Build a synthetic code-switching (CS) manifest by randomly chaining monolingual utterances.

    Each generated sample is assembled by repeatedly picking one of the two languages at random
    (uniformly), then a random utterance of that language, and appending it as long as the
    sample stays within max_sample_duration_sec. A sample is complete once its accumulated
    duration reaches min_sample_duration_sec. Samples are generated until their summed duration
    reaches data_requirement_hrs.

    Args:
        data_lang_0: Manifest entries from first language; every entry must provide the keys
            'duration', 'text' and 'audio_filepath'
        data_lang_1: Manifest entries from second language (same keys as data_lang_0)
        lid_lang_0: Language ID marker for first language
        lid_lang_1: Language ID marker for second language
        max_sample_duration_sec: Maximum permissible duration of generated CS sample in sec
        min_sample_duration_sec: Minimum permissible duration of generated CS sample in sec
        data_requirement_hrs: Required size of generated corpus in hours

    Returns:
        Created synthetic CS manifest as list. Every element is a dict with the keys
        'lang_ids', 'texts', 'paths', 'durations' (parallel lists, one item per chained
        utterance), 'total_duration' and 'uid' (running index of the sample).

    Raises:
        ValueError: If a manifest is empty, if min_sample_duration_sec is not positive, or if
            every utterance is longer than max_sample_duration_sec.
        RuntimeError: If too many consecutive attempts end in a partial sample that is shorter
            than min_sample_duration_sec but cannot be extended without exceeding
            max_sample_duration_sec.
    """

    required_duration_sec = data_requirement_hrs * 3600
    if required_duration_sec <= 0:
        # Nothing was requested; the generation loop would not run at all.
        return []

    num_samples_lang0 = len(data_lang_0)
    num_samples_lang1 = len(data_lang_1)

    if num_samples_lang0 == 0 or num_samples_lang1 == 0:
        raise ValueError('Both language manifests must contain at least one entry')

    if min_sample_duration_sec <= 0:
        # A non-positive minimum yields empty samples, so the corpus duration would never grow.
        raise ValueError('min_sample_duration_sec must be positive')

    # Shortest utterance over both languages: if even this one does not fit into the remaining
    # budget of a sample, no utterance does and random retries can never succeed.
    # NOTE(review): durations are assumed to be positive numbers - confirm against the manifests.
    shortest_duration = min(
        min(entry['duration'] for entry in data_lang_0), min(entry['duration'] for entry in data_lang_1),
    )
    if shortest_duration > max_sample_duration_sec:
        raise ValueError('Every utterance is longer than max_sample_duration_sec; cannot create any sample')

    # Upper bound on back-to-back discarded samples before the duration limits are declared infeasible.
    max_consecutive_restarts = 1000
    consecutive_restarts = 0

    total_duration = 0
    constructed_data = []
    sample_id = 0

    while total_duration < required_duration_sec:

        created_sample_duration_sec = 0
        created_sample_dict = {'lang_ids': [], 'texts': [], 'paths': [], 'durations': []}
        dead_end = False

        while created_sample_duration_sec < min_sample_duration_sec:

            if created_sample_duration_sec + shortest_duration > max_sample_duration_sec:
                # The sample is still too short but nothing fits anymore; retrying random
                # picks would loop forever, so give up on this sample.
                dead_end = True
                break

            lang_selection = random.randint(0, 1)

            if lang_selection == 0:
                sample = data_lang_0[random.randint(0, num_samples_lang0 - 1)]
                lang_id = lid_lang_0
            else:
                sample = data_lang_1[random.randint(0, num_samples_lang1 - 1)]
                lang_id = lid_lang_1

            if (created_sample_duration_sec + sample['duration']) > max_sample_duration_sec:
                # This utterance would overshoot the maximum; draw another one.
                continue

            created_sample_duration_sec += sample['duration']
            created_sample_dict['lang_ids'].append(lang_id)
            created_sample_dict['texts'].append(sample['text'])
            created_sample_dict['paths'].append(sample['audio_filepath'])
            created_sample_dict['durations'].append(sample['duration'])

        if dead_end:
            # Discard the partial sample and start over with a fresh random combination.
            consecutive_restarts += 1
            if consecutive_restarts > max_consecutive_restarts:
                raise RuntimeError(
                    'Unable to combine utterances into a sample whose duration lies between '
                    'min_sample_duration_sec and max_sample_duration_sec'
                )
            continue
        consecutive_restarts = 0

        created_sample_dict['total_duration'] = created_sample_duration_sec

        # adding a uid which will be used to save the generated audio file later
        created_sample_dict['uid'] = sample_id
        sample_id += 1

        constructed_data.append(created_sample_dict)
        total_duration += created_sample_duration_sec

    return constructed_data


def main():
    """
    Validate the parsed command-line arguments, create the synthetic CS manifest and save it.

    The function reads the module-level ``args`` namespace, loads both monolingual manifests
    with ``read_manifest``, builds the code-switching manifest with ``create_cs_manifest`` and
    stores the result at ``args.manifest_save_path`` with ``write_manifest``.

    Raises:
        SystemExit: With exit status 1 when one of the sanity checks fails.
    """

    manifest0 = args.manifest_language1
    manifest1 = args.manifest_language2
    lid0 = args.id_language1
    lid1 = args.id_language2
    min_sample_duration = args.min_sample_duration_sec
    max_sample_duration = args.max_sample_duration_sec
    dataset_requirement = args.dataset_size_required_hrs
    manifest_save_path = args.manifest_save_path

    def abort(message):
        # Log the problem and stop the script. A bare `exit` expression (without a call) is a
        # no-op, so the script would otherwise keep running with invalid inputs.
        logging.error(message)
        raise SystemExit(1)

    # Sanity Checks
    if (manifest0 is None) or (not os.path.exists(manifest0)):
        abort('Manifest for language 1 is incorrect')

    if (manifest1 is None) or (not os.path.exists(manifest1)):
        abort('Manifest for language 2 is incorrect')

    if lid0 is None:
        abort('Please provide correct language code for language 1')

    if lid1 is None:
        abort('Please provide correct language code for language 2')

    if manifest_save_path is None:
        abort('Please provide correct manifest save path')

    if min_sample_duration >= max_sample_duration:
        abort('Please ensure max_sample_duration > min_sample_duration')

    # Reading data
    logging.info('Reading manifests')
    data_language0 = read_manifest(manifest0)
    data_language1 = read_manifest(manifest1)

    # Creating the CS data Manifest
    logging.info('Creating CS manifest')
    constructed_data = create_cs_manifest(
        data_language0, data_language1, lid0, lid1, max_sample_duration, min_sample_duration, dataset_requirement
    )

    # Saving Manifest
    logging.info('saving manifest')
    write_manifest(manifest_save_path, constructed_data)

    print("Synthetic CS manifest saved at :", manifest_save_path)

    logging.info('Done!')


# Call main() only when this file is executed as a script.
if __name__ == "__main__":
    main()