# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import glob
import json
import re
import tarfile
import urllib.request
from pathlib import Path

from tqdm import tqdm


def get_args():
    parser = argparse.ArgumentParser(description='Download HiFiTTS and create manifests with the predefined split')
    parser.add_argument(
        "--data-root",
        required=True,
        type=Path,
        help='Directory into which to download and extract dataset. {data-root}/hi_fi_tts_v0 will be created.',
    )
    parser.add_argument(
        '--split',
        type=str,
        default='all',
        help='Choose to generate manifests for all splits or only one of (train, dev, test); note that this will still download the full dataset.',
    )

    args = parser.parse_args()
    return args
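
# Example invocation (illustrative; the script filename below is a placeholder, not taken from this file):
#   python get_hifitts_data.py --data-root /path/to/datasets --split dev
# With the default --split all, manifests for the train, dev, and test splits are all written.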


URL = "https://us.openslr.org/resources/109/hi_fi_tts_v0.tar.gz"


def __maybe_download_file(source_url, destination_path):
    if not destination_path.exists():
        tmp_file_path = destination_path.with_suffix('.tmp')
        urllib.request.urlretrieve(source_url, filename=str(tmp_file_path))
        tmp_file_path.rename(destination_path)


def __extract_file(filepath, data_dir):
    try:
        with tarfile.open(filepath) as tar:
            tar.extractall(data_dir)
    except Exception:
        print(f"Error while extracting {filepath}. Already extracted?")


def __process_data(data_root, filelists):
    # Create manifests (based on NVIDIA's predefined split)
    for split in tqdm(filelists):
        manifest_target = data_root / f"{split}_manifest.json"
        print(f"Creating manifest for {split}.")

        entries = []
        for manifest_src in glob.glob(str(data_root / f"*_{split}.json")):
            try:
                search_res = re.search(r'.*/([0-9]+)_manifest_([a-z]+)_.*\.json', manifest_src)
                speaker_id = search_res.group(1)
                audio_quality = search_res.group(2)
            except Exception:
                print(f"Failed to find speaker id or audio quality for {manifest_src}, check formatting.")
                continue

            with open(manifest_src, 'r') as f_in:
                for input_json_entry in f_in:
                    data = json.loads(input_json_entry)

                    # Make sure corresponding wavfile exists
                    wav_path = data_root / data['audio_filepath']
                    assert wav_path.exists(), f"{wav_path} does not exist!"

                    entry = {
                        'audio_filepath': data['audio_filepath'],
                        'duration': data['duration'],
                        'text': data['text'],
                        'normalized_text': data['text_normalized'],
                        'speaker': int(speaker_id),
                        # audio_quality is either "clean" or "other".
                        # The clean subset contains recordings with a high signal-to-noise ratio and wide bandwidth;
                        # books with noticeable noise or narrow bandwidth go into the other subset.
                        # Note: some speaker_ids appear in both the clean and other subsets.
                        'audio_quality': audio_quality,
                    }
                    entries.append(entry)
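
        # Each line written below is one JSON manifest entry. Illustrative example
        # (field values are placeholders, not taken from the real dataset):
        #   {"audio_filepath": "<relative path to wav>", "duration": 3.1, "text": "<raw text>",
        #    "normalized_text": "<normalized text>", "speaker": 92, "audio_quality": "clean"}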

        with open(manifest_target, 'w') as f_out:
            for m in entries:
                f_out.write(json.dumps(m) + '\n')


def main():
    args = get_args()

    # Process all predefined splits, or only the single split requested on the command line.
    split = ['train', 'dev', 'test'] if args.split == 'all' else [args.split]

    tarred_data_path = args.data_root / "hi_fi_tts_v0.tar.gz"

    __maybe_download_file(URL, tarred_data_path)
    __extract_file(str(tarred_data_path), str(args.data_root))

    data_root = args.data_root / "hi_fi_tts_v0"
    __process_data(data_root, split)


if __name__ == '__main__':
    main()