# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Download the HiFiTTS archive and build one JSON-lines manifest per split.

The archive is downloaded to ``<data-root>/hi_fi_tts_v0.tar.gz`` (skipped when
that file already exists), extracted into ``<data-root>``, and the per-speaker
``*_<split>.json`` manifests found in ``<data-root>/hi_fi_tts_v0`` are merged
into ``<split>_manifest.json`` files in that same directory.
"""

import argparse
import glob
import json
import re
import tarfile
import urllib.request
from pathlib import Path
from typing import Iterable

try:
    from tqdm import tqdm
except ImportError:  # tqdm only draws the progress bar; keep working without it.

    def tqdm(iterable, *args, **kwargs):
        """Fallback for a missing tqdm: return the iterable unchanged."""
        return iterable


URL = "https://us.openslr.org/resources/109/hi_fi_tts_v0.tar.gz"

# Splits that have source manifests in the archive.
_SPLITS = ('train', 'dev', 'test')

# Source manifest file names look like "<speaker_id>_manifest_<audio_quality>_<split>.json".
_MANIFEST_NAME_RE = re.compile(r'([0-9]+)_manifest_([a-z]+)_.*\.json')


def get_args() -> argparse.Namespace:
    """Parse the command-line arguments.

    Returns:
        Namespace with ``data_root`` (a ``Path``) and ``split`` (one of
        ``all``, ``train``, ``dev``, ``test``).
    """
    parser = argparse.ArgumentParser(description='Download HiFiTTS and create manifests with predefined split')
    parser.add_argument(
        "--data-root",
        required=True,
        type=Path,
        help='Directory into which to download and extract dataset. {data-root}/hi_fi_tts_v0 will be created.',
    )
    parser.add_argument(
        '--split',
        type=str,
        default='all',
        choices=('all',) + _SPLITS,
        help='Choose to generate manifest for all or one of (train, dev, test), '
        'note that this will still download the full dataset.',
    )
    return parser.parse_args()


def __maybe_download_file(source_url: str, destination_path: Path) -> None:
    """Download ``source_url`` to ``destination_path`` unless it already exists.

    The data is first written to a sibling ``.tmp`` file and renamed only after
    the download finished, so an interrupted download is never mistaken for a
    complete archive on the next run.
    """
    if destination_path.exists():
        return

    # urlretrieve does not create missing directories.
    destination_path.parent.mkdir(parents=True, exist_ok=True)

    tmp_file_path = destination_path.with_suffix('.tmp')
    urllib.request.urlretrieve(source_url, filename=str(tmp_file_path))
    tmp_file_path.rename(destination_path)


def __extract_file(filepath: str, data_dir: str) -> None:
    """Extract the tar archive ``filepath`` into ``data_dir`` (best effort).

    Extraction errors are reported on stdout but not raised; missing files are
    detected later while the manifests are built.
    """
    try:
        # The context manager closes the archive even when extraction fails.
        with tarfile.open(filepath) as tar:
            if hasattr(tarfile, 'data_filter'):
                # Refuse members that would escape data_dir (absolute paths, bad links, ...).
                tar.extractall(data_dir, filter='data')
            else:
                # Older Python without extraction filters.
                tar.extractall(data_dir)
    except Exception:
        print(f"Error while extracting {filepath}. Already extracted?")


def __process_data(data_root: Path, filelists: Iterable[str]) -> None:
    """Create ``<split>_manifest.json`` in ``data_root`` for every requested split.

    Args:
        data_root: Extracted dataset directory holding the source manifests and
            the audio files they reference (paths are relative to this directory).
        filelists: Split names to process, e.g. ``['train', 'dev', 'test']``.

    Raises:
        FileNotFoundError: If an audio file referenced by a source manifest is missing.
    """
    # Create manifests (based on predefined NVIDIA's split)
    for split in tqdm(filelists):
        manifest_target = data_root / f"{split}_manifest.json"
        print(f"Creating manifest for {split}.")

        entries = []
        # Sorted so the generated manifest does not depend on directory listing order.
        for manifest_src in sorted(glob.glob(str(data_root / f"*_{split}.json"))):
            # Match on the file name only, so the path separator does not matter.
            name_match = _MANIFEST_NAME_RE.match(Path(manifest_src).name)
            if name_match is None:
                print(f"Failed to find speaker id or audio quality for {manifest_src}, check formatting.")
                continue
            speaker_id, audio_quality = name_match.groups()

            with open(manifest_src, 'r', encoding='utf-8') as f_in:
                for input_json_entry in f_in:
                    if not input_json_entry.strip():
                        continue  # tolerate blank lines
                    data = json.loads(input_json_entry)

                    # Make sure corresponding wavfile exists
                    wav_path = data_root / data['audio_filepath']
                    if not wav_path.exists():
                        raise FileNotFoundError(f"{wav_path} does not exist!")

                    entries.append(
                        {
                            'audio_filepath': data['audio_filepath'],
                            'duration': data['duration'],
                            'text': data['text'],
                            'normalized_text': data['text_normalized'],
                            'speaker': int(speaker_id),
                            # Audio_quality is either clean or other.
                            # The clean set includes recordings with high sound-to-noise ratio and wide bandwidth.
                            # The books with noticeable noise or narrow bandwidth are included in the other subset.
                            # Note: some speaker_id's have both clean and other audio quality.
                            'audio_quality': audio_quality,
                        }
                    )

        with open(manifest_target, 'w', encoding='utf-8') as f_out:
            f_out.writelines(json.dumps(entry) + '\n' for entry in entries)


def main() -> None:
    """Download, extract and build the manifests for the requested split(s)."""
    args = get_args()

    # A single split must be wrapped in a list; list('train') would yield its characters.
    splits = list(_SPLITS) if args.split == 'all' else [args.split]

    tarred_data_path = args.data_root / "hi_fi_tts_v0.tar.gz"

    __maybe_download_file(URL, tarred_data_path)
    __extract_file(str(tarred_data_path), str(args.data_root))

    data_root = args.data_root / "hi_fi_tts_v0"
    __process_data(data_root, splits)


if __name__ == '__main__':
    main()