# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script is used to generate JSON manifests for mel-generator model training. The usage is below.

$ python scripts/dataset_processing/tts/thorsten_neutral/get_data.py \
    --data-root ~/experiments/thorsten_neutral \
    --manifests-root ~/experiments/thorsten_neutral \
    --data-version "22_10" \
    --min-duration 0.1 \
    --normalize-text
"""

import argparse
import json
import random
import shutil
import subprocess
import urllib.request
from pathlib import Path

from joblib import Parallel, delayed
from nemo_text_processing.text_normalization.normalize import Normalizer
from tqdm import tqdm

from nemo.utils import logging

# Thorsten Müller published two neutral voice datasets, 21.02 and 22.10.
THORSTEN_NEUTRAL = {
    "21_02": {
        "url": "https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1",
        "dir_name": "thorsten-de_v03",
        "metadata": ["metadata.csv"],
    },
    "22_10": {
        "url": "https://zenodo.org/record/7265581/files/ThorstenVoice-Dataset_2022.10.zip?download=1",
        "dir_name": "ThorstenVoice-Dataset_2022.10",
        "metadata": ["metadata_train.csv", "metadata_dev.csv", "metadata_test.csv"],
    },
}


def get_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Download Thorsten Müller's neutral voice dataset and create manifests with predefined split. "
        "Thorsten Müller published two neutral voice datasets, 21.02 and 22.10, where 22.10 provides better "
        "audio quality. Please choose one of the two for your TTS models. Details about the dataset are "
        "in https://github.com/thorstenMueller/Thorsten-Voice.",
    )
    parser.add_argument("--data-root", required=True, type=Path, help="where the resulting dataset will reside.")
    parser.add_argument("--manifests-root", required=True, type=Path, help="where the manifest files will reside.")
    parser.add_argument("--data-version", default="22_10", choices=["21_02", "22_10"], type=str)
    parser.add_argument("--min-duration", default=0.1, type=float)
    parser.add_argument("--max-duration", default=float('inf'), type=float)
    parser.add_argument("--val-size", default=100, type=int)
    parser.add_argument("--test-size", default=100, type=int)
    parser.add_argument(
        "--num-workers",
        default=-1,
        type=int,
        help="Specify the max number of concurrent Python worker processes. "
        "If -1 all CPUs are used. If 1 no parallel computing is used.",
    )
    parser.add_argument(
        "--normalize-text",
        default=False,
        action='store_true',
        help="Normalize original text and add a new entry 'normalized_text' to .json file if True.",
    )
    parser.add_argument(
        "--seed-for-ds-split",
        default=100,
        type=float,
        help="Seed for deterministic split of train/dev/test, NVIDIA's default is 100.",
    )
    args = parser.parse_args()
    return args


def __maybe_download_file(source_url, destination_path):
    if not destination_path.exists():
        logging.info(f"Downloading data: {source_url} --> {destination_path}")
        tmp_file_path = destination_path.with_suffix(".tmp")
        urllib.request.urlretrieve(source_url, filename=tmp_file_path)
        tmp_file_path.rename(destination_path)
    else:
        logging.info(f"Skipped downloading data because it exists: {destination_path}")


def __extract_file(filepath, data_dir):
    logging.info(f"Unzipping data: {filepath} --> {data_dir}")
    shutil.unpack_archive(filepath, data_dir)
    logging.info(f"Unzipping data is complete: {filepath}.")


def __save_json(json_file, dict_list):
    logging.info(f"Saving JSON split to {json_file}.")
    with open(json_file, "w") as f:
        for d in dict_list:
            f.write(json.dumps(d) + "\n")


def __text_normalization(json_file, num_workers=-1):
    text_normalizer_call_kwargs = {
        "punct_pre_process": True,
        "punct_post_process": True,
    }

    text_normalizer = Normalizer(
        lang="de", input_case="cased", overwrite_cache=True, cache_dir=str(json_file.parent / "cache_dir"),
    )

    def normalizer_call(x):
        return text_normalizer.normalize(x, **text_normalizer_call_kwargs)

    def add_normalized_text(line_dict):
        normalized_text = normalizer_call(line_dict["text"])
        line_dict.update({"normalized_text": normalized_text})
        return line_dict

    logging.info(f"Normalizing text for {json_file}.")
    with open(json_file, 'r', encoding='utf-8') as fjson:
        lines = fjson.readlines()
        # Note: you need to verify which backend works well on your cluster.
        # backend="loky" is fine on multi-core Ubuntu OS; backend="threading" on Slurm.
        dict_list = Parallel(n_jobs=num_workers)(
            delayed(add_normalized_text)(json.loads(line)) for line in tqdm(lines)
        )

    json_file_text_normed = json_file.parent / f"{json_file.stem}_text_normed{json_file.suffix}"
    with open(json_file_text_normed, 'w', encoding="utf-8") as fjson_norm:
        for dct in dict_list:
            fjson_norm.write(json.dumps(dct) + "\n")
    logging.info(f"Normalizing text is complete: {json_file} --> {json_file_text_normed}")


def __process_data(
    unzipped_dataset_path, metadata, min_duration, max_duration, val_size, test_size, seed_for_ds_split
):
    logging.info("Preparing JSON train/val/test splits.")
    entries = list()
    not_found_wavs = list()
    wrong_duration_wavs = list()
    for metadata_fname in metadata:
        meta_file = unzipped_dataset_path / metadata_fname
        with open(meta_file, 'r') as fmeta:
            for line in tqdm(fmeta):
                items = line.strip().split('|')
                wav_file_stem, text = items[0], items[1]
                wav_file = unzipped_dataset_path / "wavs" / f"{wav_file_stem}.wav"

                # skip audios if they do not exist.
                if not wav_file.exists():
                    not_found_wavs.append(wav_file)
                    logging.warning(f"Skipping {wav_file}: it is not found.")
                    continue

                # skip audios if their duration is out of range.
                # `soxi -D` prints the audio duration in seconds; the SoX CLI must be installed.
                duration = subprocess.check_output(f"soxi -D {wav_file}", shell=True)
                duration = float(duration)
                if min_duration <= duration <= max_duration:
                    entry = {
                        'audio_filepath': str(wav_file),
                        'duration': duration,
                        'text': text,
                    }
                    entries.append(entry)
                elif duration < min_duration:
                    wrong_duration_wavs.append(wav_file)
                    logging.warning(f"Skipping {wav_file}: it is too short, less than {min_duration} seconds.")
                    continue
                else:
                    wrong_duration_wavs.append(wav_file)
                    logging.warning(f"Skipping {wav_file}: it is too long, greater than {max_duration} seconds.")
                    continue

    random.Random(seed_for_ds_split).shuffle(entries)
    train_size = len(entries) - val_size - test_size
    if train_size <= 0:
        raise ValueError("Not enough data for the train split.")

    logging.info("Preparing JSON train/val/test splits is complete.")
    train, val, test = (
        entries[:train_size],
        entries[train_size : train_size + val_size],
        entries[train_size + val_size :],
    )
    return train, val, test, not_found_wavs, wrong_duration_wavs


def main():
    args = get_args()

    data_root = args.data_root
    manifests_root = args.manifests_root
    data_version = args.data_version

    dataset_root = data_root / f"ThorstenVoice-Dataset-{data_version}"
    dataset_root.mkdir(parents=True, exist_ok=True)

    # download and extract dataset
    dataset_url = THORSTEN_NEUTRAL[data_version]["url"]
    zipped_dataset_path = dataset_root / Path(dataset_url).name.split("?")[0]
    __maybe_download_file(dataset_url, zipped_dataset_path)
    __extract_file(zipped_dataset_path, dataset_root)

    # generate train/dev/test splits
    unzipped_dataset_path = dataset_root / THORSTEN_NEUTRAL[data_version]["dir_name"]
    entries_train, entries_val, entries_test, not_found_wavs, wrong_duration_wavs = __process_data(
        unzipped_dataset_path=unzipped_dataset_path,
        metadata=THORSTEN_NEUTRAL[data_version]["metadata"],
        min_duration=args.min_duration,
        max_duration=args.max_duration,
        val_size=args.val_size,
        test_size=args.test_size,
        seed_for_ds_split=args.seed_for_ds_split,
    )

    # save json splits.
    train_json = manifests_root / "train_manifest.json"
    val_json = manifests_root / "val_manifest.json"
    test_json = manifests_root / "test_manifest.json"
    __save_json(train_json, entries_train)
    __save_json(val_json, entries_val)
    __save_json(test_json, entries_test)

    # save skipped audios that are not found into a file.
    if len(not_found_wavs) > 0:
        skipped_not_found_file = manifests_root / "skipped_not_found_wavs.list"
        with open(skipped_not_found_file, "w") as f_notfound:
            for line in not_found_wavs:
                f_notfound.write(f"{line}\n")

    # save skipped audios that are too short or too long into a file.
    if len(wrong_duration_wavs) > 0:
        skipped_wrong_duration_file = manifests_root / "skipped_wrong_duration_wavs.list"
        with open(skipped_wrong_duration_file, "w") as f_wrong_dur:
            for line in wrong_duration_wavs:
                f_wrong_dur.write(f"{line}\n")

    # normalize text if requested. New JSON files suffixed with `_text_normed`,
    # e.g. train_manifest_text_normed.json, will be generated.
    if args.normalize_text:
        __text_normalization(train_json, args.num_workers)
        __text_normalization(val_json, args.num_workers)
        __text_normalization(test_json, args.num_workers)


if __name__ == "__main__":
    main()