NeMo

File size: 10,664 Bytes

7934b29

# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script is used to generate JSON manifests for mel-generator model training. The usage is below.

$ python scripts/dataset_processing/tts/thorsten_neutral/get_data.py \
    --data-root ~/experiments/thorsten_neutral \
    --manifests-root ~/experiments/thorsten_neutral \
    --data-version "22_10" \
    --min-duration 0.1 \
    --normalize-text
"""

import argparse
import json
import random
import shutil
import subprocess
import urllib.request
from pathlib import Path

from joblib import Parallel, delayed
from nemo_text_processing.text_normalization.normalize import Normalizer
from tqdm import tqdm

from nemo.utils import logging

# Thorsten Müller published two neutral voice datasets, 21.02 and 22.10.
# Each key is a valid value of --data-version and maps to:
#   "url":      archive that is downloaded,
#   "dir_name": directory expected under the dataset root after extraction,
#   "metadata": metadata file(s) inside that directory, read as "<wav stem>|<text>" lines.
THORSTEN_NEUTRAL = {
    "21_02": {
        "url": "https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1",
        "dir_name": "thorsten-de_v03",
        "metadata": ["metadata.csv"],
    },
    "22_10": {
        "url": "https://zenodo.org/record/7265581/files/ThorstenVoice-Dataset_2022.10.zip?download=1",
        "dir_name": "ThorstenVoice-Dataset_2022.10",
        "metadata": ["metadata_train.csv", "metadata_dev.csv", "metadata_test.csv"],
    },
}


def get_args():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace with the attributes data_root, manifests_root (both Path), data_version,
        min_duration, max_duration, val_size, test_size, num_workers, normalize_text and
        seed_for_ds_split.
    """
    summary = (
        "Download Thorsten Müller's neutral voice dataset and create manifests with predefined split. "
        "Thorsten Müller published two neural voice datasets, 21.02 and 22.10, where 22.10 provides better "
        "audio quality. Please choose one of the two for your TTS models. Details about the dataset are "
        "in https://github.com/thorstenMueller/Thorsten-Voice."
    )
    cli = argparse.ArgumentParser(description=summary, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add_option = cli.add_argument

    # Where data and manifests are stored.
    add_option("--data-root", type=Path, required=True, help="where the resulting dataset will reside.")
    add_option("--manifests-root", type=Path, required=True, help="where the manifests files will reside.")

    # Which release to use and how to filter/split it.
    add_option("--data-version", type=str, choices=["21_02", "22_10"], default="22_10")
    add_option("--min-duration", type=float, default=0.1)
    add_option("--max-duration", type=float, default=float('inf'))
    add_option("--val-size", type=int, default=100)
    add_option("--test-size", type=int, default=100)

    # Processing options.
    add_option(
        "--num-workers",
        type=int,
        default=-1,
        help="Specify the max number of concurrent Python worker processes. "
        "If -1 all CPUs are used. If 1 no parallel computing is used.",
    )
    add_option(
        "--normalize-text",
        action='store_true',
        default=False,
        help="Normalize original text and add a new entry 'normalized_text' to .json file if True.",
    )
    add_option(
        "--seed-for-ds-split",
        type=float,
        default=100,
        help="Seed for deterministic split of train/dev/test, NVIDIA's default is 100.",
    )
    return cli.parse_args()


def __maybe_download_file(source_url, destination_path):
    """Download source_url to destination_path unless destination_path already exists.

    Args:
        source_url: URL handed to urllib.request.urlretrieve.
        destination_path: pathlib.Path of the final file.
    """
    if destination_path.exists():
        logging.info(f"Skipped downloading data because it exists: {destination_path}")
        return

    logging.info(f"Downloading data: {source_url} --> {destination_path}")
    # Fetch into a sibling ".tmp" file and rename afterwards, so destination_path
    # only ever appears once the transfer has finished.
    partial_path = destination_path.with_suffix(".tmp")
    urllib.request.urlretrieve(source_url, filename=partial_path)
    partial_path.rename(destination_path)


def __extract_file(filepath, data_dir):
    """Unpack the archive at filepath into data_dir; the format is inferred by shutil.unpack_archive."""
    logging.info(f"Unzipping data: {filepath} --> {data_dir}")
    shutil.unpack_archive(filename=filepath, extract_dir=data_dir)
    logging.info(f"Unzipping data is complete: {filepath}.")

def __save_json(json_file, dict_list):
    """Write dict_list to json_file as a manifest with one JSON-encoded entry per line."""
    logging.info(f"Saving JSON split to {json_file}.")
    serialized_entries = (f"{json.dumps(entry)}\n" for entry in dict_list)
    with open(json_file, "w") as f:
        f.writelines(serialized_entries)


def __text_normalization(json_file, num_workers=-1):
    """Add a 'normalized_text' field to every entry of a manifest and save the result as a new file.

    The input manifest is left untouched; the output is written next to it as
    ``<stem>_text_normed<suffix>``.

    Args:
        json_file: pathlib.Path of a manifest holding one JSON object with a "text" key per line.
        num_workers: forwarded to joblib.Parallel as n_jobs.
    """
    normalizer = Normalizer(
        lang="de", input_case="cased", overwrite_cache=True, cache_dir=str(json_file.parent / "cache_dir"),
    )
    normalize_options = dict(punct_pre_process=True, punct_post_process=True)

    def with_normalized_text(manifest_entry):
        # Extends the entry in place and returns it so it can be collected from the workers.
        manifest_entry["normalized_text"] = normalizer.normalize(manifest_entry["text"], **normalize_options)
        return manifest_entry

    logging.info(f"Normalizing text for {json_file}.")
    with open(json_file, 'r', encoding='utf-8') as fjson:
        raw_lines = fjson.readlines()

    # Note: you need to verify which backend works well on your cluster.
    # backend="loky" is fine on multi-core Ubuntu OS; backend="threading" on Slurm.
    jobs = (delayed(with_normalized_text)(json.loads(raw_line)) for raw_line in tqdm(raw_lines))
    normalized_entries = Parallel(n_jobs=num_workers)(jobs)

    normed_manifest = json_file.parent / f"{json_file.stem}_text_normed{json_file.suffix}"
    with open(normed_manifest, 'w', encoding="utf-8") as fjson_norm:
        fjson_norm.writelines(json.dumps(entry) + "\n" for entry in normalized_entries)
    logging.info(f"Normalizing text is complete: {json_file} --> {normed_manifest}")


def __process_data(
    unzipped_dataset_path, metadata, min_duration, max_duration, val_size, test_size, seed_for_ds_split
):
    """Build shuffled train/val/test manifest entries from the dataset's metadata files.

    Every non-empty metadata line is split on '|'; the first field is the wav file stem and the
    second one the transcript. The audio is looked up at ``<unzipped_dataset_path>/wavs/<stem>.wav``
    and its duration is queried with ``soxi -D``.

    Args:
        unzipped_dataset_path: pathlib.Path of the extracted dataset directory.
        metadata: iterable of metadata file names located in unzipped_dataset_path.
        min_duration: shortest accepted audio duration in seconds (inclusive).
        max_duration: longest accepted audio duration in seconds (inclusive).
        val_size: number of entries in the validation split.
        test_size: number of entries in the test split.
        seed_for_ds_split: seed of the random.Random instance that shuffles the entries.

    Returns:
        Tuple (train, val, test, not_found_wavs, wrong_duration_wavs). The first three are lists of
        dicts with keys 'audio_filepath', 'duration' and 'text'; the last two are lists of the wav
        paths that were skipped because they are missing or outside the duration range.

    Raises:
        ValueError: if no entry would be left for the train split.
        subprocess.CalledProcessError: if soxi exits with a non-zero status.
    """
    logging.info("Preparing JSON train/val/test splits.")

    entries = []
    not_found_wavs = []
    wrong_duration_wavs = []
    wavs_dir = unzipped_dataset_path / "wavs"

    for metadata_fname in metadata:
        meta_file = unzipped_dataset_path / metadata_fname
        # Decode explicitly as UTF-8 (like the other text files handled by this script) so that
        # the German transcripts do not depend on the locale's default encoding.
        with open(meta_file, 'r', encoding='utf-8') as fmeta:
            for line in tqdm(fmeta):
                line = line.strip()
                # tolerate blank lines, e.g. a trailing newline at the end of the file.
                if not line:
                    continue

                items = line.split('|')
                wav_file_stem, text = items[0], items[1]
                wav_file = wavs_dir / f"{wav_file_stem}.wav"

                # skip audios if they do not exist.
                if not wav_file.exists():
                    not_found_wavs.append(wav_file)
                    logging.warning(f"Skipping {wav_file}: it is not found.")
                    continue

                # skip audios if their duration is out of range.
                # An argument list (no shell) keeps paths with spaces or shell metacharacters intact.
                duration = float(subprocess.check_output(["soxi", "-D", str(wav_file)]))
                if min_duration <= duration <= max_duration:
                    entries.append({'audio_filepath': str(wav_file), 'duration': duration, 'text': text})
                elif duration < min_duration:
                    wrong_duration_wavs.append(wav_file)
                    logging.warning(f"Skipping {wav_file}: it is too short, less than {min_duration} seconds.")
                else:
                    wrong_duration_wavs.append(wav_file)
                    logging.warning(f"Skipping {wav_file}: it is too long, greater than {max_duration} seconds.")

    random.Random(seed_for_ds_split).shuffle(entries)
    train_size = len(entries) - val_size - test_size
    if train_size <= 0:
        raise ValueError("Not enough data for the train split.")

    logging.info("Preparing JSON train/val/test splits is complete.")
    train, val, test = (
        entries[:train_size],
        entries[train_size : train_size + val_size],
        entries[train_size + val_size :],
    )

    return train, val, test, not_found_wavs, wrong_duration_wavs


def main():
    """Download and extract the selected dataset version, then write the manifests.

    Steps: parse the CLI arguments, download (if needed) and unpack the archive below
    ``<data-root>/ThorstenVoice-Dataset-<data-version>``, build the train/val/test splits, save them
    as ``train_manifest.json``, ``val_manifest.json`` and ``test_manifest.json`` in the manifests
    root, list the skipped wav files, and optionally create text-normalized copies of the manifests.
    """
    args = get_args()
    data_root = args.data_root
    manifests_root = args.manifests_root
    data_version = args.data_version
    dataset_info = THORSTEN_NEUTRAL[data_version]

    dataset_root = data_root / f"ThorstenVoice-Dataset-{data_version}"
    dataset_root.mkdir(parents=True, exist_ok=True)
    # Create the manifest directory up front: otherwise a missing directory only surfaces as a
    # FileNotFoundError after the download, the extraction and the duration scan have finished.
    manifests_root.mkdir(parents=True, exist_ok=True)

    # download and extract dataset
    dataset_url = dataset_info["url"]
    zipped_dataset_path = dataset_root / Path(dataset_url).name.split("?")[0]
    __maybe_download_file(dataset_url, zipped_dataset_path)
    __extract_file(zipped_dataset_path, dataset_root)

    # generate train/dev/test splits
    unzipped_dataset_path = dataset_root / dataset_info["dir_name"]
    entries_train, entries_val, entries_test, not_found_wavs, wrong_duration_wavs = __process_data(
        unzipped_dataset_path=unzipped_dataset_path,
        metadata=dataset_info["metadata"],
        min_duration=args.min_duration,
        max_duration=args.max_duration,
        val_size=args.val_size,
        test_size=args.test_size,
        seed_for_ds_split=args.seed_for_ds_split,
    )

    # save json splits.
    train_json = manifests_root / "train_manifest.json"
    val_json = manifests_root / "val_manifest.json"
    test_json = manifests_root / "test_manifest.json"
    __save_json(train_json, entries_train)
    __save_json(val_json, entries_val)
    __save_json(test_json, entries_test)

    # save skipped audios (not found / too short or too long) into one list file per reason;
    # a file is only written when there is something to report.
    skipped_reports = (
        ("skipped_not_found_wavs.list", not_found_wavs),
        ("skipped_wrong_duration_wavs.list", wrong_duration_wavs),
    )
    for list_name, skipped_wavs in skipped_reports:
        if skipped_wavs:
            with open(manifests_root / list_name, "w") as f_skipped:
                f_skipped.writelines(f"{wav}\n" for wav in skipped_wavs)

    # normalize text if requested. New json file, train_manifest_text_normed.json, will be generated.
    if args.normalize_text:
        for manifest in (train_json, val_json, test_json):
            __text_normalization(manifest, args.num_workers)


# Run the pipeline only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()