# camenduru's picture
# thanks to NVIDIA ❤
# 7934b29
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import tarfile
import urllib.request
from pathlib import Path
import sox
import wget
from nemo_text_processing.text_normalization.normalize import Normalizer
from tqdm import tqdm
def get_args():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace: carries ``data_root`` (required, converted to a
        ``Path``) and ``whitelist_path`` (a ``str`` that defaults to
        ``"lj_speech.tsv"``).
    """
    cli = argparse.ArgumentParser(
        description='Download LJSpeech and create manifests with predefined split',
    )
    cli.add_argument("--data-root", type=Path, required=True)
    cli.add_argument("--whitelist-path", type=str, default="lj_speech.tsv")
    return cli.parse_args()
# Location of the LJSpeech 1.1 archive; main() downloads it into the data root.
URL = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"
# Base URL from which the per-split file lists
# (ljs_audio_text_{split}_filelist.txt) are fetched when missing locally.
FILELIST_BASE = 'https://raw.githubusercontent.com/NVIDIA/tacotron2/master/filelists'
def __maybe_download_file(source_url, destination_path):
    """Download ``source_url`` to ``destination_path`` unless it already exists.

    The payload is first written to a sibling file whose last suffix is
    replaced by ``.tmp`` and only renamed to ``destination_path`` once the
    transfer has finished, so an interrupted download never leaves a partial
    file under the final name.

    Args:
        source_url: URL understood by ``urllib.request.urlretrieve``.
        destination_path: ``pathlib.Path`` of the file to create.
    """
    if destination_path.exists():
        return

    # A fresh data root may not exist yet; without this urlretrieve fails
    # with FileNotFoundError before anything is downloaded.
    destination_path.parent.mkdir(parents=True, exist_ok=True)

    tmp_file_path = destination_path.with_suffix('.tmp')
    urllib.request.urlretrieve(source_url, filename=str(tmp_file_path))
    tmp_file_path.rename(destination_path)
def __extract_file(filepath, data_dir):
    """Unpack the tar archive ``filepath`` into ``data_dir`` (best effort).

    Extraction problems are reported on stdout instead of being raised, so a
    rerun of the script can continue past this step.

    Args:
        filepath: path of the (possibly compressed) tar archive.
        data_dir: directory the archive members are extracted into.
    """
    try:
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(filepath) as tar:
            if hasattr(tarfile, "data_filter"):
                # The archive comes from the network: where the interpreter
                # supports extraction filters, refuse members that would
                # escape data_dir or create special files.
                tar.extractall(data_dir, filter="data")
            else:
                tar.extractall(data_dir)
    except (tarfile.TarError, OSError, EOFError) as err:
        # Keep the original best-effort behaviour, but show the real cause
        # rather than only guessing at it.
        print(f"Error while extracting {filepath}. Already extracted? ({type(err).__name__}: {err})")
def __process_data(data_root, whitelist_path):
    """Write ``{split}_manifest.json`` for the train, val and test splits.

    For every split the file list ``ljs_audio_text_{split}_filelist.txt`` is
    read from ``data_root`` (and downloaded from ``FILELIST_BASE`` first when
    it is missing). Each entry becomes one JSON line with the keys
    ``audio_filepath``, ``duration``, ``text`` and ``normalized_text``.

    Args:
        data_root: ``pathlib.Path`` of the extracted dataset; it must contain
            the ``wavs`` directory. Manifests, file lists and the normalizer
            cache directory are created inside it.
        whitelist_path: whitelist file handed to the text normalizer. When
            ``None``, the NeMo LJSpeech whitelist is downloaded into
            ``data_root`` and used instead.

    Raises:
        FileNotFoundError: if a wav file named in a file list is missing.
    """
    if whitelist_path is None:
        # wget names the file after the URL ("whitelist_lj_speech.tsv"), so
        # use the path it reports rather than assuming a different file name.
        whitelist_path = wget.download(
            "https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tts_dataset_files/en/whitelist_lj_speech.tsv",
            out=str(data_root),
        )

    text_normalizer = Normalizer(
        lang="en",
        input_case="cased",
        whitelist=str(whitelist_path),
        overwrite_cache=True,
        cache_dir=data_root / "cache_dir",
    )
    text_normalizer_call_kwargs = {"punct_pre_process": True, "punct_post_process": True}

    # Create manifests (based on predefined NVIDIA's split)
    filelists = ['train', 'val', 'test']
    for split in tqdm(filelists):
        # Download file list if necessary
        filelist_path = data_root / f"ljs_audio_text_{split}_filelist.txt"
        if not filelist_path.exists():
            wget.download(f"{FILELIST_BASE}/ljs_audio_text_{split}_filelist.txt", out=str(data_root))

        manifest_target = data_root / f"{split}_manifest.json"
        # Explicit UTF-8 so the result does not depend on the platform's
        # default encoding.
        with open(manifest_target, 'w', encoding='utf-8') as f_out:
            with open(filelist_path, 'r', encoding='utf-8') as filelist:
                print(f"\nCreating {manifest_target}...")
                for line in tqdm(filelist):
                    if not line.strip():
                        # A blank (e.g. trailing) line carries no entry.
                        continue

                    # NOTE(review): fixed offsets — presumably each line looks
                    # like "DUMMY/LJ001-0001.wav|<transcript>"; confirm
                    # against the downloaded file lists.
                    basename = line[6:16]
                    text = line[21:].strip()

                    # Make sure corresponding wavfile exists. A real exception
                    # is used because assert statements vanish under -O.
                    wav_path = data_root / 'wavs' / f"{basename}.wav"
                    if not wav_path.exists():
                        raise FileNotFoundError(f"{wav_path} does not exist!")

                    norm_text = text_normalizer.normalize(text, **text_normalizer_call_kwargs)

                    entry = {
                        'audio_filepath': str(wav_path),
                        'duration': sox.file_info.duration(str(wav_path)),
                        'text': text,
                        'normalized_text': norm_text,
                    }
                    f_out.write(json.dumps(entry) + '\n')
def main():
    """Fetch the LJSpeech archive, unpack it and build the split manifests.

    The archive is stored and extracted directly under the ``--data-root``
    directory; manifests are then generated for the ``LJSpeech-1.1`` folder
    inside it.
    """
    cli_args = get_args()
    root = cli_args.data_root

    archive = root / "LJSpeech-1.1.tar.bz2"
    __maybe_download_file(URL, archive)
    __extract_file(str(archive), str(root))

    __process_data(root / "LJSpeech-1.1", cli_args.whitelist_path)
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()