NeMo/scripts/dataset_processing/tts/ljspeech/get_data.py

thanks to NVIDIA ❤

7934b29 about 2 years ago

4.25 kB

	# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import argparse
	import json
	import tarfile
	import urllib.request
	from pathlib import Path

	import sox
	import wget
	from nemo_text_processing.text_normalization.normalize import Normalizer
	from tqdm import tqdm


	def get_args():
	    """Parse the command-line options of the LJSpeech download script.

	    Returns:
	        The ``argparse.Namespace`` produced from ``sys.argv`` with two attributes:
	        ``data_root`` (required, converted to ``pathlib.Path``) and
	        ``whitelist_path`` (a ``str`` defaulting to ``"lj_speech.tsv"``).
	    """
	    cli = argparse.ArgumentParser(description='Download LJSpeech and create manifests with predefined split')

	    # Directory in which the archive is stored and extracted.
	    cli.add_argument("--data-root", type=Path, required=True)

	    # NOTE(review): this default is never None, so the `whitelist_path is None`
	    # download branch in __process_data is not reached from the CLI -- confirm
	    # whether the default was meant to be None.
	    cli.add_argument('--whitelist-path', default="lj_speech.tsv", type=str)

	    return cli.parse_args()


	# Location of the LJSpeech 1.1 archive fetched by main().
	URL = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"
	# Base URL of the per-split file lists (ljs_audio_text_{split}_filelist.txt) used to build the manifests.
	FILELIST_BASE = "https://raw.githubusercontent.com/NVIDIA/tacotron2/master/filelists"


	def __maybe_download_file(source_url, destination_path):
	    """Download ``source_url`` to ``destination_path`` unless that file already exists.

	    The payload is first written to a sibling file whose suffix is replaced by
	    ``.tmp`` and only moved to ``destination_path`` once ``urlretrieve`` has
	    returned, so a failed transfer does not leave a partial file under the
	    final name.

	    Args:
	        source_url: URL handed to ``urllib.request.urlretrieve``.
	        destination_path: ``pathlib.Path`` of the file to create. Missing parent
	            directories are created.
	    """
	    if destination_path.exists():
	        return

	    # urlretrieve() does not create directories; without this a fresh
	    # (not yet existing) data root makes the download fail.
	    destination_path.parent.mkdir(parents=True, exist_ok=True)

	    tmp_file_path = destination_path.with_suffix('.tmp')
	    urllib.request.urlretrieve(source_url, filename=str(tmp_file_path))
	    # replace() also succeeds if the destination appeared in the meantime.
	    tmp_file_path.replace(destination_path)


	def __extract_file(filepath, data_dir):
	    """Unpack the tar archive at ``filepath`` into ``data_dir``.

	    Extraction is best-effort: any exception raised while opening or
	    extracting the archive is reported on stdout and swallowed, so the caller
	    continues regardless.

	    Args:
	        filepath: Path of the archive, passed to ``tarfile.open``.
	        data_dir: Directory passed to ``TarFile.extractall``.
	    """
	    try:
	        # The context manager closes the archive even when extractall() raises;
	        # an explicit close() after the call would be skipped in that case.
	        # NOTE(review): extractall() writes whatever member paths the archive
	        # contains -- confirm the archive source is trusted.
	        with tarfile.open(filepath) as tar:
	            tar.extractall(data_dir)
	    except Exception:
	        print(f"Error while extracting {filepath}. Already extracted?")


	def __process_data(data_root, whitelist_path):
	    """Create the train/val/test manifests for an extracted LJSpeech dataset.

	    For every split the file list is downloaded from ``FILELIST_BASE`` (unless
	    it is already present in ``data_root``) and converted into
	    ``{split}_manifest.json`` inside ``data_root``: one JSON object per line
	    with the keys ``audio_filepath``, ``duration``, ``text`` and
	    ``normalized_text``.

	    Args:
	        data_root: ``pathlib.Path`` of the directory that contains ``wavs/``.
	            File lists, manifests and the normalizer cache are written here.
	        whitelist_path: Whitelist handed to the text normalizer. When ``None``,
	            the whitelist is downloaded to ``data_root / "lj_speech.tsv"``
	            (skipped if that file already exists).

	    Raises:
	        FileNotFoundError: If a wav file referenced by a file list is missing.
	    """
	    if whitelist_path is None:
	        # Download straight to the path given to the normalizer. Passing the
	        # directory as `out` would store the file under the URL's basename
	        # (whitelist_lj_speech.tsv), which is not the path used below.
	        whitelist_path = data_root / "lj_speech.tsv"
	        if not whitelist_path.exists():
	            wget.download(
	                "https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tts_dataset_files/en/whitelist_lj_speech.tsv",
	                out=str(whitelist_path),
	            )

	    text_normalizer = Normalizer(
	        lang="en",
	        input_case="cased",
	        whitelist=whitelist_path,
	        overwrite_cache=True,
	        cache_dir=data_root / "cache_dir",
	    )
	    text_normalizer_call_kwargs = {"punct_pre_process": True, "punct_post_process": True}

	    # Create manifests (based on predefined NVIDIA's split)
	    for split in tqdm(['train', 'val', 'test']):
	        # Download file list if necessary
	        filelist_name = f"ljs_audio_text_{split}_filelist.txt"
	        filelist_path = data_root / filelist_name
	        if not filelist_path.exists():
	            wget.download(f"{FILELIST_BASE}/{filelist_name}", out=str(data_root))

	        manifest_target = data_root / f"{split}_manifest.json"
	        # Open the file list first so that an unreadable list does not leave a
	        # truncated manifest behind; decode explicitly instead of relying on
	        # the platform default encoding.
	        with open(filelist_path, 'r', encoding='utf-8') as filelist:
	            with open(manifest_target, 'w', encoding='utf-8') as f_out:
	                print(f"\nCreating {manifest_target}...")
	                for line in tqdm(filelist):
	                    if not line.strip():
	                        # A blank line carries no utterance.
	                        continue

	                    # NOTE(review): fixed offsets -- presumably each line looks like
	                    # `DUMMY/LJ001-0001.wav|<transcript>`, i.e. a 10-character
	                    # utterance id at [6:16] and the transcript from index 21;
	                    # confirm against the file lists.
	                    basename = line[6:16]
	                    text = line[21:].strip()
	                    norm_text = text_normalizer.normalize(text, **text_normalizer_call_kwargs)

	                    # Make sure corresponding wavfile exists. An explicit raise is
	                    # used because assert statements are removed under `python -O`.
	                    wav_path = data_root / 'wavs' / f"{basename}.wav"
	                    if not wav_path.exists():
	                        raise FileNotFoundError(f"{wav_path} does not exist!")

	                    audio_filepath = str(wav_path)
	                    entry = {
	                        'audio_filepath': audio_filepath,
	                        'duration': sox.file_info.duration(audio_filepath),
	                        'text': text,
	                        'normalized_text': norm_text,
	                    }

	                    f_out.write(json.dumps(entry) + '\n')


	def main():
	    """Download and unpack LJSpeech under ``--data-root``, then build the manifests."""
	    cli_args = get_args()
	    root_dir = cli_args.data_root

	    # Fetch the archive (a no-op when it is already on disk) and unpack it
	    # into the same directory.
	    archive_path = root_dir / "LJSpeech-1.1.tar.bz2"
	    __maybe_download_file(URL, archive_path)
	    __extract_file(str(archive_path), str(root_dir))

	    # Manifests are built from the LJSpeech-1.1 sub-directory of the data root.
	    __process_data(root_dir / "LJSpeech-1.1", cli_args.whitelist_path)


	# Run the pipeline only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	    main()