# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
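
# Example usage (the script filename and the data directory below are illustrative
# assumptions, not stated anywhere in this file):
#
#     python get_hifitts_data.py --data-root <DATA_ROOT> --split all
#
# The script downloads hi_fi_tts_v0.tar.gz from OpenSLR into <DATA_ROOT>, extracts it, and
# writes {train,dev,test}_manifest.json under <DATA_ROOT>/hi_fi_tts_v0.
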
import argparse
import glob
import json
import re
import tarfile
import urllib.request
from pathlib import Path
from tqdm import tqdm


def get_args():
    parser = argparse.ArgumentParser(description='Download HiFiTTS and create manifests with the predefined split')
    parser.add_argument(
        "--data-root",
        required=True,
        type=Path,
        help='Directory into which to download and extract the dataset. {data-root}/hi_fi_tts_v0 will be created.',
    )
    parser.add_argument(
        '--split',
        type=str,
        default='all',
        help='Choose to generate manifests for all or one of (train, dev, test); note that this will still download the full dataset.',
    )
    args = parser.parse_args()
    return args


URL = "https://us.openslr.org/resources/109/hi_fi_tts_v0.tar.gz"


def __maybe_download_file(source_url, destination_path):
    # Download to a temporary file first, so an interrupted download is not mistaken for a finished one.
    if not destination_path.exists():
        tmp_file_path = destination_path.with_suffix('.tmp')
        urllib.request.urlretrieve(source_url, filename=str(tmp_file_path))
        tmp_file_path.rename(destination_path)


def __extract_file(filepath, data_dir):
    try:
        tar = tarfile.open(filepath)
        tar.extractall(data_dir)
        tar.close()
    except Exception:
        print(f"Error while extracting {filepath}. Already extracted?")


def __process_data(data_root, filelists):
    # Create manifests (based on NVIDIA's predefined split)
    for split in tqdm(filelists):
        manifest_target = data_root / f"{split}_manifest.json"
        print(f"Creating manifest for {split}.")

        entries = []
        for manifest_src in glob.glob(str(data_root / f"*_{split}.json")):
            try:
                search_res = re.search(r'.*\/([0-9]+)_manifest_([a-z]+)_.*\.json', manifest_src)
                speaker_id = search_res.group(1)
                audio_quality = search_res.group(2)
            except Exception:
                print(f"Failed to find speaker id or audio quality for {manifest_src}, check formatting.")
                continue

            with open(manifest_src, 'r') as f_in:
                for input_json_entry in f_in:
                    data = json.loads(input_json_entry)

                    # Make sure the corresponding wav file exists
                    wav_path = data_root / data['audio_filepath']
                    assert wav_path.exists(), f"{wav_path} does not exist!"

                    entry = {
                        'audio_filepath': data['audio_filepath'],
                        'duration': data['duration'],
                        'text': data['text'],
                        'normalized_text': data['text_normalized'],
                        'speaker': int(speaker_id),
                        # audio_quality is either "clean" or "other".
                        # The clean set includes recordings with a high signal-to-noise ratio and wide bandwidth.
                        # Books with noticeable noise or narrow bandwidth are included in the other subset.
                        # Note: some speaker_ids have both clean and other audio quality.
                        'audio_quality': audio_quality,
                    }
                    entries.append(entry)

        with open(manifest_target, 'w') as f_out:
            for m in entries:
                f_out.write(json.dumps(m) + '\n')
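

# For reference, each manifest line written above is a single JSON object with the fields
# assembled in __process_data. The values below are illustrative placeholders only, not a
# real dataset entry:
#
#   {"audio_filepath": "audio/...", "duration": 3.2, "text": "...",
#    "normalized_text": "...", "speaker": 92, "audio_quality": "clean"}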


def main():
    args = get_args()
    # args.split is a single split name unless it is 'all'; wrap it in a list rather than
    # splitting the string into characters.
    split = ['train', 'dev', 'test'] if args.split == 'all' else [args.split]

    tarred_data_path = args.data_root / "hi_fi_tts_v0.tar.gz"
    __maybe_download_file(URL, tarred_data_path)
    __extract_file(str(tarred_data_path), str(args.data_root))

    data_root = args.data_root / "hi_fi_tts_v0"
    __process_data(data_root, split)


if __name__ == '__main__':
    main()