# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
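
# Example usage (the script filename and the data directory below are illustrative
# assumptions, not stated anywhere in this file):
#
#     python get_hifitts_data.py --data-root <DATA_ROOT> --split all
#
# The script downloads hi_fi_tts_v0.tar.gz from OpenSLR into <DATA_ROOT>, extracts it, and
# writes {train,dev,test}_manifest.json under <DATA_ROOT>/hi_fi_tts_v0.
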
import argparse
import glob
import json
import re
import tarfile
import urllib.request
from pathlib import Path
from tqdm import tqdm


def get_args():
    parser = argparse.ArgumentParser(description='Download HiFiTTS and create manifests with the predefined split')
    parser.add_argument(
        "--data-root",
        required=True,
        type=Path,
        help='Directory into which to download and extract the dataset. {data-root}/hi_fi_tts_v0 will be created.',
    )
    parser.add_argument(
        '--split',
        type=str,
        default='all',
        help='Choose to generate manifests for all or one of (train, dev, test); note that this will still download the full dataset.',
    )
    args = parser.parse_args()
    return args


URL = "https://us.openslr.org/resources/109/hi_fi_tts_v0.tar.gz"


def __maybe_download_file(source_url, destination_path):
    # Download to a temporary file first, so an interrupted download is not mistaken for a finished one.
    if not destination_path.exists():
        tmp_file_path = destination_path.with_suffix('.tmp')
        urllib.request.urlretrieve(source_url, filename=str(tmp_file_path))
        tmp_file_path.rename(destination_path)


def __extract_file(filepath, data_dir):
    try:
        tar = tarfile.open(filepath)
        tar.extractall(data_dir)
        tar.close()
    except Exception:
        print(f"Error while extracting {filepath}. Already extracted?")


def __process_data(data_root, filelists):
    # Create manifests (based on NVIDIA's predefined split)
    for split in tqdm(filelists):
        manifest_target = data_root / f"{split}_manifest.json"
        print(f"Creating manifest for {split}.")

        entries = []
        for manifest_src in glob.glob(str(data_root / f"*_{split}.json")):
            try:
                search_res = re.search(r'.*\/([0-9]+)_manifest_([a-z]+)_.*\.json', manifest_src)
                speaker_id = search_res.group(1)
                audio_quality = search_res.group(2)
            except Exception:
                print(f"Failed to find speaker id or audio quality for {manifest_src}, check formatting.")
                continue

            with open(manifest_src, 'r') as f_in:
                for input_json_entry in f_in:
                    data = json.loads(input_json_entry)

                    # Make sure the corresponding wav file exists
                    wav_path = data_root / data['audio_filepath']
                    assert wav_path.exists(), f"{wav_path} does not exist!"

                    entry = {
                        'audio_filepath': data['audio_filepath'],
                        'duration': data['duration'],
                        'text': data['text'],
                        'normalized_text': data['text_normalized'],
                        'speaker': int(speaker_id),
                        # audio_quality is either "clean" or "other".
                        # The clean set includes recordings with a high signal-to-noise ratio and wide bandwidth.
                        # Books with noticeable noise or narrow bandwidth are included in the other subset.
                        # Note: some speaker_ids have both clean and other audio quality.
                        'audio_quality': audio_quality,
                    }
                    entries.append(entry)

        with open(manifest_target, 'w') as f_out:
            for m in entries:
                f_out.write(json.dumps(m) + '\n')
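

# For reference, each manifest line written above is a single JSON object with the fields
# assembled in __process_data. The values below are illustrative placeholders only, not a
# real dataset entry:
#
#   {"audio_filepath": "audio/...", "duration": 3.2, "text": "...",
#    "normalized_text": "...", "speaker": 92, "audio_quality": "clean"}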


def main():
    args = get_args()
    # args.split is a single split name unless it is 'all'; wrap it in a list rather than
    # splitting the string into characters.
    split = ['train', 'dev', 'test'] if args.split == 'all' else [args.split]

    tarred_data_path = args.data_root / "hi_fi_tts_v0.tar.gz"
    __maybe_download_file(URL, tarred_data_path)
    __extract_file(str(tarred_data_path), str(args.data_root))

    data_root = args.data_root / "hi_fi_tts_v0"
    __process_data(data_root, split)


if __name__ == '__main__':
    main()