|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse |
|
import json |
|
import tarfile |
|
import urllib.request |
|
from pathlib import Path |
|
|
|
import sox |
|
import wget |
|
from nemo_text_processing.text_normalization.normalize import Normalizer |
|
from tqdm import tqdm |
|
|
|
|
|
def get_args():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace: with ``data_root`` (a ``pathlib.Path``, mandatory) and
        ``whitelist_path`` (a string, ``"lj_speech.tsv"`` unless overridden).
    """
    arg_parser = argparse.ArgumentParser(
        description='Download LJSpeech and create manifests with predefined split'
    )
    # Directory that receives the downloaded archive and the extracted dataset.
    arg_parser.add_argument("--data-root", type=Path, required=True)
    # Whitelist file handed to the text normalizer.
    arg_parser.add_argument('--whitelist-path', default="lj_speech.tsv", type=str)
    return arg_parser.parse_args()
|
|
|
|
|
# Archive that main() downloads and extracts.
URL = 'https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2'
# Base URL from which the per-split "ljs_audio_text_<split>_filelist.txt" files are fetched.
FILELIST_BASE = "https://raw.githubusercontent.com/NVIDIA/tacotron2/master/filelists"
|
|
|
|
|
def __maybe_download_file(source_url, destination_path):
    """Download ``source_url`` to ``destination_path`` unless that file already exists.

    The payload is first written to a sibling file whose suffix is replaced by ``.tmp`` and only
    moved onto ``destination_path`` once the transfer has finished, so an interrupted download
    is never mistaken for a complete file on the next run.

    Args:
        source_url: URL to fetch.
        destination_path: Target file (``pathlib.Path`` or string). Missing parent directories
            are created.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed transfer; the partial
        temporary file is removed before the exception propagates.
    """
    destination_path = Path(destination_path)
    if destination_path.exists():
        return

    # A fresh data root may not exist yet; without it urlretrieve cannot create the temp file.
    destination_path.parent.mkdir(parents=True, exist_ok=True)

    tmp_file_path = destination_path.with_suffix('.tmp')
    try:
        urllib.request.urlretrieve(source_url, filename=str(tmp_file_path))
    except BaseException:
        # Do not leave a truncated download behind (also covers Ctrl-C).
        if tmp_file_path.exists():
            tmp_file_path.unlink()
        raise
    # replace() overwrites atomically where the platform allows it.
    tmp_file_path.replace(destination_path)
|
|
|
|
|
def __extract_file(filepath, data_dir):
    """Extract the tar archive at ``filepath`` into ``data_dir`` on a best-effort basis.

    Failures are reported on stdout, together with the underlying error, instead of being
    raised, so the caller continues either way.

    Args:
        filepath: Path of the archive to open with ``tarfile.open``.
        data_dir: Directory the archive members are extracted into.
    """
    try:
        # The context manager closes the archive even when extraction fails midway.
        with tarfile.open(filepath) as tar:
            if hasattr(tarfile, "data_filter"):
                # Extraction filters are available: refuse members that would escape
                # ``data_dir`` or create special files.
                tar.extractall(data_dir, filter="data")
            else:
                tar.extractall(data_dir)
    except Exception as err:
        # Deliberately broad: extraction stays best-effort, but the cause is no longer hidden.
        print(f"Error while extracting {filepath}. Already extracted? ({err!r})")
|
|
|
|
|
def __process_data(data_root, whitelist_path):
    """Write one JSON-lines manifest per LJSpeech split into ``data_root``.

    For each of the ``train``, ``val`` and ``test`` splits, the file
    ``ljs_audio_text_<split>_filelist.txt`` is downloaded from ``FILELIST_BASE`` when it is not
    already present in ``data_root``. Every filelist entry is text-normalized and written to
    ``<split>_manifest.json`` as a JSON object with the keys ``audio_filepath``, ``duration``,
    ``text`` and ``normalized_text``.

    Args:
        data_root: ``pathlib.Path`` of the dataset directory; audio is looked up in its ``wavs``
            sub-directory and the normalizer cache is placed in ``cache_dir`` below it.
        whitelist_path: Whitelist file passed to the text normalizer. When ``None``, the
            whitelist is downloaded to ``data_root / "lj_speech.tsv"`` (unless already there).

    Raises:
        FileNotFoundError: If a wav file referenced by a filelist does not exist.
    """
    if whitelist_path is None:
        whitelist_path = data_root / "lj_speech.tsv"
        if not whitelist_path.exists():
            # Pass the full target path: given only a directory, wget names the file after the
            # URL ("whitelist_lj_speech.tsv"), which is not the path used below.
            wget.download(
                "https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tts_dataset_files/en/whitelist_lj_speech.tsv",
                out=str(whitelist_path),
            )

    text_normalizer = Normalizer(
        lang="en",
        input_case="cased",
        whitelist=whitelist_path,
        overwrite_cache=True,
        cache_dir=data_root / "cache_dir",
    )
    text_normalizer_call_kwargs = {"punct_pre_process": True, "punct_post_process": True}

    def normalizer_call(raw_text):
        return text_normalizer.normalize(raw_text, **text_normalizer_call_kwargs)

    wav_dir = data_root / 'wavs'
    for split in tqdm(['train', 'val', 'test']):
        filelist_name = f"ljs_audio_text_{split}_filelist.txt"
        filelist_path = data_root / filelist_name
        if not filelist_path.exists():
            wget.download(f"{FILELIST_BASE}/{filelist_name}", out=str(data_root))

        manifest_target = data_root / f"{split}_manifest.json"
        # Explicit UTF-8 so reading the transcripts does not depend on the locale encoding.
        with open(manifest_target, 'w', encoding='utf-8') as f_out:
            with open(filelist_path, 'r', encoding='utf-8') as filelist:
                print(f"\nCreating {manifest_target}...")
                for line in tqdm(filelist):
                    if not line.strip():
                        # Tolerate blank lines (e.g. a trailing newline at end of file).
                        continue

                    # Entries are expected as "<dir>/<basename>.wav|<transcript>". Splitting on
                    # the first "|" gives the same result as the former fixed offsets
                    # (6:16 and 21:) for such lines, without depending on the prefix length.
                    audio_ref, _, transcript = line.partition('|')
                    basename = Path(audio_ref.strip()).stem
                    text = transcript.strip()
                    norm_text = normalizer_call(text)

                    wav_path = wav_dir / f"{basename}.wav"
                    # Raise explicitly: an assert would be stripped under ``python -O``.
                    if not wav_path.exists():
                        raise FileNotFoundError(f"{wav_path} does not exist!")

                    entry = {
                        'audio_filepath': str(wav_path),
                        'duration': sox.file_info.duration(wav_path),
                        'text': text,
                        'normalized_text': norm_text,
                    }
                    f_out.write(json.dumps(entry) + '\n')
|
|
|
|
|
def main():
    """Run the pipeline: fetch the archive, unpack it, then build the split manifests."""
    cli_args = get_args()
    root_dir = cli_args.data_root

    # The archive is stored directly under the data root.
    archive_path = root_dir / "LJSpeech-1.1.tar.bz2"
    __maybe_download_file(URL, archive_path)
    __extract_file(str(archive_path), str(root_dir))

    # Manifests are generated from the "LJSpeech-1.1" directory below the data root.
    __process_data(root_dir / "LJSpeech-1.1", cli_args.whitelist_path)
|
|
|
|
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|