# NOTE(review): the lines below ("Spaces: Running on Zero") appear to be
# Hugging Face Spaces page residue accidentally pasted into this file;
# commented out so the module parses. Confirm they can be removed entirely.
# Spaces: Running on Zero
# Running on Zero
# we have raw transcript at
# /data/scratch/pyp/datasets/librilight/preprocessed/audio
# we have word and ARPA alignment at
# /data/scratch/pyp/datasets/librilight/preprocessed/alignment
# we have manifest at /data/scratch/pyp/datasets/librilight/preprocessed/manifest_mimi
# where each row is like large/10022/essayoncriticism_1505_librivox_64kb_mp3/essayoncriticism_01_pope_64kb_5_610.32_630.08.flac 19.76
# we want to create IPA alignment from the raw transcript and word alignment, using phonemizer
# save at /data/scratch/pyp/datasets/librilight/preprocessed/ipa_alignment
# since ipa phonemized results are 1-to-1 with words (10 words might lead to a ipa sequence of 7 phonemes), we have to run phonemizer on each segment of the word sequence
import os, string, csv, random, tqdm, glob
from tokenizer import TextTokenizer, tokenize_text
def remove_punctuation(input_string):
    """Return *input_string* with every ASCII punctuation character removed."""
    return input_string.translate(str.maketrans("", "", string.punctuation))
def create_alignment(fn, trans_dir, align_dir, audio_ext, trans_ext, arpa_ext, text_tokenizer, use_prob, ipa_alignment_fn, save=False, prompt_dur=30):
    """Create IPA phoneme alignments for one utterance from its raw transcript
    and word-level alignment CSV.

    Each retained alignment entry covers the sentence prefix ``words[0..j]``,
    so its start time is always 0 and its end time is word ``j``'s end time.

    Parameters
    ----------
    fn : relative audio path (ends with ``audio_ext``), used to locate the
        transcript and alignment files.
    trans_dir / align_dir : roots of the transcript (.txt) and alignment (.csv) trees.
    audio_ext, trans_ext, arpa_ext : file extensions used for path substitution.
    text_tokenizer : tokenizer instance passed through to ``tokenize_text``.
    use_prob : probability of keeping each word-prefix as a training segment.
    ipa_alignment_fn : output path (its parent directory is created here).
    save : when True, also write non-empty results to ``ipa_alignment_fn``.
    prompt_dur : words ending after this many seconds are ignored.

    Returns
    -------
    (ipa_alignment, flag) : list of ``[start, end, phonemes]`` triples and a
        bool flag that is True when the utterance was skipped (missing files)
        or a word mismatched between transcript and alignment.
    """
    os.makedirs(os.path.dirname(ipa_alignment_fn), exist_ok=True)
    trans_fn = os.path.join(trans_dir, fn.replace(audio_ext, trans_ext))
    if not os.path.isfile(trans_fn):
        return [], True
    align_fn = os.path.join(align_dir, fn.replace(audio_ext, arpa_ext))
    if not os.path.isfile(align_fn):
        return [], True
    # get raw transcript
    with open(trans_fn, 'r') as f:
        transcript = f.read().strip()
    raw_word_list = transcript.split(" ")
    # get word alignment; column 3 distinguishes word rows from phone rows
    with open(align_fn, 'r') as f:
        word_alignment = [row for row in csv.reader(f) if row[3] == 'words']
    ipa_alignment = []
    # NOTE: zip silently stops at the shorter of the two sequences; a length
    # mismatch is only caught when an individual word pair disagrees.
    for j, (item, raw_word) in enumerate(zip(word_alignment, raw_word_list)):
        start, end, word = float(item[0]), float(item[1]), item[2]
        if end > prompt_dur:
            break
        # compare ignoring punctuation and case; bail out on any mismatch
        if remove_punctuation(word).lower() != remove_punctuation(raw_word).lower():
            # print(f"word from alignment csv: {word}, word from txt: {raw_word}")
            return ipa_alignment, True
        if random.random() < use_prob:
            cur_words = " ".join(raw_word_list[:j + 1])
            phn = tokenize_text(text_tokenizer, cur_words)
            if len(phn) == 0:
                continue
            # segment always starts at the beginning of the sentence
            ipa_alignment.append([0, end, " ".join(phn)])
    if save and ipa_alignment:
        with open(ipa_alignment_fn, 'w') as f:
            for seg_start, seg_end, seg_phn in ipa_alignment:
                f.write(f"{seg_start}\t{seg_end}\t{seg_phn}\n")
    # BUGFIX: the original returned (list, False) only when save was False and
    # fell off the end (returning None) on the save=True path; always return
    # the same (ipa_alignment, flag) tuple so all callers can unpack it.
    return ipa_alignment, False
def main(
    data_root: str = '/data/scratch/pyp/datasets/librilight/preprocessed',
    audio_ext: str = '.flac',
    arpa_ext: str = '.csv',
    trans_ext: str = '.txt',
    split: str = 'valid',
    use_prob: float = 0.5,
    max_dur: float = 30.,  # do not consider utterance longer than this
    prompt_dur: float = 30.,  # do not consider prompt longer than this
):
    """Generate IPA alignments for every manifest entry of *split*.

    Reads the encodec manifests under ``data_root``, filters utterances by
    duration, runs ``create_alignment`` on each, and writes non-empty results
    to ``data_root/ipa_alignment`` as tab-separated ``start  end  phonemes`` rows.
    """
    text_tokenizer = TextTokenizer()
    trans_dir = f'{data_root}/audio'
    align_dir = f'{data_root}/alignment'
    target_dir = f'{data_root}/ipa_alignment'
    encodec_sr = 50  # manifest durations are encodec frame counts at 50 Hz
    os.makedirs(target_dir, exist_ok=True)

    # Collect audio paths from all matching manifest shards, keeping only
    # utterances shorter than max_dur seconds.
    manifest = []
    pattern = f"{data_root}/manifest_final_encodec/{split}*=*.txt"
    for shard_fn in glob.glob(pattern):
        with open(shard_fn, 'r') as f:
            rows = [line.strip().split("\t") for line in f.readlines()]
        manifest += [row[0] + audio_ext for row in rows
                     if float(row[1]) / encodec_sr < max_dur]

    # Sequential processing (a joblib-parallel variant would call
    # create_alignment(..., save=True) per file instead).
    n_flags = 0
    zero_words = 0
    for fn in tqdm.tqdm(manifest):
        out_fn = os.path.join(target_dir, fn.replace(audio_ext, '.txt'))
        ipa_alignment, flag = create_alignment(
            fn, trans_dir, align_dir, audio_ext, trans_ext, arpa_ext,
            text_tokenizer, use_prob, out_fn, prompt_dur=prompt_dur)
        n_flags += flag  # count utterances with mismatched/missing words
        if not ipa_alignment:
            zero_words += 1
        if ipa_alignment:
            with open(out_fn, 'w') as f:
                for seg_start, seg_end, seg_phn in ipa_alignment:
                    f.write(f"{seg_start}\t{seg_end}\t{seg_phn}\n")
# CLI entry point: fire exposes main()'s keyword arguments as command-line flags.
if __name__ == "__main__":
    import fire
    fire.Fire(main)