# VoiceStar/data/ll60k_preprocessing/step6_forced_alignment.py
# (hosting-page residue kept for provenance: "mrfakename's picture",
#  "Upload 51 files", "82bc972 verified")
import os, sys
import subprocess, tqdm
from concurrent.futures import ThreadPoolExecutor
def align_folders(audio_root, subfolder, subsubfolder):
    """Run ``mfa align`` on a single ``subfolder/subsubfolder`` corpus.

    The alignment is written as CSV to
    ``<parent of audio_root>/alignment/<subfolder>/<subsubfolder>``, which is
    created up front if it does not exist yet.

    Args:
        audio_root: Directory that contains the ``subfolder`` directories.
        subfolder: First-level directory name below ``audio_root``.
        subsubfolder: Second-level directory name below ``subfolder``.

    Raises:
        subprocess.CalledProcessError: If ``mfa`` exits with a non-zero status.
    """
    corpus_dir = f"{audio_root}/{subfolder}/{subsubfolder}"

    # Alignments live next to the audio root, mirroring its folder layout.
    parent_dir = os.path.dirname(audio_root)
    out_folder = f"{parent_dir}/alignment/{subfolder}/{subsubfolder}"
    os.makedirs(out_folder, exist_ok=True)

    # Assemble the MFA invocation: flags, then positionals, then tuning options.
    mfa_args = ["mfa", "align", "--single_speaker", "-j", "8", "--clean"]
    mfa_args += [corpus_dir, "english_us_arpa", "english_us_arpa", out_folder]
    mfa_args += ["--beam", "50", "--retry_beam", "400", "--output_format", "csv"]

    subprocess.run(mfa_args, check=True)
def _parse_partition(partition):
    """Turn a 1-based ``"index/total"`` spec into a ``(offset, stride)`` pair.

    Raises:
        ValueError: If the spec is malformed or not ``1 <= index <= total``.
    """
    try:
        index_text, total_text = str(partition).split("/")
        index, total = int(index_text), int(total_text)
    except ValueError as err:
        raise ValueError(
            f"partition must look like 'index/total' (e.g. '1/10'), got {partition!r}"
        ) from err
    if total < 1 or not 1 <= index <= total:
        raise ValueError(
            f"partition must satisfy 1 <= index <= total, got {partition!r}"
        )
    return index - 1, total


def _list_speaker_chunks(audio_root, max_spk):
    """Return speaker-folder paths grouped per subfolder in chunks of ``max_spk``.

    Directory listings are sorted so the resulting chunk order is the same on
    every run, which is what makes ``partition`` shards disjoint across jobs.
    """
    chunks = []
    for subfolder in sorted(os.listdir(audio_root)):
        subfolder_path = os.path.join(audio_root, subfolder)
        if not os.path.isdir(subfolder_path):
            continue
        speaker_folders = [
            os.path.join(subfolder_path, name)
            for name in sorted(os.listdir(subfolder_path))
            if os.path.isdir(os.path.join(subfolder_path, name))
        ]
        # A chunk never mixes speakers from two different subfolders.
        chunks.extend(
            speaker_folders[start:start + max_spk]
            for start in range(0, len(speaker_folders), max_spk)
        )
    return chunks


def main(file_root = "/data/scratch/pyp/datasets/librilight/librilight_example_preprocessed", max_parallel_jobs=10, max_spk=100, partition="1/10", n_workers=64):
    """Force-align one partition of the speaker folders under ``file_root/audio``.

    The audio is read from ``{file_root}/audio/{subfolder}/{speaker}/*/*.flac``.
    Speaker folders are grouped per subfolder into chunks of at most
    ``max_spk``; this process handles every ``total``-th chunk starting at
    ``index`` (see ``partition``). For each chunk it:

    1. skips the chunk if every speaker already has a directory under
       ``{file_root}/alignment/{subfolder}``;
    2. deletes ``.flac`` files that ``soundfile`` cannot read;
    3. symlinks the speaker folders into a fresh temporary directory under
       ``{file_root}/softlink_audio``;
    4. runs ``mfa align`` on that directory, writing CSV output to
       ``{file_root}/alignment/{subfolder}``;
    5. removes the temporary symlink directory, even if a step failed.

    Args:
        file_root: Dataset root containing the ``audio`` directory.
        max_parallel_jobs: Unused; kept so existing command lines keep working.
        max_spk: Maximum number of speaker folders per MFA invocation.
        partition: 1-based ``"index/total"`` shard selector, e.g. ``"3/10"``.
        n_workers: Worker count for both the corruption scan and ``mfa -j``.

    Raises:
        ValueError: If ``partition`` is malformed/out of range or ``max_spk < 1``.
        FileNotFoundError: If the ``mfa`` executable cannot be found.
    """
    import shutil
    import tempfile

    # Validate cheap arguments first so a typo fails before any file is touched.
    offset, stride = _parse_partition(partition)
    if max_spk < 1:
        raise ValueError(f"max_spk must be >= 1, got {max_spk!r}")

    audio_root = os.path.join(file_root, "audio")
    cur_tasks = _list_speaker_chunks(audio_root, max_spk)[offset::stride]
    if not cur_tasks:
        print(f"no speaker folders to align for partition {partition}")
        return

    import glob
    import soundfile
    from joblib import Parallel, delayed

    def delete_corrupted(fn):
        # Decode the whole file; anything soundfile cannot read is removed so
        # that it does not reach MFA.
        try:
            soundfile.read(fn)
        except Exception:
            print(f"removing corrupted file: {fn}")
            os.remove(fn)

    softlink_root = os.path.join(file_root, "softlink_audio")
    for task in tqdm.tqdm(cur_tasks):
        # Every item of a chunk shares the same parent (the subfolder).
        sub = os.path.basename(os.path.dirname(task[0]))
        out_folder = os.path.join(file_root, "alignment", sub)

        # Decide whether to skip BEFORE creating any temporary directory, so
        # already-aligned chunks leave nothing behind.
        if all(
            os.path.isdir(os.path.join(out_folder, os.path.basename(item)))
            for item in task
        ):
            continue

        # remove audio files that are corrupted
        all_audio_files = [
            audiofile
            for item in task
            for audiofile in glob.glob(os.path.join(glob.escape(item), "*", "*.flac"))
        ]
        Parallel(n_jobs=n_workers)(
            delayed(delete_corrupted)(audiofn) for audiofn in all_audio_files
        )

        os.makedirs(softlink_root, exist_ok=True)
        temp_folder = tempfile.mkdtemp(dir=softlink_root)
        try:
            for item in task:
                # Absolute target: a relative one would be resolved against the
                # link's own directory and dangle.
                os.symlink(
                    os.path.abspath(item),
                    os.path.join(temp_folder, os.path.basename(item)),
                )
            # run mfa on the linked folder, but save alignment to the correct folder
            # NOTE(review): MFA's own scratch directory ({temp_folder}_temp) is
            # left on disk, as before; confirm whether it should be cleaned up.
            command = [
                "mfa", "align", "-j", str(n_workers),
                temp_folder, "english_us_arpa", "english_us_arpa", out_folder,
                "--beam", "50", "--retry_beam", "200",
                "--output_format", "csv", "--quiet", "--use_mp",
                "--temporary_directory", f"{temp_folder}_temp",
            ]
            # Argument list (no shell): paths with spaces or metacharacters are safe.
            result = subprocess.run(command)
            if result.returncode != 0:
                # Stay best-effort like the original loop, but do not hide failures.
                print(f"mfa align exited with code {result.returncode} for {out_folder}")
        finally:
            # Only the symlinks are removed; rmtree does not follow them.
            shutil.rmtree(temp_folder, ignore_errors=True)
if __name__ == "__main__":
    # Expose main()'s keyword arguments as command-line flags via python-fire.
    from fire import Fire

    Fire(main)