#!/bin/bash # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # default values for optional arguments MIN_SCORE=-2 CUT_PREFIX=0 SCRIPTS_DIR="scripts" # //NeMo/tools/ctc_segmentation/tools/scripts/ directory OFFSET=0 LANGUAGE='en' # 'en', 'es', 'ru'... MAX_SEGMENT_LEN=30 ADDITIONAL_SPLIT_SYMBOLS=":|;" USE_NEMO_NORMALIZATION='True' NUM_JOBS=-2 # The maximum number of concurrently running jobs, `-2` - all CPUs but one are used SAMPLE_RATE=16000 # Target sample rate (default for ASR data - 16000 Hz) MAX_DURATION=20 # Maximum audio segment duration, in seconds. Samples that are longer will be dropped. for ARG in "$@" do key=$(echo $ARG | cut -f1 -d=) value=$(echo $ARG | cut -f2 -d=) if [[ $key == *"--"* ]]; then v="${key/--/}" declare $v="${value}" fi done echo "MODEL_NAME_OR_PATH = $MODEL_NAME_OR_PATH" echo "DATA_DIR = $DATA_DIR" echo "OUTPUT_DIR = $OUTPUT_DIR" echo "MIN_SCORE = $MIN_SCORE" echo "CUT_PREFIX = $CUT_PREFIX" echo "SCRIPTS_DIR = $SCRIPTS_DIR" echo "OFFSET = $OFFSET" echo "LANGUAGE = $LANGUAGE" echo "MIN_SEGMENT_LEN = $MIN_SEGMENT_LEN" echo "MAX_SEGMENT_LEN = $MAX_SEGMENT_LEN" echo "SAMPLE_RATE = $SAMPLE_RATE" echo "ADDITIONAL_SPLIT_SYMBOLS = $ADDITIONAL_SPLIT_SYMBOLS" echo "USE_NEMO_NORMALIZATION = $USE_NEMO_NORMALIZATION" if [[ -z $MODEL_NAME_OR_PATH ]] || [[ -z $DATA_DIR ]] || [[ -z $OUTPUT_DIR ]]; then echo "Usage: $(basename "$0") --MODEL_NAME_OR_PATH=[model_name_or_path] --DATA_DIR=[data_dir] --OUTPUT_DIR=[output_dir] --LANGUAGE=[language (Optional)] --OFFSET=[offset value (Optional)] --CUT_PREFIX=[cut prefix in sec (Optional)] --SCRIPTS_DIR=[scripts_dir_path (Optional)] --MAX_SEGMENT_LEN=[max number of characters of the text segment for alignment (Optional)] --ADDITIONAL_SPLIT_SYMBOLS=[Additional symbols to use for sentence split if eos sentence split resulted in sequence longer than --max_length. Use '|' as a separator between symbols, for example: ';|:' (Optional)] --USE_NEMO_NORMALIZATION Set to 'True' to use NeMo Normalization tool to convert numbers from written to spoken format. By default num2words package will be used. (Optional)" exit 1 fi NEMO_NORMALIZATION="" if [[ ${USE_NEMO_NORMALIZATION,,} == "true" ]]; then NEMO_NORMALIZATION="--use_nemo_normalization " fi # STEP #1 # Prepare text and audio data for segmentation echo "TEXT AND AUDIO PREPROCESSING..." python $SCRIPTS_DIR/prepare_data.py \ --in_text=$DATA_DIR/text \ --audio_dir=$DATA_DIR/audio \ --output_dir=$OUTPUT_DIR/processed/ \ --language=$LANGUAGE \ --cut_prefix=$CUT_PREFIX \ --model=$MODEL_NAME_OR_PATH \ --max_length=$MAX_SEGMENT_LEN \ --sample_rate=$SAMPLE_RATE \ --additional_split_symbols=$ADDITIONAL_SPLIT_SYMBOLS $NEMO_NORMALIZATION || exit # STEP #2 # Run CTC-segmentation. One might want to perform alignment with various window sizes # Note, if the alignment with the initial window size isn't found, the window size will be double to re-attempt alignment echo "SEGMENTATION STEP..." for WINDOW in 8000 12000 do python $SCRIPTS_DIR/run_ctc_segmentation.py \ --output_dir=$OUTPUT_DIR \ --data=$OUTPUT_DIR/processed \ --sample_rate=$SAMPLE_RATE \ --model=$MODEL_NAME_OR_PATH \ --window_len $WINDOW || exit done # STEP #3 (Optional) # Verify aligned segments only if multiple WINDOWs used in the Step #2) echo "VERIFYING SEGMENTS..." python $SCRIPTS_DIR/verify_segments.py \ --base_dir=$OUTPUT_DIR || exit # STEP #4 # Cut the original audio files based on the alignment score. Only segments with alignment confidence score # above the MIN_SCORE value will be saved to $OUTPUT_DIR/manifests/manifest.json echo "CUTTING AUDIO..." python $SCRIPTS_DIR/cut_audio.py \ --output_dir=$OUTPUT_DIR \ --alignment=$OUTPUT_DIR/verified_segments \ --threshold=$MIN_SCORE \ --offset=$OFFSET \ --sample_rate=$SAMPLE_RATE \ --max_duration=$MAX_DURATION || exit