|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
During inference, we perform frame-level prediction by two approaches: |
|
1) shift the window of length window_length_in_sec (e.g. 0.63s) by shift_length_in_sec (e.g. 10ms) to generate the frame and use the prediction of the window to represent the label for the frame; |
|
[this script demonstrates this approach]
|
2) generate predictions with overlapping input segments. Then a smoothing filter is applied to decide the label for a frame spanned by multiple segments. |
|
[get frame-level predictions with this script; the overlap smoothing is applied here when vad.parameters.smoothing is set, or can be run separately with vad_overlap_posterior.py in NeMo/scripts/voice_activity_detection.
One can also find, in that script, the postprocessing that converts frame-level predictions
to speech/no-speech segments in start and end times format.]
|
|
|
Image https://raw.githubusercontent.com/NVIDIA/NeMo/main/tutorials/asr/images/vad_post_overlap_diagram.png |
|
will help you understand this method. |
|
|
|
This script also performs postprocessing and generates speech segments when gen_seg_table is set.
|
|
|
Usage: |
|
python vad_infer.py --config-path="../conf/vad" --config-name="vad_inference_postprocessing.yaml" dataset=<Path of json file of evaluation data. Audio files should have unique names> |
|
|
|
""" |
|
import json |
|
import os |
|
|
|
import torch |
|
|
|
from nemo.collections.asr.parts.utils.speaker_utils import write_rttm2manifest |
|
from nemo.collections.asr.parts.utils.vad_utils import ( |
|
generate_overlap_vad_seq, |
|
generate_vad_frame_pred, |
|
generate_vad_segment_table, |
|
init_vad_model, |
|
prepare_manifest, |
|
) |
|
from nemo.core.config import hydra_runner |
|
from nemo.utils import logging |
|
|
|
# Inference device: the first CUDA GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
|
|
|
|
|
def _load_key_meta_map(manifest_path):
    """Map each audio file's unique name to its manifest metadata.

    Args:
        manifest_path: path of a JSON-lines manifest; every non-empty line must
            be a JSON object containing an 'audio_filepath' key.

    Returns:
        dict keyed by the audio file's base name with its last extension
        removed; each value is {'audio_filepath': <path as written in the manifest>}.

    Raises:
        ValueError: if two manifest lines resolve to the same unique name.
    """
    key_meta_map = {}
    with open(manifest_path, 'r', encoding='utf-8') as manifest:
        for line in manifest:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # failing inside json.loads.
                continue
            audio_filepath = json.loads(line)['audio_filepath']
            uniq_audio_name = audio_filepath.split('/')[-1].rsplit('.', 1)[0]
            if uniq_audio_name in key_meta_map:
                raise ValueError("Please make sure each line is with different audio_filepath! ")
            key_meta_map[uniq_audio_name] = {'audio_filepath': audio_filepath}
    return key_meta_map


@hydra_runner(config_path="../conf/vad", config_name="vad_inference_postprocessing.yaml")
def main(cfg):
    """Run VAD frame-level inference and the optional post-processing stages.

    Stages, in order:
      1. Read the evaluation manifest and check that audio names are unique.
      2. Optionally split long audio entries (cfg.prepare_manifest.auto_split).
      3. Load the VAD model and write frame-level predictions to cfg.frame_out_dir.
      4. Optionally smooth predictions over overlapping segments
         (cfg.vad.parameters.smoothing, which is also the smoothing method).
      5. Optionally convert predictions to a speech segment table (cfg.gen_seg_table)
         and, if cfg.write_to_manifest is set, write an output manifest that
         points at the generated segment files.

    Args:
        cfg: configuration object supplied by hydra_runner. Fields read here:
            dataset, num_workers, prepared_manifest_vad_input, frame_out_dir,
            smoothing_out_dir, gen_seg_table, table_out_dir, write_to_manifest,
            out_manifest_filepath, prepare_manifest.{auto_split, split_duration},
            vad.model_path and vad.parameters.{window_length_in_sec,
            shift_length_in_sec, normalize_audio, smoothing, overlap, postprocessing}.

    Raises:
        ValueError: if cfg.dataset is empty, or if two manifest entries share
            the same unique audio name.
    """
    if not cfg.dataset:
        raise ValueError("You must input the path of json file of evaluation data")

    # Built from the original manifest (before any splitting) so the output
    # manifest has one entry per input audio file.
    key_meta_map = _load_key_meta_map(cfg.dataset)

    manifest_vad_input = cfg.dataset
    if cfg.prepare_manifest.auto_split:
        logging.info("Split long audio file to avoid CUDA memory issue")
        logging.debug("Try smaller split_duration if you still have CUDA memory issue")
        config = {
            'input': manifest_vad_input,
            'window_length_in_sec': cfg.vad.parameters.window_length_in_sec,
            'split_duration': cfg.prepare_manifest.split_duration,
            'num_workers': cfg.num_workers,
            'prepared_manifest_vad_input': cfg.prepared_manifest_vad_input,
        }
        manifest_vad_input = prepare_manifest(config)
    else:
        logging.warning(
            "If you encounter CUDA memory issue, try splitting manifest entry by split_duration to avoid it."
        )

    # Inference only: disable autograd globally for the rest of the process.
    torch.set_grad_enabled(False)
    vad_model = init_vad_model(cfg.vad.model_path)

    vad_model.setup_test_data(
        test_data_config={
            'vad_stream': True,
            'sample_rate': 16000,
            'manifest_filepath': manifest_vad_input,
            'labels': ['infer',],
            'num_workers': cfg.num_workers,
            'shuffle': False,
            'window_length_in_sec': cfg.vad.parameters.window_length_in_sec,
            'shift_length_in_sec': cfg.vad.parameters.shift_length_in_sec,
            'trim_silence': False,
            'normalize_audio': cfg.vad.parameters.normalize_audio,
        }
    )

    vad_model = vad_model.to(device)
    vad_model.eval()

    if os.path.exists(cfg.frame_out_dir):
        logging.warning(
            "Note frame_out_dir exists. If new file has same name as file inside existing folder, it will append result to existing file and might cause mistakes for next steps."
        )
    else:
        # makedirs also creates missing parent directories; exist_ok guards
        # against another process creating the directory in the meantime.
        os.makedirs(cfg.frame_out_dir, exist_ok=True)

    logging.info("Generating frame level prediction ")
    pred_dir = generate_vad_frame_pred(
        vad_model=vad_model,
        window_length_in_sec=cfg.vad.parameters.window_length_in_sec,
        shift_length_in_sec=cfg.vad.parameters.shift_length_in_sec,
        manifest_vad_input=manifest_vad_input,
        out_dir=cfg.frame_out_dir,
    )
    logging.info(
        f"Finish generating VAD frame level prediction with window_length_in_sec={cfg.vad.parameters.window_length_in_sec} and shift_length_in_sec={cfg.vad.parameters.shift_length_in_sec}"
    )
    # Without smoothing, one prediction is produced per window shift.
    frame_length_in_sec = cfg.vad.parameters.shift_length_in_sec

    if cfg.vad.parameters.smoothing:
        logging.info("Generating predictions with overlapping input segments")
        smoothing_pred_dir = generate_overlap_vad_seq(
            frame_pred_dir=pred_dir,
            smoothing_method=cfg.vad.parameters.smoothing,
            overlap=cfg.vad.parameters.overlap,
            window_length_in_sec=cfg.vad.parameters.window_length_in_sec,
            shift_length_in_sec=cfg.vad.parameters.shift_length_in_sec,
            num_workers=cfg.num_workers,
            out_dir=cfg.smoothing_out_dir,
        )
        logging.info(
            f"Finish generating predictions with overlapping input segments with smoothing_method={cfg.vad.parameters.smoothing} and overlap={cfg.vad.parameters.overlap}"
        )
        pred_dir = smoothing_pred_dir
        # NOTE(review): hard-coded frame length for the smoothed output; presumably
        # generate_overlap_vad_seq emits 10 ms frames - confirm against that helper.
        frame_length_in_sec = 0.01

    if cfg.gen_seg_table:
        logging.info("Converting frame level prediction to speech/no-speech segment in start and end times format.")
        table_out_dir = generate_vad_segment_table(
            vad_pred_dir=pred_dir,
            postprocessing_params=cfg.vad.parameters.postprocessing,
            frame_length_in_sec=frame_length_in_sec,
            num_workers=cfg.num_workers,
            out_dir=cfg.table_out_dir,
        )
        logging.info(
            f"Finish generating speech semgents table with postprocessing_params: {cfg.vad.parameters.postprocessing}"
        )

        # Kept inside the gen_seg_table branch: table_out_dir only exists once
        # the segment table has been generated.
        if cfg.write_to_manifest:
            for uniq_audio_name, meta in key_meta_map.items():
                meta['rttm_filepath'] = os.path.join(table_out_dir, uniq_audio_name + ".txt")

            out_manifest_filepath = cfg.out_manifest_filepath or "vad_out.json"
            out_manifest_filepath = write_rttm2manifest(key_meta_map, out_manifest_filepath)
            logging.info(f"Writing VAD output to manifest: {out_manifest_filepath}")
|
|
|
|
|
if __name__ == '__main__':
    # Called without arguments: main is wrapped by hydra_runner, which is
    # expected to supply the cfg argument.
    main()
|
|