# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import glob
import json
import os
from dataclasses import dataclass, is_dataclass
from pathlib import Path
from typing import List, Optional
import pytorch_lightning as pl
import torch
from omegaconf import OmegaConf
from nemo.collections.asr.models import AudioToAudioModel
from nemo.core.config import hydra_runner
from nemo.utils import logging, model_utils
"""
Process audio files on a single CPU/GPU. Useful for processing moderate amounts of audio data.
# Arguments
model_path: path to .nemo checkpoint for an AudioToAudioModel
pretrained_name: name of a pretrained AudioToAudioModel model (from NGC registry)
audio_dir: path to directory with audio files
dataset_manifest: path to dataset JSON manifest file (in NeMo format)
input_channel_selector: list of channels to take from audio files, defaults to `None` and takes all available channels
input_key: key for audio filepath in the manifest file, defaults to `audio_filepath`
output_dir: Directory where processed files will be saved
output_filename: Output filename where manifest pointing to processed files will be written
batch_size: batch size during inference
  cuda: Optional int to select the CUDA device on which to run the model; a negative value forces CPU-only inference
amp: Bool to decide if Automatic Mixed Precision should be used during inference
audio_type: Str filetype of the audio. Supported = wav, flac, mp3
  overwrite_output: Bool which, when set, allows repeated processing runs to overwrite previous results.
# Usage
AudioToAudioModel can be specified by either `model_path` or `pretrained_name`.
Data for processing can be defined with either `audio_dir` or `dataset_manifest`.
Processed audio is saved in `output_dir`, and a manifest for processed files is saved
in `output_filename`.
```
python process_audio.py \
model_path=null \
pretrained_name=null \
audio_dir="" \
dataset_manifest="" \
input_channel_selector=[] \
output_dir="" \
output_filename="" \
batch_size=1 \
cuda=0 \
amp=True
```
"""
@dataclass
class ProcessConfig:
# Required configs
model_path: Optional[str] = None # Path to a .nemo file
pretrained_name: Optional[str] = None # Name of a pretrained model
audio_dir: Optional[str] = None # Path to a directory which contains audio files
dataset_manifest: Optional[str] = None # Path to dataset's JSON manifest
# Audio configs
    input_channel_selector: Optional[List] = None  # Union types are not supported in structured configs, otherwise this would be Optional[Union[List, int]]
input_key: Optional[str] = None # Can be used with a manifest
# General configs
output_dir: Optional[str] = None
output_filename: Optional[str] = None
batch_size: int = 1
num_workers: int = 0
# Override model config
override_config_path: Optional[str] = None # path to a yaml config that will override the internal config file
    # Set `cuda` to an int to select a CUDA device. If `None`, a CUDA device is
    # used when available and inference falls back to CPU otherwise.
    # If `cuda` is a negative number, inference will be on CPU only.
cuda: Optional[int] = None
amp: bool = False
audio_type: str = "wav"
# Recompute model predictions, even if the output folder exists.
overwrite_output: bool = False
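# Example invocation (hypothetical paths) showing Hydra's list syntax for
# `input_channel_selector`, here keeping only the first two input channels:
#
#   python process_audio.py pretrained_name=<model_name> \
#       dataset_manifest=/data/manifest.json \
#       input_channel_selector=[0,1] \
#       output_dir=/data/processed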
@hydra_runner(config_name="ProcessConfig", schema=ProcessConfig)
def main(cfg: ProcessConfig) -> ProcessConfig:
logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
if is_dataclass(cfg):
cfg = OmegaConf.structured(cfg)
    if cfg.model_path is None and cfg.pretrained_name is None:
        raise ValueError("One of cfg.model_path or cfg.pretrained_name must be set!")
    if cfg.audio_dir is None and cfg.dataset_manifest is None:
        raise ValueError("One of cfg.audio_dir or cfg.dataset_manifest must be set!")
# setup GPU
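    # Device resolution examples (hypothetical values): cuda=None picks the first
    # CUDA device if one is available and falls back to CPU otherwise; cuda=2
    # selects CUDA device 2; a negative value such as cuda=-1 forces CPU only.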
if cfg.cuda is None:
if torch.cuda.is_available():
device = [0] # use 0th CUDA device
accelerator = 'gpu'
else:
device = 1
accelerator = 'cpu'
    elif cfg.cuda < 0:
        # a negative `cuda` value explicitly requests CPU-only inference
        device = 1
        accelerator = 'cpu'
    else:
        device = [cfg.cuda]
        accelerator = 'gpu'
map_location = torch.device('cuda:{}'.format(device[0]) if accelerator == 'gpu' else 'cpu')
# setup model
if cfg.model_path is not None:
# restore model from .nemo file path
model_cfg = AudioToAudioModel.restore_from(restore_path=cfg.model_path, return_config=True)
classpath = model_cfg.target # original class path
imported_class = model_utils.import_class_by_path(classpath) # type: AudioToAudioModel
        logging.info(f"Restoring model: {imported_class.__name__}")
audio_to_audio_model = imported_class.restore_from(
restore_path=cfg.model_path, override_config_path=cfg.override_config_path, map_location=map_location
) # type: AudioToAudioModel
model_name = os.path.splitext(os.path.basename(cfg.model_path))[0]
else:
# restore model by name
audio_to_audio_model = AudioToAudioModel.from_pretrained(
model_name=cfg.pretrained_name, map_location=map_location
) # type: AudioToAudioModel
model_name = cfg.pretrained_name
trainer = pl.Trainer(devices=device, accelerator=accelerator)
audio_to_audio_model.set_trainer(trainer)
audio_to_audio_model = audio_to_audio_model.eval()
if cfg.audio_dir is not None:
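        # e.g. audio_dir="/data/noisy" with audio_type="wav" (hypothetical values)
        # matches /data/noisy/**/*.wav, including files in nested subdirectories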
filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True))
else:
# get filenames from manifest
filepaths = []
if os.stat(cfg.dataset_manifest).st_size == 0:
raise RuntimeError(f"The input dataset_manifest {cfg.dataset_manifest} is empty.")
input_key = 'audio_filepath' if cfg.input_key is None else cfg.input_key
manifest_dir = Path(cfg.dataset_manifest).parent
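        # resolve relative manifest entries against the manifest's own directory,
        # e.g. a hypothetical entry "utt0001.wav" in /data/manifest.json resolves
        # to /data/utt0001.wav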
with open(cfg.dataset_manifest, 'r') as f:
for line in f:
item = json.loads(line)
audio_file = Path(item[input_key])
if not audio_file.is_file() and not audio_file.is_absolute():
audio_file = manifest_dir / audio_file
filepaths.append(str(audio_file.absolute()))
logging.info(f"\nProcessing {len(filepaths)} files...\n")
# setup AMP (optional)
if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
logging.info("AMP enabled!\n")
autocast = torch.cuda.amp.autocast
else:
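        # no-op fallback so the `with autocast():` call below works the same
        # whether or not AMP is available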
@contextlib.contextmanager
def autocast():
yield
    # Compute output directory
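    # e.g. (hypothetical values) audio_dir="/data/noisy" and model_name="enhancer"
    # yield output_dir="/data/noisy_processed_enhancer"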
if cfg.output_dir is None:
# create default output filename
if cfg.audio_dir is not None:
cfg.output_dir = os.path.dirname(os.path.join(cfg.audio_dir, '.')) + f'_processed_{model_name}'
else:
cfg.output_dir = os.path.dirname(cfg.dataset_manifest) + f'_processed_{model_name}'
# Compute output filename
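    # e.g. output_dir="/data/noisy_processed_enhancer" yields the manifest
    # "/data/noisy_processed_enhancer_manifest.json"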
if cfg.output_filename is None:
# create default output filename
cfg.output_filename = cfg.output_dir.rstrip('/') + '_manifest.json'
    # if outputs should not be overwritten and the output directory already exists, raise instead of reprocessing
if not cfg.overwrite_output and os.path.exists(cfg.output_dir):
        raise RuntimeError(
            f"Previous output found at {cfg.output_dir}, and flag `overwrite_output` "
            f"is {cfg.overwrite_output}. Returning without processing."
        )
# Process audio
with autocast():
with torch.no_grad():
paths2processed_files = audio_to_audio_model.process(
paths2audio_files=filepaths,
output_dir=cfg.output_dir,
batch_size=cfg.batch_size,
num_workers=cfg.num_workers,
input_channel_selector=cfg.input_channel_selector,
)
logging.info(f"Finished processing {len(filepaths)} files!")
logging.info(f"Processed audio is available in the output directory: {cfg.output_dir}")
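    # `process` is assumed to return one output path per input file, in input
    # order; the per-line manifest update below relies on this correspondence.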
# Prepare new/updated manifest with a new key for processed audio
with open(cfg.output_filename, 'w', encoding='utf-8') as f:
if cfg.dataset_manifest is not None:
with open(cfg.dataset_manifest, 'r') as fr:
for idx, line in enumerate(fr):
item = json.loads(line)
item['processed_audio_filepath'] = paths2processed_files[idx]
f.write(json.dumps(item) + "\n")
else:
for idx, processed_file in enumerate(paths2processed_files):
item = {'processed_audio_filepath': processed_file}
f.write(json.dumps(item) + "\n")
return cfg
if __name__ == '__main__':
main() # noqa pylint: disable=no-value-for-parameter