camenduru
/

NeMo

Model card Files Files and versions Community

NeMo / examples /tts /g2p /utils.py

camenduru's picture

thanks to NVIDIA ❤

7934b29 about 2 years ago

history blame contribute delete

3.67 kB

	# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import json

	from nemo.collections.asr.metrics.wer import word_error_rate
	from nemo.collections.tts.models.g2p_ctc import CTCG2PModel
	from nemo.collections.tts.models.g2p_t5 import T5G2PModel
	from nemo.utils import logging


	def get_model(cfg, trainer):
	    """
	    Build the G2P model selected by ``cfg.name``.

	    Args:
	        cfg: config object exposing ``name`` (model selector) and ``model`` (passed to the model constructor)
	        trainer: trainer object forwarded to the model constructor
	    Return:
	        CTCG2PModel if ``cfg.name`` contains "CTC", T5G2PModel if ``cfg.name`` equals "T5G2P"
	    Raises:
	        ValueError: if ``cfg.name`` matches neither of the supported models
	    """
	    model_name = cfg.name

	    # Any name containing "CTC" selects the CTC model; this check runs first.
	    if "CTC" in model_name:
	        return CTCG2PModel(cfg=cfg.model, trainer=trainer)

	    # The T5 model requires an exact name match.
	    if model_name == "T5G2P":
	        return T5G2PModel(cfg=cfg.model, trainer=trainer)

	    raise ValueError(f"{model_name} is not supported. Choose from [G2P-Conformer-CTC, T5G2P]")


	def get_metrics(manifest: str, pred_field="pred_text", phoneme_field="text", grapheme_field="text_graphemes"):
	    """
	    Calculates WER and PER metrics (for duplicated grapheme entries with multiple reference values,
	    the best matching prediction will be used for evaluation.)

	    The manifest is read line by line, each non-blank line being parsed as one JSON object.
	    PER is computed with ``word_error_rate(..., use_cer=True)``.

	    Args:
	        manifest: Path to .json manifest file (one JSON object per line, read as UTF-8)
	        pred_field: name of the field in the manifest that holds the predictions
	        phoneme_field: name of the field in the manifest for ground truth phonemes
	        grapheme_field: name of the field in the manifest for input grapheme text

	    Returns: WER and PER values, as a ``(wer, per)`` tuple

	    Raises:
	        KeyError: if an entry lacks one of the requested fields
	    """
	    all_preds = []
	    all_references = []
	    # grapheme text -> positions (in all_preds / all_references) of every entry that shares it
	    indices_by_grapheme = {}
	    with open(manifest, "r", encoding="utf-8") as f:
	        for raw_line in f:
	            # tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads
	            if not raw_line.strip():
	                continue
	            entry = json.loads(raw_line)
	            # index by parsed-entry count so skipped blank lines cannot misalign the positions
	            idx = len(all_preds)
	            all_preds.append(entry[pred_field])
	            all_references.append(entry[phoneme_field])
	            indices_by_grapheme.setdefault(entry[grapheme_field], []).append(idx)

	    # collect all examples with multiple phoneme options and same grapheme form, choose the one with min PER
	    lines_to_drop = set()  # a set keeps the membership tests below O(1)
	    for indices in indices_by_grapheme.values():
	        if len(indices) < 2:
	            continue
	        pers = [
	            word_error_rate(hypotheses=[all_preds[i]], references=[all_references[i]], use_cer=True)
	            for i in indices
	        ]
	        # on ties the first entry with the minimal PER is kept
	        best_idx = indices[pers.index(min(pers))]
	        lines_to_drop.update(i for i in indices if i != best_idx)

	    # drop duplicated examples, only keep with min PER
	    all_preds = [x for i, x in enumerate(all_preds) if i not in lines_to_drop]
	    all_references = [x for i, x in enumerate(all_references) if i not in lines_to_drop]

	    wer = word_error_rate(hypotheses=all_preds, references=all_references)
	    per = word_error_rate(hypotheses=all_preds, references=all_references, use_cer=True)

	    logging.info(f"{manifest}: PER: {per * 100:.2f}%, WER: {wer * 100:.2f}%, lines: {len(all_references)}")
	    return wer, per