# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Use this script to create a lexicon file for Flashlight decoding from an existing KenLM arpa file.
# A lexicon file is required for Flashlight decoding in most cases, as it acts as a map from the words
# in your arpa file to the token representation used by your ASR acoustic model.
# For more details, see: https://github.com/flashlight/flashlight/tree/main/flashlight/app/asr#data-preparation
#
# Usage: python create_lexicon_from_arpa.py --arpa /path/to/english.arpa --model /path/to/model.nemo --lower
#
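# For illustration (these entries are made up, not taken from a real arpa file): a unigram line such as
#     -3.4012<tab>hello<tab>-0.3012
# produces, without --model, the character-level lexicon line
#     hello<tab>h e l l o
# and, with the tokenizer of a subword (e.g. BPE) NeMo model, something like
#     hello<tab>▁hel lo
# where the exact split depends on that tokenizer's vocabulary.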
#
import argparse
import os
import re

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Utility script for generating a lexicon file from a KenLM arpa file")
    parser.add_argument("--arpa", required=True, help="path to your arpa file")
    parser.add_argument("--dst", help="directory to store the generated lexicon", default=None)
    parser.add_argument("--lower", action='store_true', help="whether to lowercase the arpa vocab")
    parser.add_argument("--model", default=None, help="path to a NeMo model, to use its tokenizer")
    args = parser.parse_args()

    if not os.path.exists(args.arpa):
        print(f"ARPA file [ {args.arpa} ] not detected on disk, aborting!", flush=True)
        exit(255)

    # Store the lexicon next to the arpa file unless --dst is given
    if args.dst is not None:
        save_path = args.dst
    else:
        save_path = os.path.dirname(args.arpa)
    os.makedirs(save_path, exist_ok=True)

    # Load the tokenizer from the NeMo model if one was supplied;
    # otherwise fall back to a character-level lexicon
    tokenizer = None
    if args.model is not None:
        from nemo.collections.asr.models import ASRModel

        model = ASRModel.restore_from(restore_path=args.model, map_location='cpu')
        if hasattr(model, 'tokenizer'):
            tokenizer = model.tokenizer
        else:
            print('WARNING: supplied NeMo model does not contain a tokenizer', flush=True)

    lex_file = os.path.join(save_path, os.path.splitext(os.path.basename(args.arpa))[0] + '.lexicon')
    print(f"Writing lexicon file - {lex_file}...", flush=True)

    with open(lex_file, "w", encoding='utf_8', newline='\n') as f:
        with open(args.arpa, "r", encoding='utf_8') as arpa:
            for line in arpa:
                # Keep only unigram entries, i.e. "<log-prob>\t<word>[\t<backoff>]";
                # higher-order n-grams contain spaces in the word field and fail the match
                if not re.match(r"[-]*[0-9\.]+\t\S+\t*[-]*[0-9\.]*$", line):
                    continue
                word = line.split("\t")[1]
                word = word.strip().lower() if args.lower else word.strip()
                # Skip the LM's special tokens - they have no acoustic realization
                if word == "<UNK>" or word == "<unk>" or word == "<s>" or word == "</s>":
                    continue
                # Map the word to a space-separated token sequence: individual characters
                # without a tokenizer, subword tokens with one
                if tokenizer is None:
                    f.write("{w}\t{s}\n".format(w=word, s=" ".join(word)))
                else:
                    f.write("{w}\t{s}\n".format(w=word, s=" ".join(tokenizer.text_to_tokens(word))))

    print("Done!", flush=True)