NeMo / scripts /dataset_processing /nlp /intent_and_slot /assistant_utils.py

thanks to NVIDIA ❤

7934b29 about 2 years ago

5.97 kB

	# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import os
	import re
	import shutil

	from nemo.collections.nlp.data.data_utils.data_preprocessing import DATABASE_EXISTS_TMP, if_exist, write_files
	from nemo.utils import logging


	def copy_input_files(infold):
	"""
	Put training files in convenient place for conversion to our format.

	Args:
	infold: location of an original fold of the dataset (in the sense of k-fold cross validation)
	"""
	our_infold = infold + "/dataset"

	if os.path.exists(our_infold + "/trainset") and os.path.exists(our_infold + "/testset"):
	logging.info("Input folders exists")
	return

	logging.info(f"Copying files to input folder: {our_infold}")
	os.makedirs(infold, exist_ok=True)

	old_infold = (
	infold + '/CrossValidation/autoGeneFromRealAnno/autoGene_2018_03_22-13_01_25_169/CrossValidation/KFold_1'
	)
	if not os.path.exists(our_infold + "/trainset"):
	shutil.copytree(old_infold + '/trainset', our_infold + '/trainset')

	if not os.path.exists(our_infold + "/testset"):
	shutil.copytree(old_infold + '/testset/csv', our_infold + '/testset')


	def get_intents(infold):
	""" Get list of intents from file names. """
	intents = [f[:-4] for f in os.listdir(infold)]
	intents.sort()
	logging.info(f'Found {len(intents)} intents')

	return intents


	def get_intent_queries(infold, intent_names, mode):
	""" Get list of queries with their corresponding intent number. """
	intent_queries = ['sentence\tlabel\n']

	for index, intent in enumerate(intent_names):
	queries = open(f'{infold}/{mode}set/{intent}.csv', 'r', encoding='utf-8').readlines()
	for query in queries[1:]:
	phrases = query.split(";")
	intent_query = phrases[4][1:-1] + "\t" + str(index)
	intent_queries.append(intent_query)

	return intent_queries


	def get_slots(infold, modes):
	"""
	Find a list of unique slot types in training and testing data.
	We use a single slot type name both for starting and continuation tokens (not using B-, I- notation).
	"""
	slots = set()

	for mode in modes:
	path = f'{infold}/{mode}set'
	for filename in os.listdir(path):
	lines = open(f'{path}/{filename}', 'r', encoding='utf-8').readlines()
	for line in lines[1:]:
	query = line.split(";")[3]
	slot_phrases = re.findall('\[.*?\]', query)
	for slot_phrase in slot_phrases:
	slot = slot_phrase.split(" : ")[0][1:]
	slots.add(slot)

	slots = sorted(slots)
	slots.append("O")
	logging.info(f'Found {len(slots)} slot types')

	return slots


	def get_slot_queries(infold, slot_dict, mode, intent_names):
	"""
	Convert each word in a query to corresponding slot number.
	Args:
	infold: fold of the data
	slot_dict: dict containing slot-names to positions
	mode: train, validation or test
	intent_names: list of intents
	"""
	slot_queries = []
	outside_slot = len(slot_dict) - 1

	# keep the same order of files/queries as for intents
	for intent in intent_names:
	lines = open(f'{infold}/{mode}set/{intent}.csv', 'r', encoding='utf-8').readlines()
	for line in lines[1:]:
	slot_query = ""
	query = line.split(";")[3]
	words = query.split(" ")
	current_slot = outside_slot
	for word in words:
	if word[0] == "[":
	current_slot = slot_dict[word[1:]]
	elif word[0] == ":":
	continue
	else:
	slot_query += str(current_slot) + " "
	if word[-1] == ']':
	current_slot = outside_slot

	slot_queries.append(slot_query.strip())

	return slot_queries


	def process_assistant(infold, outfold, modes=['train', 'test']):
	"""
	https://github.com/xliuhw/NLU-Evaluation-Data - this dataset includes
	about 25 thousand examples with 66 various multi-domain intents and 57 entity types.
	"""
	if if_exist(outfold, [f'{mode}_slots.tsv' for mode in modes]):
	logging.info(DATABASE_EXISTS_TMP.format('robot', outfold))
	return outfold

	logging.info(f'Processing assistant commands dataset and store at {outfold}')
	os.makedirs(outfold, exist_ok=True)

	# copy train/test files to the convenient directory to work with
	copy_input_files(infold)
	infold += "/dataset"

	# get list of intents from train folder (test folder supposed to be the same)
	intent_names = get_intents(infold + "/trainset")
	write_files(intent_names, f'{outfold}/dict.intents.csv')

	# get all train and test queries with their intent
	for mode in modes:
	intent_queries = get_intent_queries(infold, intent_names, mode)
	write_files(intent_queries, f'{outfold}/{mode}.tsv')

	# get list of all unique slots in training and testing files
	slot_types = get_slots(infold, modes)
	write_files(slot_types, f'{outfold}/dict.slots.csv')

	# create files of slot queries
	slot_dict = {k: v for v, k in enumerate(slot_types)}
	for mode in modes:
	slot_queries = get_slot_queries(infold, slot_dict, mode, intent_names)
	write_files(slot_queries, f'{outfold}/{mode}_slots.tsv')