|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
import re |
|
import shutil |
|
|
|
from nemo.collections.nlp.data.data_utils.data_preprocessing import DATABASE_EXISTS_TMP, if_exist, write_files |
|
from nemo.utils import logging |
|
|
|
|
|
def copy_input_files(infold):
    """
    Stage the first cross-validation fold under ``<infold>/dataset``.

    The train set is copied to ``<infold>/dataset/trainset`` and the CSV part of
    the test set to ``<infold>/dataset/testset``. A destination that already
    exists is left alone; when both exist nothing is done at all.

    Args:
        infold: location of an original fold of the dataset (in the sense of k-fold cross validation)
    """
    our_infold = infold + "/dataset"
    train_target = our_infold + "/trainset"
    test_target = our_infold + "/testset"

    # Nothing to do when both staged folders are already in place.
    if os.path.exists(train_target) and os.path.exists(test_target):
        logging.info("Input folders exists")
        return

    logging.info(f"Copying files to input folder: {our_infold}")
    os.makedirs(infold, exist_ok=True)

    old_infold = (
        infold + '/CrossValidation/autoGeneFromRealAnno/autoGene_2018_03_22-13_01_25_169/CrossValidation/KFold_1'
    )

    # (source, destination) pairs, handled in the same order: train first, then test.
    copy_plan = (
        (old_infold + '/trainset', train_target),
        (old_infold + '/testset/csv', test_target),
    )
    for source, target in copy_plan:
        if not os.path.exists(target):
            shutil.copytree(source, target)
|
|
|
|
|
def get_intents(infold):
    """
    Get the sorted list of intents from the CSV file names in a folder.

    Only entries whose name ends with ``.csv`` are considered, so stray files
    (editor backups, ``.DS_Store``, READMEs, ...) do not turn into bogus intent
    names that would later fail to open as ``<intent>.csv``.

    Args:
        infold: folder that holds one ``<intent>.csv`` file per intent

    Returns:
        Alphabetically sorted list of intent names (file names without ``.csv``).
    """
    suffix = '.csv'
    intents = sorted(name[: -len(suffix)] for name in os.listdir(infold) if name.endswith(suffix))
    logging.info(f'Found {len(intents)} intents')

    return intents
|
|
|
|
|
def get_intent_queries(infold, intent_names, mode):
    """
    Get list of queries with their corresponding intent number.

    Args:
        infold: folder that contains the ``<mode>set`` sub-folder
        intent_names: list of intents; the position of an intent in this list is its label
        mode: prefix of the sub-folder to read (e.g. ``train`` reads ``trainset``)

    Returns:
        List of strings: the header ``'sentence\\tlabel\\n'`` followed by one
        ``'<sentence>\\t<intent index>'`` entry per data row.
    """
    intent_queries = ['sentence\tlabel\n']

    for index, intent in enumerate(intent_names):
        # Context manager so the file handle is closed deterministically.
        with open(f'{infold}/{mode}set/{intent}.csv', 'r', encoding='utf-8') as csv_file:
            queries = csv_file.readlines()

        # The first line of every file is the column header.
        for query in queries[1:]:
            phrases = query.split(";")
            # The fifth ';'-separated field holds the sentence; its first and last
            # characters are dropped (presumably padding/quoting -- confirm against the data).
            intent_queries.append(phrases[4][1:-1] + "\t" + str(index))

    return intent_queries
|
|
|
|
|
def get_slots(infold, modes):
    """
    Find a list of unique slot types in training and testing data.
    We use a single slot type name both for starting and continuation tokens (not using B-, I- notation).

    Args:
        infold: folder that contains one ``<mode>set`` sub-folder per mode
        modes: iterable of mode prefixes to scan (e.g. ``['train', 'test']``)

    Returns:
        Sorted list of slot type names with the outside label ``"O"`` appended last.
    """
    # Raw string avoids invalid-escape warnings; compiled once for all lines.
    slot_pattern = re.compile(r'\[.*?\]')
    slots = set()

    for mode in modes:
        path = f'{infold}/{mode}set'
        for filename in os.listdir(path):
            # Context manager so the file handle is closed deterministically.
            with open(f'{path}/{filename}', 'r', encoding='utf-8') as csv_file:
                lines = csv_file.readlines()

            # The first line of every file is the column header.
            for line in lines[1:]:
                # The fourth ';'-separated field is the annotated query,
                # with slots written as "[slot_type : slot value]".
                query = line.split(";")[3]
                for slot_phrase in slot_pattern.findall(query):
                    # Keep the text before " : " and drop the leading "[".
                    slots.add(slot_phrase.split(" : ")[0][1:])

    slots = sorted(slots)
    # "O" goes last on purpose: its index is then len(slots) - 1.
    slots.append("O")
    logging.info(f'Found {len(slots)} slot types')

    return slots
|
|
|
|
|
def get_slot_queries(infold, slot_dict, mode, intent_names):
    """
    Convert each word in a query to corresponding slot number.

    Annotated queries mark slots as ``[slot_type : word word]``. The ``[slot_type``
    and ``:`` tokens produce no label; every other token gets the index of the slot
    it belongs to, or the outside index when it is not inside brackets.

    Args:
        infold: fold of the data
        slot_dict: dict containing slot-names to positions
        mode: train, validation or test
        intent_names: list of intents

    Returns:
        List with one space-separated string of slot indices per data row.
    """
    slot_queries = []
    # The outside label is taken to be the last position in slot_dict.
    outside_slot = len(slot_dict) - 1

    # Files are visited in the order of intent_names, and rows in file order.
    for intent in intent_names:
        # Context manager so the file handle is closed deterministically.
        with open(f'{infold}/{mode}set/{intent}.csv', 'r', encoding='utf-8') as csv_file:
            lines = csv_file.readlines()

        # The first line of every file is the column header.
        for line in lines[1:]:
            labels = []
            # The fourth ';'-separated field is the annotated query.
            query = line.split(";")[3]
            current_slot = outside_slot
            for word in query.split(" "):
                if not word:
                    # Consecutive spaces (or an empty field) yield empty tokens;
                    # they are not words, and indexing them would raise IndexError.
                    continue
                if word[0] == "[":
                    # "[slot_type" opens a slot and emits no label itself.
                    current_slot = slot_dict[word[1:]]
                elif word[0] == ":":
                    # Separator between the slot type and the slot words.
                    continue
                else:
                    labels.append(str(current_slot))
                    if word[-1] == ']':
                        # Last word of the slot: following words are outside again.
                        current_slot = outside_slot

            slot_queries.append(" ".join(labels))

    return slot_queries
|
|
|
|
|
def process_assistant(infold, outfold, modes=('train', 'test')):
    """
    https://github.com/xliuhw/NLU-Evaluation-Data - this dataset includes
    about 25 thousand examples with 66 various multi-domain intents and 57 entity types.

    Writes the intent dictionary, the slot dictionary and, for every mode, the
    ``<mode>.tsv`` and ``<mode>_slots.tsv`` files into ``outfold``.

    Args:
        infold: folder with the original dataset
        outfold: folder the converted files are written to
        modes: dataset splits to convert; each ``<mode>`` is read from ``<mode>set``

    Returns:
        ``outfold``, both when the data was already converted and after converting it.
    """
    # Materialize once: modes is iterated several times below.
    modes = list(modes)

    if if_exist(outfold, [f'{mode}_slots.tsv' for mode in modes]):
        # NOTE(review): the dataset label logged here is 'robot' although this is the
        # assistant dataset -- confirm whether that is intended.
        logging.info(DATABASE_EXISTS_TMP.format('robot', outfold))
        return outfold

    logging.info(f'Processing assistant commands dataset and store at {outfold}')
    os.makedirs(outfold, exist_ok=True)

    # Stage the train/test files under <infold>/dataset and work from there.
    copy_input_files(infold)
    infold += "/dataset"

    # Intents are derived from the train folder only.
    intent_names = get_intents(infold + "/trainset")
    write_files(intent_names, f'{outfold}/dict.intents.csv')

    # Queries with their intent label, one file per mode.
    for mode in modes:
        intent_queries = get_intent_queries(infold, intent_names, mode)
        write_files(intent_queries, f'{outfold}/{mode}.tsv')

    # Unique slot types across all modes.
    slot_types = get_slots(infold, modes)
    write_files(slot_types, f'{outfold}/dict.slots.csv')

    # Per-word slot labels, one file per mode.
    slot_dict = {k: v for v, k in enumerate(slot_types)}
    for mode in modes:
        slot_queries = get_slot_queries(infold, slot_dict, mode, intent_names)
        write_files(slot_queries, f'{outfold}/{mode}_slots.tsv')

    # Same return value as the early-exit path above.
    return outfold
|
|