NeMo

File size: 6,997 Bytes

7934b29

# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import pickle as pkl
import random
from argparse import ArgumentParser

import pandas as pd
from omegaconf import OmegaConf
from tqdm import tqdm

# Column labels for the pipe-delimited MRCONSO.RRF file, listed in file order.
# They are documented in the UMLS Reference Manual, section 3.3.4, Table 1:
# https://www.ncbi.nlm.nih.gov/books/NBK9685/
HEADERS = (
    'CUI LAT TS LUI STT SUI ISPREF AUI SAUI '
    'SCUI SDUI SAB TTY CODE STR SRL SUPPRESS CVF'
).split()


def _write_synonym_pairs(cui, names, max_pairs, train_split, train_file, val_file):
    """
    Writes up to max_pairs randomly chosen synonym pairs for a single concept.

    Arguments:
        cui (str): concept identifier with its leading letter, e.g. 'C0000005'.
                   Only read when at least one pair can be formed.
        names (list): English concept strings collected for this CUI
        max_pairs (int): max number of pairs written for this CUI
        train_split (float): probability that a pair goes to the train file
        train_file (file): open, writable text file for the train split
        val_file (file): open, writable text file for the validation split
    """
    pairs = list(itertools.combinations(names, 2))

    if not pairs:
        # Fewer than two synonyms: nothing to pair for this concept
        return

    # Removing leading C to convert label string to int
    label = int(cui[1:])
    random.shuffle(pairs)

    # Keep up to max pairs number pairs for any one concept
    for first, second in pairs[:max_pairs]:
        # Each pair is randomly routed to exactly one of the two splits
        target = train_file if random.random() <= train_split else val_file
        target.write(f'{label}\t{first}\t{second}\n')


def process_umls_training_dataset(data_path, train_save_name, val_save_name, max_pairs, train_split, headers):
    """
    Generates and saves UMLS self alignment pretraining train and validation data. Takes the raw .RRF UMLS
    data file and creates different pair combinations for entities with the same CUI. Each row in the output
    is formatted as 'CUI EntitySynonym1 EntitySynonym2' with each item in a row separated by tabs, where
    CUI is written as an integer (the leading 'C' is stripped).
    Saves two .tsv output files, one for the train split and one for the validation split.
    Only data marked as English is added to the train and val splits.

    Rows are grouped by runs of consecutive identical CUI values, so the input is expected to keep
    all rows of a concept together. Rows whose STR value is not a string, or contains a '|', are
    treated as incorrectly formatted and skipped. The random module is seeded with 2021 so that
    the shuffling and the train/val assignment are reproducible.

    Arguments:
        data_path (str): path to MRCONSO.RRF UMLS data file
        train_save_name (str): path to where training data will be saved
        val_save_name (str): path to where validation data will be saved
        max_pairs (int): max number of pairs for any one CUI added to the train
                   or validation splits
        train_split (float): fraction of pairs (between 0 and 1) to be added to the train set split
        headers (list): column labels within MRCONSO.RRF
    """

    print("Loading training data file...")
    df = pd.read_table(data_path, names=headers, index_col=False, delimiter='|')
    random.seed(2021)

    current_cui = None
    names = []

    # Context managers guarantee both outputs are flushed and closed even on error
    with open(train_save_name, 'w') as train_file, open(val_save_name, 'w') as val_file:
        # Iterating the three needed columns directly avoids per-cell .iloc lookups
        rows = zip(df["CUI"], df["LAT"], df["STR"])

        for row_cui, language, concept_string in tqdm(rows, total=len(df)):
            # Address incorrectly formatted data
            if not isinstance(concept_string, str) or "|" in concept_string:
                continue

            if row_cui != current_cui:
                # Concept changed: pair off the synonyms gathered so far, then start over
                _write_synonym_pairs(current_cui, names, max_pairs, train_split, train_file, val_file)
                current_cui = row_cui
                names = []

            # Collect only english concept strings; other languages must never
            # end up in a pair, not even as the first string of a new concept
            if language == "ENG":
                names.append(concept_string)

        # The final concept has no following row to trigger a flush, so flush it here
        _write_synonym_pairs(current_cui, names, max_pairs, train_split, train_file, val_file)

    print("Finished making training and validation data")


def process_umls_index_dataset(data_path, data_savename, id2string_savename, headers):
    """
    Generates data file needed to build a UMLS index and a hash table mapping each
    CUI to one canonical concept string. Takes the raw .RRF data file and saves
    a .tsv index concept file as well as a .pkl file of cui to concept string
    mappings. Only data marked as English is added to the index data file.

    Each row of the .tsv file is formatted as 'CUI ConceptString' separated by a tab,
    where CUI is written as an integer (the leading 'C' is stripped). The pickled
    mapping uses the same integer CUIs as keys; the value is the first English string
    seen for that CUI that does not contain a ':'. Rows whose STR value is not a
    string, or contains a '|', are treated as incorrectly formatted and skipped.

    Arguments:
        data_path (str): path to MRCONSO.RRF UMLS data file
        data_savename (str): path to where .tsv index data will be saved
        id2string_savename (str): path to where .pkl cui to string mapping will
                                  be saved
        headers (list): column labels within MRCONSO.RRF
    """

    print("Loading index data file...")
    df = pd.read_table(data_path, names=headers, index_col=False, delimiter='|')
    id2string = {}

    with open(data_savename, "w") as outfile:
        # Iterating the three needed columns directly is much cheaper than iterrows()
        rows = zip(df["CUI"], df["LAT"], df["STR"])

        for raw_cui, language, sent in tqdm(rows, total=df.shape[0]):
            # Address incorrectly formatted data
            if not isinstance(sent, str) or "|" in sent:
                continue

            # Only keeping english concepts
            if language != "ENG":
                continue

            # Removing leading C to convert label string to int
            cui = int(raw_cui[1:])
            outfile.write(f'{cui}\t{sent}\n')

            # Matching each cui to one canonical string representation
            if cui not in id2string and ":" not in sent:
                id2string[cui] = sent

    # Context manager ensures the pickle is fully flushed and the handle is closed
    with open(id2string_savename, "wb") as id2string_file:
        pkl.dump(id2string, id2string_file)

    print("Finished saving index data and id to concept mapping")


if __name__ == '__main__':
    # Script entry point: parse the command line, load the OmegaConf config and
    # build either the index data (--index) or the train/validation pair data.
    arg_parser = ArgumentParser()
    arg_parser.add_argument("--index", help="Whether to process data for building an index", action="store_true")
    arg_parser.add_argument("--project_dir", type=str, default=".", required=False)
    arg_parser.add_argument("--cfg", type=str, default="conf/umls_medical_entity_linking_config.yaml", required=False)
    arg_parser.add_argument(
        "--max_pairs", type=int, default=50, required=False, help="Max number of train pairs for a single concepts"
    )
    arg_parser.add_argument(
        "--train_split", type=float, default=0.99, required=False, help="Precentage of data to add to train set"
    )
    cli_args = arg_parser.parse_args()

    cfg = OmegaConf.load(cli_args.cfg)
    # NOTE(review): presumably the config's paths are resolved relative to
    # project_dir via interpolation - confirm against the YAML file.
    cfg.project_dir = cli_args.project_dir

    if not cli_args.index:
        # Default mode: synonym pairs for self alignment pretraining
        process_umls_training_dataset(
            data_path=cfg.model.raw_data,
            train_save_name=cfg.model.train_ds.data_file,
            val_save_name=cfg.model.validation_ds.data_file,
            max_pairs=cli_args.max_pairs,
            train_split=cli_args.train_split,
            headers=HEADERS,
        )
    else:
        # --index mode: concept file for the index plus the cui to string mapping
        process_umls_index_dataset(
            data_path=cfg.index.raw_data,
            data_savename=cfg.index.index_ds.data_file,
            id2string_savename=cfg.index.id_to_string,
            headers=HEADERS,
        )