import os import shutil from copy import deepcopy import numpy as np from config import ( BART, BATCH_SIZE, HUMAN_LABEL, LEARNING_RATES, MACHINE_LABEL, MODEL_NAME, MULTIMODEL, NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING, OPTIMIZED_METRIC, PATIENCE, ROBERTA_MODEL_PATHS, SINGLE_FROM_MULTIMODEL, TRAIN_RATIO, VAL_RATIO, tokenizer, ) from datasets import Dataset from sklearn.base import accuracy_score from sklearn.metrics import roc_auc_score from sklearn.neural_network import MLPClassifier from transformers import ( AutoModelForSequenceClassification, DataCollatorWithPadding, EarlyStoppingCallback, Trainer, TrainerCallback, TrainingArguments, ) from texts.bart_score import ( bart_score_in_batch, extract_feature_in_batch, ) from texts.config import OUTPUT_FILE from texts.evaluation import compute_metrics from texts.utils import ( check_error, combine_text_with_BERT_format, parse_multimodal_data, write_to_file, ) class TextDetector: def __init__(self) -> None: self.model = None self.multimodel = None self.train_data = None self.val_data = None self.test_data = None self.train_features = None self.val_features = None self.test_features def text_analysis(text: str) -> float: score = 0.0 return score class CustomCallback(TrainerCallback): """ Custom callback to evaluate the training dataset at the end of each epoch. """ def __init__(self, trainer) -> None: super().__init__() self._trainer = trainer def on_epoch_end(self, args, state, control, **kwargs): """ At the end of each epoch, evaluate the training dataset. """ if control.should_evaluate: control_copy = deepcopy(control) self._trainer.evaluate( eval_dataset=self._trainer.train_dataset, metric_key_prefix="train", ) return control_copy def abstract_train(features, labels): """ Trains a model using the given features and labels. Args: features (list): The input features for training. labels (list): The target labels for training. Returns: object: The trained model. """ model = MLPClassifier() model.fit(features, labels) return model def evaluate_model(model, features, labels): """ Evaluates the model's performance using accuracy and ROC AUC scores. Args: model (object): The trained model to evaluate. features (list): The input features for evaluation. labels (list): The target labels for evaluation. Returns: None """ predictions = model.predict(features) rounded_predictions = [round(value) for value in predictions] accuracy = accuracy_score(labels, rounded_predictions) write_to_file(OUTPUT_FILE, f"Accuracy: {accuracy * 100.0:.1f}%\n") roc_auc = roc_auc_score(labels, rounded_predictions) write_to_file(OUTPUT_FILE, f"ROC AUC: {roc_auc * 100.0:.1f}%\n") def preprocess_function_multimodel(sample): """ Preprocesses a given sample for a multi-model setup by calculating BART scores and formatting the text for BERT input. Args: sample (dict): A dictionary containing a key "text", which is a list of lists of strings. Returns: dict: A dictionary containing tokenized and preprocessed text data. """ num_texts = len(sample["text"][0]) # Number of texts in each sub-sample texts_grouped_by_index = [ [] for _ in range(num_texts) ] # Initialize empty lists for grouping texts by index # Group texts by their index across sub-samples for sub_sample in sample["text"]: for i in range(num_texts): texts_grouped_by_index[i].append(sub_sample[i]) # Calculate BART scores for each text pair (text[0] with text[i]) bart_scores = [ bart_score_in_batch( texts_grouped_by_index[0], texts_grouped_by_index[i], ) for i in range(1, num_texts) ] combined_texts = [] # Process each sub-sample for BERT input for index, sub_sample in enumerate(sample["text"]): text_array = [sub_sample[0]] # Start with the input text score_generation_pairs = [] # Pair scores with their corresponding generations for i in range(1, num_texts): generation_text = sub_sample[i] generation_score = bart_scores[i - 1][index] score_generation_pairs.append((generation_score, generation_text)) # Sort pairs by score in descending order sorted_pairs = sorted(score_generation_pairs, reverse=True) # Append sorted texts to text_array for _, sorted_text in sorted_pairs: text_array.append(sorted_text) # Combine texts into a single BERT-formatted string combined_text = combine_text_with_BERT_format(text_array) combined_texts.append(combined_text) # Tokenize the combined texts for BERT return tokenizer(combined_texts, add_special_tokens=False, truncation=True) def preprocess_function_single_from_multimodel(sample): """ Extracts the first text from each sub-sample in a multi-model sample and tokenizes it. Args: sample (dict): A dictionary containing a key "text", which is a list of lists of strings. Returns: dict: A dictionary containing tokenized text data. """ combined_texts = [] # Iterate through each sub-sample for sub_sample in sample["text"]: input_text = sub_sample[ 0 ] # Extract the first text from the sub-sample combined_texts.append( input_text, ) # Append it to the list of combined texts # Tokenize the combined texts return tokenizer(combined_texts, truncation=True) def train_only_by_transformer_with_test_evaluation_early_stop( train_data, test_data, input_type, num_classes=2, ): """ Trains a transformer model using the provided training and testing datasets with early stopping. Args: train_data (Dataset): The training dataset. test_data (Dataset): The testing dataset. input_type (str): The type of input data, either MULTIMODEL or SINGLE_FROM_MULTIMODEL. num_classes (int, optional): The number of classes for classification. Defaults to 2. Returns: Trainer: The trained model wrapped in a Trainer object. """ # Preprocess datasets based on the input type if input_type == MULTIMODEL: train_data = train_data.map( preprocess_function_multimodel, batched=True, ) test_data = test_data.map(preprocess_function_multimodel, batched=True) elif input_type == SINGLE_FROM_MULTIMODEL: train_data = train_data.map( preprocess_function_single_from_multimodel, batched=True, ) test_data = test_data.map( preprocess_function_single_from_multimodel, batched=True, ) # Data collator to pad inputs data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # Load appropriate model based on number of classes if num_classes == 3: model = AutoModelForSequenceClassification.from_pretrained( "pretrained_model/roberta-base_num_labels_3", num_labels=num_classes, ) else: model = AutoModelForSequenceClassification.from_pretrained( ROBERTA_MODEL_PATHS[MODEL_NAME], num_labels=num_classes, ) learning_rate = LEARNING_RATES[MODEL_NAME] output_folder = "training_with_callbacks" # Remove the output folder if it already exists if os.path.exists(output_folder): shutil.rmtree(output_folder) # Training arguments training_args = TrainingArguments( output_dir=output_folder, evaluation_strategy="epoch", logging_strategy="epoch", save_strategy="epoch", learning_rate=learning_rate, per_device_train_batch_size=BATCH_SIZE, per_device_eval_batch_size=BATCH_SIZE, num_train_epochs=NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING, weight_decay=0.01, push_to_hub=False, metric_for_best_model=OPTIMIZED_METRIC, load_best_model_at_end=True, ) # Create Trainer object trainer = Trainer( model=model, args=training_args, train_dataset=train_data, eval_dataset=test_data, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)], ) # Add custom callback trainer.add_callback(CustomCallback(trainer)) # Start training trainer.train() return trainer def create_pair_sample(data_item, training_indices): """ Creates pair samples for training by comparing human data with machine-generated data. Args: data_item (dict): A dictionary containing 'human', 'single', and 'pair' data. training_indices (list): A list of indices used for training. Returns: list: A list of dictionaries, each containing a 'text' array and a 'label'. """ # Initialize the result list result_samples = [] # Check if there is any error in the data_item if check_error(data_item): return result_samples # Create machine samples for train_idx in training_indices: if data_item["human"] != data_item["single"][train_idx]: text_array = [] machine_text = data_item["single"][train_idx] text_array.append(machine_text) for sub_idx in training_indices: text_array.append(data_item["pair"][train_idx][sub_idx]) sample = { "text": text_array, "label": MACHINE_LABEL, } result_samples.append(sample) # Create human samples text_array = [data_item["human"]] for train_idx in training_indices: text_array.append(data_item["single"][train_idx]) human_sample = { "text": text_array, "label": HUMAN_LABEL, } # Append human samples for each machine sample num_machine_samples = len(result_samples) for _ in range(num_machine_samples): result_samples.append(human_sample) return result_samples def create_pair_test_sample(data_item, training_indices, testing_indices): """ Creates pair test samples by comparing human data with machine-generated data. Args: data_item (dict): A dictionary containing 'human', 'single', and 'pair' data. training_indices (list): A list of indices used for training. testing_indices (list): A list of indices used for testing. Returns: list: A list of dictionaries, each containing a 'text' array and a 'label'. """ # Initialize the result list result_samples = [] # Check if there is any error in the data_item if check_error(data_item): return result_samples # Create machine samples based on testing indices for test_idx in testing_indices: if data_item["human"] != data_item["single"][test_idx]: text_array = [] machine_text = data_item["single"][test_idx] text_array.append(machine_text) for train_idx in training_indices: text_array.append(data_item["pair"][test_idx][train_idx]) sample = { "text": text_array, "label": MACHINE_LABEL, } result_samples.append(sample) # Create human sample text_array = [data_item["human"]] for train_idx in training_indices: text_array.append(data_item["single"][train_idx]) human_sample = { "text": text_array, "label": HUMAN_LABEL, } # Append the human sample for each machine sample num_machine_samples = len(result_samples) for _ in range(num_machine_samples): result_samples.append(human_sample) return result_samples def create_train_val_sample(data, training_indices): """ Creates training and validation samples from the provided data. Args: data (list): A list of data items, each to be processed. training_indices (list): A list of indices used for training. Returns: list: A list of training and validation samples created from the data. """ # Initialize the result list result_samples = [] # Process each item in the data for data_item in data: # Create pair samples for the current item sub_samples = create_pair_sample(data_item, training_indices) # Extend the result list with the created sub-samples result_samples.extend(sub_samples) return result_samples def create_test_sample(data, training_indices, testing_indices): """ Creates test samples from the provided data by comparing human data with machine-generated data. Args: data (list): A list of data items, each to be processed. training_indices (list): A list of indices used for training. testing_indices (list): A list of indices used for testing. Returns: list: A list of test samples created from the data. """ # Initialize the result list result_samples = [] # Process each item in the data for data_item in data: # Create pair test samples for the current item sub_samples = create_pair_test_sample( data_item, training_indices, testing_indices, ) # Extend the result list with the created sub-samples result_samples.extend(sub_samples) return result_samples def distribute_data(data, train_indices, test_indices, train_ratio, val_ratio): """ Distributes the data into training, validation, and test samples. Args: data (list): A list of data items to be split and processed. train_indices (list): A list of indices used for training. test_indices (list): A list of indices used for testing. train_ratio (float): The ratio of data to be used for training. val_ratio (float): The ratio of data to be used for validation. Returns: tuple: A tuple containing lists of training, validation, and test samples. """ # Split the data into training, validation, and test sets train_data, val_data, test_data = split_train_val_test( data, train_ratio, val_ratio, ) # Create training samples train_samples = create_train_val_sample(train_data, train_indices) write_to_file(OUTPUT_FILE, f"train samples = {len(train_samples)}\n") # Create validation samples val_samples = create_train_val_sample(val_data, train_indices) write_to_file(OUTPUT_FILE, f"val samples = {len(val_samples)}\n") # Create test samples test_samples = create_test_sample(test_data, train_indices, test_indices) write_to_file(OUTPUT_FILE, f"test samples = {len(test_samples)}\n") return train_samples, val_samples, test_samples def convert_to_huggingface_with_multimodel(samples): """ Converts a list of samples to the Hugging Face Dataset format. Args: samples (list): A list of samples to be converted. Returns: Dataset: A Hugging Face Dataset object created from the samples. """ return Dataset.from_list(samples) def train_by_transformer_with_multimodel_and_early_stop( train_samples, val_samples, input_type, ): """ Trains a transformer model with multimodal data and early stopping. Args: train_samples (list): A list of training samples. val_samples (list): A list of validation samples. input_type (str): The type of input data (e.g., multimodal). Returns: object: The trained model with early stopping. """ # Convert training and validation samples to Hugging Face Dataset format train_data = convert_to_huggingface_with_multimodel(train_samples) val_data = convert_to_huggingface_with_multimodel(val_samples) # Train the model with early stopping and return the trained model return train_only_by_transformer_with_test_evaluation_early_stop( train_data, val_data, input_type, ) def test_by_transformer_with_multimodel(detector, test_samples, input_type): """ Tests a trained transformer model with multimodal data. Args: detector (object): The trained model to be evaluated. test_samples (list): A list of test samples. input_type (str): The type of input data (e.g., multimodal). Returns: None """ # Convert test samples to Hugging Face Dataset format test_data = convert_to_huggingface_with_multimodel(test_samples) # Apply the appropriate preprocessing function based on the input type if input_type == MULTIMODEL: test_data = test_data.map(preprocess_function_multimodel, batched=True) elif input_type == SINGLE_FROM_MULTIMODEL: test_data = test_data.map( preprocess_function_single_from_multimodel, batched=True, ) # Evaluate the model on the test data result = detector.evaluate(eval_dataset=test_data) # Extract and log the ROC AUC score roc_auc = result["eval_roc_auc"] write_to_file(OUTPUT_FILE, "roc_auc: %.1f%%" % (roc_auc * 100.0) + "\n") def extract_by_feature_kind(samples, feature_type): """ Extracts features from the given samples based on the specified feature type. Args: samples (list): A list of samples where each sample is a dictionary with 'text' and 'label' keys. feature_type (str): The type of feature to extract. Returns: tuple: A tuple containing the extracted features and corresponding labels. """ text_1_list = [] text_2_list = [] labels = [] for sample in samples: text_1_list.append(sample["text"][0]) text_2_list.append(sample["text"][1]) labels.append(sample["label"]) # Extract features in batch based on the feature type features = extract_feature_in_batch(text_1_list, text_2_list, feature_type) return features, labels def train_by_feature_kind(train_samples, feature_type): """ Trains a model using features extracted from the training samples based on the specified feature type. Args: train_samples (list): A list of training samples where each sample is a dictionary with 'text' and 'label' keys. feature_type (str): The type of feature to extract for training. Returns: object: The trained model. """ # Extract features and labels from the training samples features, labels = extract_by_feature_kind(train_samples, feature_type) # Convert features to a numpy array and reshape for training features = np.array(features) features = features.reshape(-1, 1) # Train the model using the extracted features and labels model = abstract_train(features, labels) return model def test_by_feature_kind(detector, samples, feature_type): """ Tests a detector using features extracted from the provided samples based on the specified feature type. Args: detector (object): The detector model to be evaluated. samples (list): A list of samples where each sample is a dictionary with 'text' and 'label' keys. feature_type (str): The type of feature to extract for testing. Returns: None """ # Extract features and labels from the samples features, labels = extract_by_feature_kind(samples, feature_type) # Convert features to a numpy array and reshape for evaluation features = np.array(features) features = features.reshape(-1, 1) # Evaluate the detector model using the extracted features and labels evaluate_model(detector, features, labels) def general_process_multimodels_train_val_test( train_samples, val_samples, test_samples, ): """ General process for training, validating, and testing models using multi-model and feature kind approaches. Args: train_samples (list): Training samples. val_samples (list): Validation samples. test_samples (list): Test samples. Returns: None """ # Multi-model approach input_kind = MULTIMODEL write_to_file(OUTPUT_FILE, "\nInput kind = {input_kind} \n") # Train detector using multi-model with early stopping detector = train_by_transformer_with_multimodel_and_early_stop( train_samples, val_samples, input_kind, ) # Evaluate on train set write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n") test_by_transformer_with_multimodel(detector, train_samples, input_kind) # Evaluate on validation set write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n") test_by_transformer_with_multimodel(detector, val_samples, input_kind) # Evaluate on test set write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n") test_by_transformer_with_multimodel(detector, test_samples, input_kind) # Single from multi-model approach input_kind = SINGLE_FROM_MULTIMODEL write_to_file(OUTPUT_FILE, "\nInput kind = {input_kind} \n") # Train detector using single from multi-model with early stopping detector = train_by_transformer_with_multimodel_and_early_stop( train_samples, val_samples, input_kind, ) # Evaluate on train set write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n") test_by_transformer_with_multimodel(detector, train_samples, input_kind) # Evaluate on validation set write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n") test_by_transformer_with_multimodel(detector, val_samples, input_kind) # Evaluate on test set write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n") test_by_transformer_with_multimodel(detector, test_samples, input_kind) # Feature kind approach sample_length = len(train_samples[0]["text"]) if ( sample_length == 2 ): # Check if the sample length is 2, indicating BART feature kind feature_kind = BART write_to_file(OUTPUT_FILE, "\nFeature kind = {feature_kind} \n") # Train detector using feature kind detector = train_by_feature_kind(train_samples, feature_kind) # Evaluate on train set write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n") test_by_feature_kind(detector, train_samples, feature_kind) # Evaluate on validation set write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n") test_by_feature_kind(detector, val_samples, feature_kind) # Evaluate on test set write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n") test_by_feature_kind(detector, test_samples, feature_kind) def process_multi_models_with_validation( multimodel_csv_file, train_indices, test_indices, num_samples, ): """ Processes multi-model data with validation, training, and testing. Args: multimodel_csv_file (str): Path to the CSV file containing multi-model data. train_indices (list): Indices for the training data. test_indices (list): Indices for the testing data. num_samples (int): Number of samples to process. Returns: None """ # Log the details of the process write_to_file(OUTPUT_FILE, f"PROCESSING FILE={multimodel_csv_file} \n") write_to_file(OUTPUT_FILE, f"EXPERIMENT WITH {MODEL_NAME} model \n") write_to_file( OUTPUT_FILE, f"NUMBER OF MAX EPOCHS WITH EARLY STOPPING =\ {NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING} \n", ) write_to_file(OUTPUT_FILE, f"PATIENCE = {PATIENCE} \n") write_to_file(OUTPUT_FILE, f"OPTIMIZED METRIC = {OPTIMIZED_METRIC} \n") write_to_file(OUTPUT_FILE, f"BATCH SIZE = {BATCH_SIZE} \n") write_to_file(OUTPUT_FILE, f"Number of samples = {num_samples} \n") # Read multi-model data from the CSV file data = parse_multimodal_data(multimodel_csv_file) # Limit data to the specified number of samples data = data[:num_samples] # Distribute data into training, validation, and testing sets train_samples, val_samples, test_samples = distribute_data( data, train_indices, test_indices, TRAIN_RATIO, VAL_RATIO, ) # Log the training and testing indices write_to_file( OUTPUT_FILE, f"Multimodel training with train indices {train_indices},\ test with test indices {test_indices} \n", ) # Process the multi-models for training, validation, and testing general_process_multimodels_train_val_test( train_samples, val_samples, test_samples, ) def split_train_val_test(data, train_ratio, val_ratio): """ Splits the dataset into training, validation, and test sets based on specified ratios. Args: data (list): The dataset to be split. train_ratio (float): The ratio of the dataset to be used for training. val_ratio (float): The ratio of the dataset to be used for validation. Returns: tuple: A tuple containing three lists (train_data, val_data, test_data). """ # Calculate the number of samples for the training set num_train_samples = int(len(data) * train_ratio) # Calculate the number of samples for the validation set num_val_samples = int(len(data) * val_ratio) # Split the data into training, validation, and test sets train_data = data[:num_train_samples] val_data = data[num_train_samples : (num_train_samples + num_val_samples)] test_data = data[(num_train_samples + num_val_samples) :] return train_data, val_data, test_data