Spaces:

pmkhanh7890
/

news_verification

Sleeping

File size: 26,343 Bytes

22e1b62

import os
import shutil
from copy import deepcopy

import numpy as np
from config import (
    BART,
    BATCH_SIZE,
    HUMAN_LABEL,
    LEARNING_RATES,
    MACHINE_LABEL,
    MODEL_NAME,
    MULTIMODEL,
    NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING,
    OPTIMIZED_METRIC,
    PATIENCE,
    ROBERTA_MODEL_PATHS,
    SINGLE_FROM_MULTIMODEL,
    TRAIN_RATIO,
    VAL_RATIO,
    tokenizer,
)
from datasets import Dataset
from sklearn.base import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)

from texts.bart_score import (
    bart_score_in_batch,
    extract_feature_in_batch,
)
from texts.config import OUTPUT_FILE
from texts.evaluation import compute_metrics
from texts.utils import (
    check_error,
    combine_text_with_BERT_format,
    parse_multimodal_data,
    write_to_file,
)


class TextDetector:
    def __init__(self) -> None:
        self.model = None
        self.multimodel = None
        self.train_data = None
        self.val_data = None
        self.test_data = None
        self.train_features = None
        self.val_features = None
        self.test_features

    def text_analysis(text: str) -> float:
        score = 0.0
        return score


class CustomCallback(TrainerCallback):
    """
    Custom callback to evaluate the training dataset at the end of each epoch.
    """

    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """
        At the end of each epoch, evaluate the training dataset.
        """
        if control.should_evaluate:
            control_copy = deepcopy(control)
            self._trainer.evaluate(
                eval_dataset=self._trainer.train_dataset,
                metric_key_prefix="train",
            )
            return control_copy


def abstract_train(features, labels):
    """
    Trains a model using the given features and labels.

    Args:
        features (list): The input features for training.
        labels (list): The target labels for training.

    Returns:
        object: The trained model.
    """
    model = MLPClassifier()
    model.fit(features, labels)
    return model


def evaluate_model(model, features, labels):
    """
    Evaluates the model's performance using accuracy and ROC AUC scores.

    Args:
        model (object): The trained model to evaluate.
        features (list): The input features for evaluation.
        labels (list): The target labels for evaluation.

    Returns:
        None
    """
    predictions = model.predict(features)
    rounded_predictions = [round(value) for value in predictions]

    accuracy = accuracy_score(labels, rounded_predictions)
    write_to_file(OUTPUT_FILE, f"Accuracy: {accuracy * 100.0:.1f}%\n")

    roc_auc = roc_auc_score(labels, rounded_predictions)
    write_to_file(OUTPUT_FILE, f"ROC AUC: {roc_auc * 100.0:.1f}%\n")


def preprocess_function_multimodel(sample):
    """
    Preprocesses a given sample for a multi-model setup by calculating
        BART scores and formatting the text for BERT input.

    Args:
        sample (dict): A dictionary containing a key "text", which is a list of
            lists of strings.

    Returns:
        dict: A dictionary containing tokenized and preprocessed text data.
    """
    num_texts = len(sample["text"][0])  # Number of texts in each sub-sample
    texts_grouped_by_index = [
        [] for _ in range(num_texts)
    ]  # Initialize empty lists for grouping texts by index

    # Group texts by their index across sub-samples
    for sub_sample in sample["text"]:
        for i in range(num_texts):
            texts_grouped_by_index[i].append(sub_sample[i])

    # Calculate BART scores for each text pair (text[0] with text[i])
    bart_scores = [
        bart_score_in_batch(
            texts_grouped_by_index[0],
            texts_grouped_by_index[i],
        )
        for i in range(1, num_texts)
    ]

    combined_texts = []

    # Process each sub-sample for BERT input
    for index, sub_sample in enumerate(sample["text"]):
        text_array = [sub_sample[0]]  # Start with the input text
        score_generation_pairs = []

        # Pair scores with their corresponding generations
        for i in range(1, num_texts):
            generation_text = sub_sample[i]
            generation_score = bart_scores[i - 1][index]
            score_generation_pairs.append((generation_score, generation_text))

        # Sort pairs by score in descending order
        sorted_pairs = sorted(score_generation_pairs, reverse=True)

        # Append sorted texts to text_array
        for _, sorted_text in sorted_pairs:
            text_array.append(sorted_text)

        # Combine texts into a single BERT-formatted string
        combined_text = combine_text_with_BERT_format(text_array)
        combined_texts.append(combined_text)

    # Tokenize the combined texts for BERT
    return tokenizer(combined_texts, add_special_tokens=False, truncation=True)


def preprocess_function_single_from_multimodel(sample):
    """
    Extracts the first text from each sub-sample in a multi-model sample and
        tokenizes it.

    Args:
        sample (dict): A dictionary containing a key "text", which is a list of
            lists of strings.

    Returns:
        dict: A dictionary containing tokenized text data.
    """
    combined_texts = []

    # Iterate through each sub-sample
    for sub_sample in sample["text"]:
        input_text = sub_sample[
            0
        ]  # Extract the first text from the sub-sample
        combined_texts.append(
            input_text,
        )  # Append it to the list of combined texts

    # Tokenize the combined texts
    return tokenizer(combined_texts, truncation=True)


def train_only_by_transformer_with_test_evaluation_early_stop(
    train_data,
    test_data,
    input_type,
    num_classes=2,
):
    """
    Trains a transformer model using the provided training and testing
        datasets with early stopping.

    Args:
        train_data (Dataset): The training dataset.
        test_data (Dataset): The testing dataset.
        input_type (str): The type of input data, either MULTIMODEL or
            SINGLE_FROM_MULTIMODEL.
        num_classes (int, optional): The number of classes for classification.
            Defaults to 2.

    Returns:
        Trainer: The trained model wrapped in a Trainer object.
    """
    # Preprocess datasets based on the input type
    if input_type == MULTIMODEL:
        train_data = train_data.map(
            preprocess_function_multimodel,
            batched=True,
        )
        test_data = test_data.map(preprocess_function_multimodel, batched=True)
    elif input_type == SINGLE_FROM_MULTIMODEL:
        train_data = train_data.map(
            preprocess_function_single_from_multimodel,
            batched=True,
        )
        test_data = test_data.map(
            preprocess_function_single_from_multimodel,
            batched=True,
        )

    # Data collator to pad inputs
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Load appropriate model based on number of classes
    if num_classes == 3:
        model = AutoModelForSequenceClassification.from_pretrained(
            "pretrained_model/roberta-base_num_labels_3",
            num_labels=num_classes,
        )
    else:
        model = AutoModelForSequenceClassification.from_pretrained(
            ROBERTA_MODEL_PATHS[MODEL_NAME],
            num_labels=num_classes,
        )

    learning_rate = LEARNING_RATES[MODEL_NAME]
    output_folder = "training_with_callbacks"

    # Remove the output folder if it already exists
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_folder,
        evaluation_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING,
        weight_decay=0.01,
        push_to_hub=False,
        metric_for_best_model=OPTIMIZED_METRIC,
        load_best_model_at_end=True,
    )

    # Create Trainer object
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)],
    )

    # Add custom callback
    trainer.add_callback(CustomCallback(trainer))

    # Start training
    trainer.train()

    return trainer


def create_pair_sample(data_item, training_indices):
    """
    Creates pair samples for training by comparing human data with
        machine-generated data.

    Args:
        data_item (dict): A dictionary containing 'human', 'single',
            and 'pair' data.
        training_indices (list): A list of indices used for training.

    Returns:
        list: A list of dictionaries, each containing a 'text' array
            and a 'label'.
    """
    # Initialize the result list
    result_samples = []

    # Check if there is any error in the data_item
    if check_error(data_item):
        return result_samples

    # Create machine samples
    for train_idx in training_indices:
        if data_item["human"] != data_item["single"][train_idx]:
            text_array = []
            machine_text = data_item["single"][train_idx]
            text_array.append(machine_text)

            for sub_idx in training_indices:
                text_array.append(data_item["pair"][train_idx][sub_idx])

            sample = {
                "text": text_array,
                "label": MACHINE_LABEL,
            }
            result_samples.append(sample)

    # Create human samples
    text_array = [data_item["human"]]

    for train_idx in training_indices:
        text_array.append(data_item["single"][train_idx])

    human_sample = {
        "text": text_array,
        "label": HUMAN_LABEL,
    }

    # Append human samples for each machine sample
    num_machine_samples = len(result_samples)
    for _ in range(num_machine_samples):
        result_samples.append(human_sample)

    return result_samples


def create_pair_test_sample(data_item, training_indices, testing_indices):
    """
    Creates pair test samples by comparing human data with
        machine-generated data.

    Args:
        data_item (dict): A dictionary containing 'human', 'single', and
            'pair' data.
        training_indices (list): A list of indices used for training.
        testing_indices (list): A list of indices used for testing.

    Returns:
        list: A list of dictionaries, each containing a 'text' array and a
            'label'.
    """
    # Initialize the result list
    result_samples = []

    # Check if there is any error in the data_item
    if check_error(data_item):
        return result_samples

    # Create machine samples based on testing indices
    for test_idx in testing_indices:
        if data_item["human"] != data_item["single"][test_idx]:
            text_array = []
            machine_text = data_item["single"][test_idx]
            text_array.append(machine_text)

            for train_idx in training_indices:
                text_array.append(data_item["pair"][test_idx][train_idx])

            sample = {
                "text": text_array,
                "label": MACHINE_LABEL,
            }
            result_samples.append(sample)

    # Create human sample
    text_array = [data_item["human"]]

    for train_idx in training_indices:
        text_array.append(data_item["single"][train_idx])

    human_sample = {
        "text": text_array,
        "label": HUMAN_LABEL,
    }

    # Append the human sample for each machine sample
    num_machine_samples = len(result_samples)
    for _ in range(num_machine_samples):
        result_samples.append(human_sample)

    return result_samples


def create_train_val_sample(data, training_indices):
    """
    Creates training and validation samples from the provided data.

    Args:
        data (list): A list of data items, each to be processed.
        training_indices (list): A list of indices used for training.

    Returns:
        list: A list of training and validation samples created from the data.
    """
    # Initialize the result list
    result_samples = []

    # Process each item in the data
    for data_item in data:
        # Create pair samples for the current item
        sub_samples = create_pair_sample(data_item, training_indices)

        # Extend the result list with the created sub-samples
        result_samples.extend(sub_samples)

    return result_samples


def create_test_sample(data, training_indices, testing_indices):
    """
    Creates test samples from the provided data by comparing human data with
        machine-generated data.

    Args:
        data (list): A list of data items, each to be processed.
        training_indices (list): A list of indices used for training.
        testing_indices (list): A list of indices used for testing.

    Returns:
        list: A list of test samples created from the data.
    """
    # Initialize the result list
    result_samples = []

    # Process each item in the data
    for data_item in data:
        # Create pair test samples for the current item
        sub_samples = create_pair_test_sample(
            data_item,
            training_indices,
            testing_indices,
        )

        # Extend the result list with the created sub-samples
        result_samples.extend(sub_samples)

    return result_samples


def distribute_data(data, train_indices, test_indices, train_ratio, val_ratio):
    """
    Distributes the data into training, validation, and test samples.

    Args:
        data (list): A list of data items to be split and processed.
        train_indices (list): A list of indices used for training.
        test_indices (list): A list of indices used for testing.
        train_ratio (float): The ratio of data to be used for training.
        val_ratio (float): The ratio of data to be used for validation.

    Returns:
        tuple: A tuple containing lists of training, validation,
            and test samples.
    """
    # Split the data into training, validation, and test sets
    train_data, val_data, test_data = split_train_val_test(
        data,
        train_ratio,
        val_ratio,
    )

    # Create training samples
    train_samples = create_train_val_sample(train_data, train_indices)
    write_to_file(OUTPUT_FILE, f"train samples = {len(train_samples)}\n")

    # Create validation samples
    val_samples = create_train_val_sample(val_data, train_indices)
    write_to_file(OUTPUT_FILE, f"val samples = {len(val_samples)}\n")

    # Create test samples
    test_samples = create_test_sample(test_data, train_indices, test_indices)
    write_to_file(OUTPUT_FILE, f"test samples = {len(test_samples)}\n")

    return train_samples, val_samples, test_samples


def convert_to_huggingface_with_multimodel(samples):
    """
    Converts a list of samples to the Hugging Face Dataset format.

    Args:
        samples (list): A list of samples to be converted.

    Returns:
        Dataset: A Hugging Face Dataset object created from the samples.
    """
    return Dataset.from_list(samples)


def train_by_transformer_with_multimodel_and_early_stop(
    train_samples,
    val_samples,
    input_type,
):
    """
    Trains a transformer model with multimodal data and early stopping.

    Args:
        train_samples (list): A list of training samples.
        val_samples (list): A list of validation samples.
        input_type (str): The type of input data (e.g., multimodal).

    Returns:
        object: The trained model with early stopping.
    """
    # Convert training and validation samples to Hugging Face Dataset format
    train_data = convert_to_huggingface_with_multimodel(train_samples)
    val_data = convert_to_huggingface_with_multimodel(val_samples)

    # Train the model with early stopping and return the trained model
    return train_only_by_transformer_with_test_evaluation_early_stop(
        train_data,
        val_data,
        input_type,
    )


def test_by_transformer_with_multimodel(detector, test_samples, input_type):
    """
    Tests a trained transformer model with multimodal data.

    Args:
        detector (object): The trained model to be evaluated.
        test_samples (list): A list of test samples.
        input_type (str): The type of input data (e.g., multimodal).

    Returns:
        None
    """
    # Convert test samples to Hugging Face Dataset format
    test_data = convert_to_huggingface_with_multimodel(test_samples)

    # Apply the appropriate preprocessing function based on the input type
    if input_type == MULTIMODEL:
        test_data = test_data.map(preprocess_function_multimodel, batched=True)
    elif input_type == SINGLE_FROM_MULTIMODEL:
        test_data = test_data.map(
            preprocess_function_single_from_multimodel,
            batched=True,
        )

    # Evaluate the model on the test data
    result = detector.evaluate(eval_dataset=test_data)

    # Extract and log the ROC AUC score
    roc_auc = result["eval_roc_auc"]
    write_to_file(OUTPUT_FILE, "roc_auc: %.1f%%" % (roc_auc * 100.0) + "\n")


def extract_by_feature_kind(samples, feature_type):
    """
    Extracts features from the given samples based on the specified feature
        type.

    Args:
        samples (list): A list of samples where each sample is a dictionary
            with 'text' and 'label' keys.
        feature_type (str): The type of feature to extract.

    Returns:
        tuple: A tuple containing the extracted features and corresponding
            labels.
    """
    text_1_list = []
    text_2_list = []
    labels = []

    for sample in samples:
        text_1_list.append(sample["text"][0])
        text_2_list.append(sample["text"][1])
        labels.append(sample["label"])

    # Extract features in batch based on the feature type
    features = extract_feature_in_batch(text_1_list, text_2_list, feature_type)

    return features, labels


def train_by_feature_kind(train_samples, feature_type):
    """
    Trains a model using features extracted from the training samples based on
        the specified feature type.

    Args:
        train_samples (list): A list of training samples where each sample is
            a dictionary with 'text' and 'label' keys.
        feature_type (str): The type of feature to extract for training.

    Returns:
        object: The trained model.
    """
    # Extract features and labels from the training samples
    features, labels = extract_by_feature_kind(train_samples, feature_type)

    # Convert features to a numpy array and reshape for training
    features = np.array(features)
    features = features.reshape(-1, 1)

    # Train the model using the extracted features and labels
    model = abstract_train(features, labels)

    return model


def test_by_feature_kind(detector, samples, feature_type):
    """
    Tests a detector using features extracted from the provided samples based
        on the specified feature type.

    Args:
        detector (object): The detector model to be evaluated.
        samples (list): A list of samples where each sample is a dictionary
            with 'text' and 'label' keys.
        feature_type (str): The type of feature to extract for testing.

    Returns:
        None
    """
    # Extract features and labels from the samples
    features, labels = extract_by_feature_kind(samples, feature_type)

    # Convert features to a numpy array and reshape for evaluation
    features = np.array(features)
    features = features.reshape(-1, 1)

    # Evaluate the detector model using the extracted features and labels
    evaluate_model(detector, features, labels)


def general_process_multimodels_train_val_test(
    train_samples,
    val_samples,
    test_samples,
):
    """
    General process for training, validating, and testing models using
        multi-model and feature kind approaches.

    Args:
        train_samples (list): Training samples.
        val_samples (list): Validation samples.
        test_samples (list): Test samples.

    Returns:
        None
    """
    # Multi-model approach
    input_kind = MULTIMODEL
    write_to_file(OUTPUT_FILE, "\nInput kind = {input_kind} \n")

    # Train detector using multi-model with early stopping
    detector = train_by_transformer_with_multimodel_and_early_stop(
        train_samples,
        val_samples,
        input_kind,
    )

    # Evaluate on train set
    write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n")
    test_by_transformer_with_multimodel(detector, train_samples, input_kind)

    # Evaluate on validation set
    write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n")
    test_by_transformer_with_multimodel(detector, val_samples, input_kind)

    # Evaluate on test set
    write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n")
    test_by_transformer_with_multimodel(detector, test_samples, input_kind)

    # Single from multi-model approach
    input_kind = SINGLE_FROM_MULTIMODEL
    write_to_file(OUTPUT_FILE, "\nInput kind = {input_kind} \n")

    # Train detector using single from multi-model with early stopping
    detector = train_by_transformer_with_multimodel_and_early_stop(
        train_samples,
        val_samples,
        input_kind,
    )

    # Evaluate on train set
    write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n")
    test_by_transformer_with_multimodel(detector, train_samples, input_kind)

    # Evaluate on validation set
    write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n")
    test_by_transformer_with_multimodel(detector, val_samples, input_kind)

    # Evaluate on test set
    write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n")
    test_by_transformer_with_multimodel(detector, test_samples, input_kind)

    # Feature kind approach
    sample_length = len(train_samples[0]["text"])
    if (
        sample_length == 2
    ):  # Check if the sample length is 2, indicating BART feature kind
        feature_kind = BART
        write_to_file(OUTPUT_FILE, "\nFeature kind = {feature_kind} \n")

        # Train detector using feature kind
        detector = train_by_feature_kind(train_samples, feature_kind)

        # Evaluate on train set
        write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n")
        test_by_feature_kind(detector, train_samples, feature_kind)

        # Evaluate on validation set
        write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n")
        test_by_feature_kind(detector, val_samples, feature_kind)

        # Evaluate on test set
        write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n")
        test_by_feature_kind(detector, test_samples, feature_kind)


def process_multi_models_with_validation(
    multimodel_csv_file,
    train_indices,
    test_indices,
    num_samples,
):
    """
    Processes multi-model data with validation, training, and testing.

    Args:
        multimodel_csv_file (str): Path to the CSV file containing
            multi-model data.
        train_indices (list): Indices for the training data.
        test_indices (list): Indices for the testing data.
        num_samples (int): Number of samples to process.

    Returns:
        None
    """
    # Log the details of the process
    write_to_file(OUTPUT_FILE, f"PROCESSING FILE={multimodel_csv_file} \n")
    write_to_file(OUTPUT_FILE, f"EXPERIMENT WITH {MODEL_NAME} model \n")
    write_to_file(
        OUTPUT_FILE,
        f"NUMBER OF MAX EPOCHS WITH EARLY STOPPING =\
            {NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING} \n",
    )
    write_to_file(OUTPUT_FILE, f"PATIENCE = {PATIENCE} \n")
    write_to_file(OUTPUT_FILE, f"OPTIMIZED METRIC = {OPTIMIZED_METRIC} \n")
    write_to_file(OUTPUT_FILE, f"BATCH SIZE = {BATCH_SIZE} \n")
    write_to_file(OUTPUT_FILE, f"Number of samples = {num_samples} \n")

    # Read multi-model data from the CSV file
    data = parse_multimodal_data(multimodel_csv_file)

    # Limit data to the specified number of samples
    data = data[:num_samples]

    # Distribute data into training, validation, and testing sets
    train_samples, val_samples, test_samples = distribute_data(
        data,
        train_indices,
        test_indices,
        TRAIN_RATIO,
        VAL_RATIO,
    )

    # Log the training and testing indices
    write_to_file(
        OUTPUT_FILE,
        f"Multimodel training with train indices {train_indices},\
            test with test indices {test_indices} \n",
    )

    # Process the multi-models for training, validation, and testing
    general_process_multimodels_train_val_test(
        train_samples,
        val_samples,
        test_samples,
    )


def split_train_val_test(data, train_ratio, val_ratio):
    """
    Splits the dataset into training, validation, and test sets based on
        specified ratios.

    Args:
        data (list): The dataset to be split.
        train_ratio (float): The ratio of the dataset to be used for training.
        val_ratio (float): The ratio of the dataset to be used for validation.

    Returns:
        tuple: A tuple containing three lists
            (train_data, val_data, test_data).
    """
    # Calculate the number of samples for the training set
    num_train_samples = int(len(data) * train_ratio)

    # Calculate the number of samples for the validation set
    num_val_samples = int(len(data) * val_ratio)

    # Split the data into training, validation, and test sets
    train_data = data[:num_train_samples]
    val_data = data[num_train_samples : (num_train_samples + num_val_samples)]
    test_data = data[(num_train_samples + num_val_samples) :]

    return train_data, val_data, test_data