import os
import shutil
from copy import deepcopy
import numpy as np
from config import (
BART,
BATCH_SIZE,
HUMAN_LABEL,
LEARNING_RATES,
MACHINE_LABEL,
MODEL_NAME,
MULTIMODEL,
NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING,
OPTIMIZED_METRIC,
PATIENCE,
ROBERTA_MODEL_PATHS,
SINGLE_FROM_MULTIMODEL,
TRAIN_RATIO,
VAL_RATIO,
tokenizer,
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.neural_network import MLPClassifier
from transformers import (
AutoModelForSequenceClassification,
DataCollatorWithPadding,
EarlyStoppingCallback,
Trainer,
TrainerCallback,
TrainingArguments,
)
from texts.bart_score import (
bart_score_in_batch,
extract_feature_in_batch,
)
from texts.config import OUTPUT_FILE
from texts.evaluation import compute_metrics
from texts.utils import (
check_error,
combine_text_with_BERT_format,
parse_multimodal_data,
write_to_file,
)
class TextDetector:
def __init__(self) -> None:
self.model = None
self.multimodel = None
self.train_data = None
self.val_data = None
self.test_data = None
self.train_features = None
self.val_features = None
        self.test_features = None

    def text_analysis(self, text: str) -> float:
        """Placeholder scoring hook; always returns 0.0 for now."""
        score = 0.0
        return score
class CustomCallback(TrainerCallback):
"""
Custom callback to evaluate the training dataset at the end of each epoch.
"""
def __init__(self, trainer) -> None:
super().__init__()
self._trainer = trainer
def on_epoch_end(self, args, state, control, **kwargs):
"""
At the end of each epoch, evaluate the training dataset.
"""
if control.should_evaluate:
control_copy = deepcopy(control)
self._trainer.evaluate(
eval_dataset=self._trainer.train_dataset,
metric_key_prefix="train",
)
return control_copy
def abstract_train(features, labels):
"""
Trains a model using the given features and labels.
Args:
features (list): The input features for training.
labels (list): The target labels for training.
Returns:
object: The trained model.
"""
model = MLPClassifier()
model.fit(features, labels)
return model
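# Usage sketch (hypothetical toy data): abstract_train expects a 2-D
# array-like of features; elsewhere in this module a single BARTScore value
# per sample is reshaped to (-1, 1) before fitting.
#
#     clf = abstract_train([[0.1], [0.9], [0.2], [0.8]], [0, 1, 0, 1])
#     clf.predict([[0.85]])  # e.g. array([1])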
def evaluate_model(model, features, labels):
"""
Evaluates the model's performance using accuracy and ROC AUC scores.
Args:
model (object): The trained model to evaluate.
features (list): The input features for evaluation.
labels (list): The target labels for evaluation.
Returns:
None
"""
    # MLPClassifier.predict returns class labels, so rounding is effectively
    # a no-op kept for safety with regressor-style outputs.
    predictions = model.predict(features)
    rounded_predictions = [round(value) for value in predictions]
    accuracy = accuracy_score(labels, rounded_predictions)
    write_to_file(OUTPUT_FILE, f"Accuracy: {accuracy * 100.0:.1f}%\n")
    # Note: ROC AUC is computed on hard label predictions here; scores from
    # predict_proba would give a finer-grained curve.
    roc_auc = roc_auc_score(labels, rounded_predictions)
    write_to_file(OUTPUT_FILE, f"ROC AUC: {roc_auc * 100.0:.1f}%\n")
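# Usage sketch (hypothetical values): evaluates a trained classifier on
# held-out features; metrics are appended to OUTPUT_FILE, nothing returned.
#
#     evaluate_model(clf, [[0.15], [0.75]], [0, 1])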
def preprocess_function_multimodel(sample):
"""
Preprocesses a given sample for a multi-model setup by calculating
BART scores and formatting the text for BERT input.
Args:
sample (dict): A dictionary containing a key "text", which is a list of
lists of strings.
Returns:
dict: A dictionary containing tokenized and preprocessed text data.
"""
num_texts = len(sample["text"][0]) # Number of texts in each sub-sample
texts_grouped_by_index = [
[] for _ in range(num_texts)
] # Initialize empty lists for grouping texts by index
# Group texts by their index across sub-samples
for sub_sample in sample["text"]:
for i in range(num_texts):
texts_grouped_by_index[i].append(sub_sample[i])
# Calculate BART scores for each text pair (text[0] with text[i])
bart_scores = [
bart_score_in_batch(
texts_grouped_by_index[0],
texts_grouped_by_index[i],
)
for i in range(1, num_texts)
]
combined_texts = []
# Process each sub-sample for BERT input
for index, sub_sample in enumerate(sample["text"]):
text_array = [sub_sample[0]] # Start with the input text
score_generation_pairs = []
# Pair scores with their corresponding generations
for i in range(1, num_texts):
generation_text = sub_sample[i]
generation_score = bart_scores[i - 1][index]
score_generation_pairs.append((generation_score, generation_text))
        # Sort pairs by score in descending order (sort on the score only,
        # to avoid tie-breaking on the generation text)
        sorted_pairs = sorted(
            score_generation_pairs,
            key=lambda pair: pair[0],
            reverse=True,
        )
# Append sorted texts to text_array
for _, sorted_text in sorted_pairs:
text_array.append(sorted_text)
# Combine texts into a single BERT-formatted string
combined_text = combine_text_with_BERT_format(text_array)
combined_texts.append(combined_text)
# Tokenize the combined texts for BERT
return tokenizer(combined_texts, add_special_tokens=False, truncation=True)
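# Shape sketch (hypothetical strings): for sample = {"text": [[t0, g1, g2]]},
# BART scores are computed in batch for (t0, g1) and (t0, g2), the
# generations are re-ordered by descending score, and each row is collapsed
# into one BERT-formatted string before tokenization:
#
#     preprocess_function_multimodel({"text": [["input", "gen_a", "gen_b"]]})
#     # -> {"input_ids": [...], "attention_mask": [...]}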
def preprocess_function_single_from_multimodel(sample):
"""
Extracts the first text from each sub-sample in a multi-model sample and
tokenizes it.
Args:
sample (dict): A dictionary containing a key "text", which is a list of
lists of strings.
Returns:
dict: A dictionary containing tokenized text data.
"""
combined_texts = []
# Iterate through each sub-sample
for sub_sample in sample["text"]:
        # Take the first (input) text from the sub-sample and collect it
        input_text = sub_sample[0]
        combined_texts.append(input_text)
# Tokenize the combined texts
return tokenizer(combined_texts, truncation=True)
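# Sketch: only the first (input) text of each row survives, so
# {"text": [["input", "gen_a", "gen_b"]]} tokenizes just "input".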
def train_only_by_transformer_with_test_evaluation_early_stop(
train_data,
test_data,
input_type,
num_classes=2,
):
"""
Trains a transformer model using the provided training and testing
datasets with early stopping.
Args:
train_data (Dataset): The training dataset.
test_data (Dataset): The testing dataset.
input_type (str): The type of input data, either MULTIMODEL or
SINGLE_FROM_MULTIMODEL.
num_classes (int, optional): The number of classes for classification.
Defaults to 2.
Returns:
Trainer: The trained model wrapped in a Trainer object.
"""
# Preprocess datasets based on the input type
if input_type == MULTIMODEL:
train_data = train_data.map(
preprocess_function_multimodel,
batched=True,
)
test_data = test_data.map(preprocess_function_multimodel, batched=True)
elif input_type == SINGLE_FROM_MULTIMODEL:
train_data = train_data.map(
preprocess_function_single_from_multimodel,
batched=True,
)
test_data = test_data.map(
preprocess_function_single_from_multimodel,
batched=True,
)
# Data collator to pad inputs
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Load appropriate model based on number of classes
if num_classes == 3:
model = AutoModelForSequenceClassification.from_pretrained(
"pretrained_model/roberta-base_num_labels_3",
num_labels=num_classes,
)
else:
model = AutoModelForSequenceClassification.from_pretrained(
ROBERTA_MODEL_PATHS[MODEL_NAME],
num_labels=num_classes,
)
learning_rate = LEARNING_RATES[MODEL_NAME]
output_folder = "training_with_callbacks"
# Remove the output folder if it already exists
if os.path.exists(output_folder):
shutil.rmtree(output_folder)
# Training arguments
training_args = TrainingArguments(
output_dir=output_folder,
evaluation_strategy="epoch",
logging_strategy="epoch",
save_strategy="epoch",
learning_rate=learning_rate,
per_device_train_batch_size=BATCH_SIZE,
per_device_eval_batch_size=BATCH_SIZE,
num_train_epochs=NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING,
weight_decay=0.01,
push_to_hub=False,
metric_for_best_model=OPTIMIZED_METRIC,
load_best_model_at_end=True,
)
# Create Trainer object
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_data,
eval_dataset=test_data,
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)],
)
# Add custom callback
trainer.add_callback(CustomCallback(trainer))
# Start training
trainer.train()
return trainer
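# Usage sketch (assuming train_data/val_data are Hugging Face Datasets with
# "text" and "label" columns, as produced by
# convert_to_huggingface_with_multimodel below):
#
#     trainer = train_only_by_transformer_with_test_evaluation_early_stop(
#         train_data,
#         val_data,
#         MULTIMODEL,
#     )
#     trainer.evaluate()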
def create_pair_sample(data_item, training_indices):
"""
Creates pair samples for training by comparing human data with
machine-generated data.
Args:
data_item (dict): A dictionary containing 'human', 'single',
and 'pair' data.
training_indices (list): A list of indices used for training.
Returns:
list: A list of dictionaries, each containing a 'text' array
and a 'label'.
"""
# Initialize the result list
result_samples = []
# Check if there is any error in the data_item
if check_error(data_item):
return result_samples
# Create machine samples
for train_idx in training_indices:
if data_item["human"] != data_item["single"][train_idx]:
text_array = []
machine_text = data_item["single"][train_idx]
text_array.append(machine_text)
for sub_idx in training_indices:
text_array.append(data_item["pair"][train_idx][sub_idx])
sample = {
"text": text_array,
"label": MACHINE_LABEL,
}
result_samples.append(sample)
# Create human samples
text_array = [data_item["human"]]
for train_idx in training_indices:
text_array.append(data_item["single"][train_idx])
human_sample = {
"text": text_array,
"label": HUMAN_LABEL,
}
# Append human samples for each machine sample
num_machine_samples = len(result_samples)
for _ in range(num_machine_samples):
result_samples.append(human_sample)
return result_samples
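# Data-shape sketch (hypothetical strings): with training_indices = [0, 1]
# and
#
#     data_item = {
#         "human": "h",
#         "single": ["m0", "m1"],
#         "pair": [["p00", "p01"],
#                  ["p10", "p11"]],   # pair[i][j] as indexed by the code
#     }
#
# each machine sample is ["m_i", pair[i][0], pair[i][1]] with MACHINE_LABEL,
# and the human sample ["h", "m0", "m1"] with HUMAN_LABEL is appended once
# per machine sample to keep the two classes balanced.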
def create_pair_test_sample(data_item, training_indices, testing_indices):
"""
Creates pair test samples by comparing human data with
machine-generated data.
Args:
data_item (dict): A dictionary containing 'human', 'single', and
'pair' data.
training_indices (list): A list of indices used for training.
testing_indices (list): A list of indices used for testing.
Returns:
list: A list of dictionaries, each containing a 'text' array and a
'label'.
"""
# Initialize the result list
result_samples = []
# Check if there is any error in the data_item
if check_error(data_item):
return result_samples
# Create machine samples based on testing indices
for test_idx in testing_indices:
if data_item["human"] != data_item["single"][test_idx]:
text_array = []
machine_text = data_item["single"][test_idx]
text_array.append(machine_text)
for train_idx in training_indices:
text_array.append(data_item["pair"][test_idx][train_idx])
sample = {
"text": text_array,
"label": MACHINE_LABEL,
}
result_samples.append(sample)
# Create human sample
text_array = [data_item["human"]]
for train_idx in training_indices:
text_array.append(data_item["single"][train_idx])
human_sample = {
"text": text_array,
"label": HUMAN_LABEL,
}
# Append the human sample for each machine sample
num_machine_samples = len(result_samples)
for _ in range(num_machine_samples):
result_samples.append(human_sample)
return result_samples
def create_train_val_sample(data, training_indices):
"""
Creates training and validation samples from the provided data.
Args:
data (list): A list of data items, each to be processed.
training_indices (list): A list of indices used for training.
Returns:
list: A list of training and validation samples created from the data.
"""
# Initialize the result list
result_samples = []
# Process each item in the data
for data_item in data:
# Create pair samples for the current item
sub_samples = create_pair_sample(data_item, training_indices)
# Extend the result list with the created sub-samples
result_samples.extend(sub_samples)
return result_samples
def create_test_sample(data, training_indices, testing_indices):
"""
Creates test samples from the provided data by comparing human data with
machine-generated data.
Args:
data (list): A list of data items, each to be processed.
training_indices (list): A list of indices used for training.
testing_indices (list): A list of indices used for testing.
Returns:
list: A list of test samples created from the data.
"""
# Initialize the result list
result_samples = []
# Process each item in the data
for data_item in data:
# Create pair test samples for the current item
sub_samples = create_pair_test_sample(
data_item,
training_indices,
testing_indices,
)
# Extend the result list with the created sub-samples
result_samples.extend(sub_samples)
return result_samples
def distribute_data(data, train_indices, test_indices, train_ratio, val_ratio):
"""
Distributes the data into training, validation, and test samples.
Args:
data (list): A list of data items to be split and processed.
train_indices (list): A list of indices used for training.
test_indices (list): A list of indices used for testing.
train_ratio (float): The ratio of data to be used for training.
val_ratio (float): The ratio of data to be used for validation.
Returns:
tuple: A tuple containing lists of training, validation,
and test samples.
"""
# Split the data into training, validation, and test sets
train_data, val_data, test_data = split_train_val_test(
data,
train_ratio,
val_ratio,
)
# Create training samples
train_samples = create_train_val_sample(train_data, train_indices)
write_to_file(OUTPUT_FILE, f"train samples = {len(train_samples)}\n")
# Create validation samples
val_samples = create_train_val_sample(val_data, train_indices)
write_to_file(OUTPUT_FILE, f"val samples = {len(val_samples)}\n")
# Create test samples
test_samples = create_test_sample(test_data, train_indices, test_indices)
write_to_file(OUTPUT_FILE, f"test samples = {len(test_samples)}\n")
return train_samples, val_samples, test_samples
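# Usage sketch (hypothetical indices/ratios): with train_ratio=0.8 and
# val_ratio=0.1, 100 parsed rows split roughly 80/10/10 before pair
# expansion:
#
#     train_s, val_s, test_s = distribute_data(data, [0, 1], [2], 0.8, 0.1)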
def convert_to_huggingface_with_multimodel(samples):
"""
Converts a list of samples to the Hugging Face Dataset format.
Args:
samples (list): A list of samples to be converted.
Returns:
Dataset: A Hugging Face Dataset object created from the samples.
"""
return Dataset.from_list(samples)
def train_by_transformer_with_multimodel_and_early_stop(
train_samples,
val_samples,
input_type,
):
"""
Trains a transformer model with multimodal data and early stopping.
Args:
train_samples (list): A list of training samples.
val_samples (list): A list of validation samples.
input_type (str): The type of input data (e.g., multimodal).
Returns:
object: The trained model with early stopping.
"""
# Convert training and validation samples to Hugging Face Dataset format
train_data = convert_to_huggingface_with_multimodel(train_samples)
val_data = convert_to_huggingface_with_multimodel(val_samples)
# Train the model with early stopping and return the trained model
return train_only_by_transformer_with_test_evaluation_early_stop(
train_data,
val_data,
input_type,
)
def test_by_transformer_with_multimodel(detector, test_samples, input_type):
"""
Tests a trained transformer model with multimodal data.
Args:
detector (object): The trained model to be evaluated.
test_samples (list): A list of test samples.
input_type (str): The type of input data (e.g., multimodal).
Returns:
None
"""
# Convert test samples to Hugging Face Dataset format
test_data = convert_to_huggingface_with_multimodel(test_samples)
# Apply the appropriate preprocessing function based on the input type
if input_type == MULTIMODEL:
test_data = test_data.map(preprocess_function_multimodel, batched=True)
elif input_type == SINGLE_FROM_MULTIMODEL:
test_data = test_data.map(
preprocess_function_single_from_multimodel,
batched=True,
)
# Evaluate the model on the test data
result = detector.evaluate(eval_dataset=test_data)
# Extract and log the ROC AUC score
roc_auc = result["eval_roc_auc"]
    write_to_file(OUTPUT_FILE, f"roc_auc: {roc_auc * 100.0:.1f}%\n")
def extract_by_feature_kind(samples, feature_type):
"""
Extracts features from the given samples based on the specified feature
type.
Args:
samples (list): A list of samples where each sample is a dictionary
with 'text' and 'label' keys.
feature_type (str): The type of feature to extract.
Returns:
tuple: A tuple containing the extracted features and corresponding
labels.
"""
text_1_list = []
text_2_list = []
labels = []
for sample in samples:
text_1_list.append(sample["text"][0])
text_2_list.append(sample["text"][1])
labels.append(sample["label"])
# Extract features in batch based on the feature type
features = extract_feature_in_batch(text_1_list, text_2_list, feature_type)
return features, labels
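# Sketch: only the first two texts of each sample are consumed
# (sample["text"][0] and sample["text"][1]), so this path effectively
# assumes samples built with a single training index:
#
#     features, labels = extract_by_feature_kind(samples, BART)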
def train_by_feature_kind(train_samples, feature_type):
"""
Trains a model using features extracted from the training samples based on
the specified feature type.
Args:
train_samples (list): A list of training samples where each sample is
a dictionary with 'text' and 'label' keys.
feature_type (str): The type of feature to extract for training.
Returns:
object: The trained model.
"""
# Extract features and labels from the training samples
features, labels = extract_by_feature_kind(train_samples, feature_type)
# Convert features to a numpy array and reshape for training
features = np.array(features)
features = features.reshape(-1, 1)
# Train the model using the extracted features and labels
model = abstract_train(features, labels)
return model
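# Usage sketch: BART features are one scalar per sample, hence the reshape
# to (-1, 1) before fitting the MLP:
#
#     detector = train_by_feature_kind(train_samples, BART)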
def test_by_feature_kind(detector, samples, feature_type):
"""
Tests a detector using features extracted from the provided samples based
on the specified feature type.
Args:
detector (object): The detector model to be evaluated.
samples (list): A list of samples where each sample is a dictionary
with 'text' and 'label' keys.
feature_type (str): The type of feature to extract for testing.
Returns:
None
"""
# Extract features and labels from the samples
features, labels = extract_by_feature_kind(samples, feature_type)
# Convert features to a numpy array and reshape for evaluation
features = np.array(features)
features = features.reshape(-1, 1)
# Evaluate the detector model using the extracted features and labels
evaluate_model(detector, features, labels)
def general_process_multimodels_train_val_test(
train_samples,
val_samples,
test_samples,
):
"""
General process for training, validating, and testing models using
multi-model and feature kind approaches.
Args:
train_samples (list): Training samples.
val_samples (list): Validation samples.
test_samples (list): Test samples.
Returns:
None
"""
# Multi-model approach
input_kind = MULTIMODEL
    write_to_file(OUTPUT_FILE, f"\nInput kind = {input_kind} \n")
# Train detector using multi-model with early stopping
detector = train_by_transformer_with_multimodel_and_early_stop(
train_samples,
val_samples,
input_kind,
)
# Evaluate on train set
write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n")
test_by_transformer_with_multimodel(detector, train_samples, input_kind)
# Evaluate on validation set
write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n")
test_by_transformer_with_multimodel(detector, val_samples, input_kind)
# Evaluate on test set
write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n")
test_by_transformer_with_multimodel(detector, test_samples, input_kind)
# Single from multi-model approach
input_kind = SINGLE_FROM_MULTIMODEL
    write_to_file(OUTPUT_FILE, f"\nInput kind = {input_kind} \n")
# Train detector using single from multi-model with early stopping
detector = train_by_transformer_with_multimodel_and_early_stop(
train_samples,
val_samples,
input_kind,
)
# Evaluate on train set
write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n")
test_by_transformer_with_multimodel(detector, train_samples, input_kind)
# Evaluate on validation set
write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n")
test_by_transformer_with_multimodel(detector, val_samples, input_kind)
# Evaluate on test set
write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n")
test_by_transformer_with_multimodel(detector, test_samples, input_kind)
# Feature kind approach
sample_length = len(train_samples[0]["text"])
    # A sample length of 2 (input text + one generation) indicates the
    # BART-score feature kind applies.
    if sample_length == 2:
        feature_kind = BART
        write_to_file(OUTPUT_FILE, f"\nFeature kind = {feature_kind} \n")
# Train detector using feature kind
detector = train_by_feature_kind(train_samples, feature_kind)
# Evaluate on train set
write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n")
test_by_feature_kind(detector, train_samples, feature_kind)
# Evaluate on validation set
write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n")
test_by_feature_kind(detector, val_samples, feature_kind)
# Evaluate on test set
write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n")
test_by_feature_kind(detector, test_samples, feature_kind)
def process_multi_models_with_validation(
multimodel_csv_file,
train_indices,
test_indices,
num_samples,
):
"""
Processes multi-model data with validation, training, and testing.
Args:
multimodel_csv_file (str): Path to the CSV file containing
multi-model data.
train_indices (list): Indices for the training data.
test_indices (list): Indices for the testing data.
num_samples (int): Number of samples to process.
Returns:
None
"""
# Log the details of the process
write_to_file(OUTPUT_FILE, f"PROCESSING FILE={multimodel_csv_file} \n")
write_to_file(OUTPUT_FILE, f"EXPERIMENT WITH {MODEL_NAME} model \n")
    write_to_file(
        OUTPUT_FILE,
        f"NUMBER OF MAX EPOCHS WITH EARLY STOPPING = "
        f"{NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING} \n",
    )
write_to_file(OUTPUT_FILE, f"PATIENCE = {PATIENCE} \n")
write_to_file(OUTPUT_FILE, f"OPTIMIZED METRIC = {OPTIMIZED_METRIC} \n")
write_to_file(OUTPUT_FILE, f"BATCH SIZE = {BATCH_SIZE} \n")
write_to_file(OUTPUT_FILE, f"Number of samples = {num_samples} \n")
# Read multi-model data from the CSV file
data = parse_multimodal_data(multimodel_csv_file)
# Limit data to the specified number of samples
data = data[:num_samples]
# Distribute data into training, validation, and testing sets
train_samples, val_samples, test_samples = distribute_data(
data,
train_indices,
test_indices,
TRAIN_RATIO,
VAL_RATIO,
)
# Log the training and testing indices
    write_to_file(
        OUTPUT_FILE,
        f"Multimodel training with train indices {train_indices}, "
        f"test with test indices {test_indices} \n",
    )
# Process the multi-models for training, validation, and testing
general_process_multimodels_train_val_test(
train_samples,
val_samples,
test_samples,
)
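# Usage sketch (hypothetical path and indices):
#
#     process_multi_models_with_validation(
#         "data/multimodel.csv",   # hypothetical CSV path
#         train_indices=[0, 1],
#         test_indices=[2],
#         num_samples=1000,
#     )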
def split_train_val_test(data, train_ratio, val_ratio):
"""
Splits the dataset into training, validation, and test sets based on
specified ratios.
Args:
data (list): The dataset to be split.
train_ratio (float): The ratio of the dataset to be used for training.
val_ratio (float): The ratio of the dataset to be used for validation.
Returns:
tuple: A tuple containing three lists
(train_data, val_data, test_data).
"""
# Calculate the number of samples for the training set
num_train_samples = int(len(data) * train_ratio)
# Calculate the number of samples for the validation set
num_val_samples = int(len(data) * val_ratio)
# Split the data into training, validation, and test sets
train_data = data[:num_train_samples]
val_data = data[num_train_samples : (num_train_samples + num_val_samples)]
test_data = data[(num_train_samples + num_val_samples) :]
return train_data, val_data, test_data
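# Worked example: len(data) == 10 with train_ratio=0.8 and val_ratio=0.1
# gives num_train_samples=8 and num_val_samples=1, so the splits are
# data[:8], data[8:9], and data[9:].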