import os
import shutil
from copy import deepcopy

import numpy as np
from config import (
    BART,
    BATCH_SIZE,
    HUMAN_LABEL,
    LEARNING_RATES,
    MACHINE_LABEL,
    MODEL_NAME,
    MULTIMODEL,
    NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING,
    OPTIMIZED_METRIC,
    PATIENCE,
    ROBERTA_MODEL_PATHS,
    SINGLE_FROM_MULTIMODEL,
    TRAIN_RATIO,
    VAL_RATIO,
    tokenizer,
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.neural_network import MLPClassifier
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)

from texts.bart_score import (
    bart_score_in_batch,
    extract_feature_in_batch,
)
from texts.config import OUTPUT_FILE
from texts.evaluation import compute_metrics
from texts.utils import (
    check_error,
    combine_text_with_BERT_format,
    parse_multimodal_data,
    write_to_file,
)


class TextDetector:
    def __init__(self) -> None:
        self.model = None
        self.multimodel = None
        self.train_data = None
        self.val_data = None
        self.test_data = None
        self.train_features = None
        self.val_features = None
        self.test_features = None


def text_analysis(text: str) -> float:
    """Placeholder analysis hook; currently returns a constant score of 0.0."""
    score = 0.0
    return score


class CustomCallback(TrainerCallback):
    """
    Custom callback to evaluate the training dataset at the end of each epoch.
    """

    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """
        At the end of each epoch, evaluate the training dataset.
        """
        if control.should_evaluate:
            # Copy the control object before the extra evaluation so the
            # original evaluate/save decisions for this epoch are preserved.
            control_copy = deepcopy(control)
            self._trainer.evaluate(
                eval_dataset=self._trainer.train_dataset,
                metric_key_prefix="train",
            )
            return control_copy


def abstract_train(features, labels):
    """
    Trains a model using the given features and labels.

    Args:
        features (list): The input features for training.
        labels (list): The target labels for training.

    Returns:
        object: The trained model.
    """
    model = MLPClassifier()
    model.fit(features, labels)
    return model


def evaluate_model(model, features, labels):
    """
    Evaluates the model's performance using accuracy and ROC AUC scores.

    Args:
        model (object): The trained model to evaluate.
        features (list): The input features for evaluation.
        labels (list): The target labels for evaluation.

    Returns:
        None
    """
    predictions = model.predict(features)
    rounded_predictions = [round(value) for value in predictions]
    accuracy = accuracy_score(labels, rounded_predictions)
    write_to_file(OUTPUT_FILE, f"Accuracy: {accuracy * 100.0:.1f}%\n")
    roc_auc = roc_auc_score(labels, rounded_predictions)
    write_to_file(OUTPUT_FILE, f"ROC AUC: {roc_auc * 100.0:.1f}%\n")
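

# Illustrative sketch, not part of the original pipeline: how abstract_train()
# and evaluate_model() fit together on one-dimensional feature vectors, mirroring
# the reshape(-1, 1) used by train_by_feature_kind() further below. The scores
# and labels are invented, and this assumes HUMAN_LABEL / MACHINE_LABEL are the
# integer class ids used elsewhere in this module.
def _example_feature_classifier():
    toy_scores = np.array([-3.2, -1.1, -2.8, -0.9]).reshape(-1, 1)
    toy_labels = [MACHINE_LABEL, HUMAN_LABEL, MACHINE_LABEL, HUMAN_LABEL]
    toy_model = abstract_train(toy_scores, toy_labels)
    evaluate_model(toy_model, toy_scores, toy_labels)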


def preprocess_function_multimodel(sample):
    """
    Preprocesses a given sample for a multi-model setup by calculating
    BART scores and formatting the text for BERT input.

    Args:
        sample (dict): A dictionary containing a key "text", which is a list
            of lists of strings.

    Returns:
        dict: A dictionary containing tokenized and preprocessed text data.
    """
    num_texts = len(sample["text"][0])  # Number of texts in each sub-sample
    # Initialize empty lists for grouping texts by index
    texts_grouped_by_index = [[] for _ in range(num_texts)]

    # Group texts by their index across sub-samples
    for sub_sample in sample["text"]:
        for i in range(num_texts):
            texts_grouped_by_index[i].append(sub_sample[i])

    # Calculate BART scores for each text pair (text[0] with text[i])
    bart_scores = [
        bart_score_in_batch(
            texts_grouped_by_index[0],
            texts_grouped_by_index[i],
        )
        for i in range(1, num_texts)
    ]

    combined_texts = []
    # Process each sub-sample for BERT input
    for index, sub_sample in enumerate(sample["text"]):
        text_array = [sub_sample[0]]  # Start with the input text
        score_generation_pairs = []

        # Pair scores with their corresponding generations
        for i in range(1, num_texts):
            generation_text = sub_sample[i]
            generation_score = bart_scores[i - 1][index]
            score_generation_pairs.append((generation_score, generation_text))

        # Sort pairs by score in descending order
        sorted_pairs = sorted(score_generation_pairs, reverse=True)

        # Append sorted texts to text_array
        for _, sorted_text in sorted_pairs:
            text_array.append(sorted_text)

        # Combine texts into a single BERT-formatted string
        combined_text = combine_text_with_BERT_format(text_array)
        combined_texts.append(combined_text)

    # Tokenize the combined texts for BERT
    return tokenizer(combined_texts, add_special_tokens=False, truncation=True)
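

# Illustrative only: the batched layout preprocess_function_multimodel() assumes,
# with invented strings. Index 0 of each row is the text being classified; the
# remaining entries are regenerations that get re-ordered by BARTScore before
# being packed into one BERT-formatted string. For example,
# preprocess_function_multimodel(_EXAMPLE_MULTIMODEL_BATCH) would score
# (A, A1), (A, A2), (B, B1), (B, B2) and tokenize one combined string per row.
_EXAMPLE_MULTIMODEL_BATCH = {
    "text": [
        ["candidate text A", "regeneration A1", "regeneration A2"],
        ["candidate text B", "regeneration B1", "regeneration B2"],
    ],
    "label": [MACHINE_LABEL, HUMAN_LABEL],
}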


def preprocess_function_single_from_multimodel(sample):
    """
    Extracts the first text from each sub-sample in a multi-model sample and
    tokenizes it.

    Args:
        sample (dict): A dictionary containing a key "text", which is a list
            of lists of strings.

    Returns:
        dict: A dictionary containing tokenized text data.
    """
    combined_texts = []
    # Iterate through each sub-sample and keep only its first (input) text
    for sub_sample in sample["text"]:
        input_text = sub_sample[0]
        combined_texts.append(input_text)

    # Tokenize the combined texts
    return tokenizer(combined_texts, truncation=True)


def train_only_by_transformer_with_test_evaluation_early_stop(
    train_data,
    test_data,
    input_type,
    num_classes=2,
):
    """
    Trains a transformer model using the provided training and testing
    datasets with early stopping.

    Args:
        train_data (Dataset): The training dataset.
        test_data (Dataset): The testing dataset.
        input_type (str): The type of input data, either MULTIMODEL or
            SINGLE_FROM_MULTIMODEL.
        num_classes (int, optional): The number of classes for classification.
            Defaults to 2.

    Returns:
        Trainer: The trained model wrapped in a Trainer object.
    """
    # Preprocess datasets based on the input type
    if input_type == MULTIMODEL:
        train_data = train_data.map(preprocess_function_multimodel, batched=True)
        test_data = test_data.map(preprocess_function_multimodel, batched=True)
    elif input_type == SINGLE_FROM_MULTIMODEL:
        train_data = train_data.map(
            preprocess_function_single_from_multimodel,
            batched=True,
        )
        test_data = test_data.map(
            preprocess_function_single_from_multimodel,
            batched=True,
        )

    # Data collator to pad inputs
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Load the appropriate model based on the number of classes
    if num_classes == 3:
        model = AutoModelForSequenceClassification.from_pretrained(
            "pretrained_model/roberta-base_num_labels_3",
            num_labels=num_classes,
        )
    else:
        model = AutoModelForSequenceClassification.from_pretrained(
            ROBERTA_MODEL_PATHS[MODEL_NAME],
            num_labels=num_classes,
        )

    learning_rate = LEARNING_RATES[MODEL_NAME]
    output_folder = "training_with_callbacks"
    # Remove the output folder if it already exists
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_folder,
        evaluation_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING,
        weight_decay=0.01,
        push_to_hub=False,
        metric_for_best_model=OPTIMIZED_METRIC,
        load_best_model_at_end=True,
    )

    # Create the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)],
    )
    # Add custom callback that also evaluates the training set each epoch
    trainer.add_callback(CustomCallback(trainer))

    # Start training
    trainer.train()
    return trainer


def create_pair_sample(data_item, training_indices):
    """
    Creates pair samples for training by comparing human data with
    machine-generated data.

    Args:
        data_item (dict): A dictionary containing 'human', 'single',
            and 'pair' data.
        training_indices (list): A list of indices used for training.

    Returns:
        list: A list of dictionaries, each containing a 'text' array
            and a 'label'.
    """
    # Initialize the result list
    result_samples = []

    # Check if there is any error in the data_item
    if check_error(data_item):
        return result_samples

    # Create machine samples
    for train_idx in training_indices:
        if data_item["human"] != data_item["single"][train_idx]:
            text_array = []
            machine_text = data_item["single"][train_idx]
            text_array.append(machine_text)
            for sub_idx in training_indices:
                text_array.append(data_item["pair"][train_idx][sub_idx])
            sample = {
                "text": text_array,
                "label": MACHINE_LABEL,
            }
            result_samples.append(sample)

    # Create the human sample
    text_array = [data_item["human"]]
    for train_idx in training_indices:
        text_array.append(data_item["single"][train_idx])
    human_sample = {
        "text": text_array,
        "label": HUMAN_LABEL,
    }

    # Append the human sample once for each machine sample
    num_machine_samples = len(result_samples)
    for _ in range(num_machine_samples):
        result_samples.append(human_sample)

    return result_samples
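

# Illustrative sketch with invented values: the data_item layout that
# create_pair_sample() indexes into. The exact semantics of "pair" come from the
# dataset, not this file; the indexing only requires that "pair"[i][j] is the
# text paired with "single"[i] for index j.
def _example_pair_sample_usage():
    data_item = {
        "human": "A human-written answer.",
        "single": ["generation 0", "generation 1"],
        "pair": [
            ["pair text for single[0], index 0", "pair text for single[0], index 1"],
            ["pair text for single[1], index 0", "pair text for single[1], index 1"],
        ],
    }
    return create_pair_sample(data_item, training_indices=[0, 1])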


def create_pair_test_sample(data_item, training_indices, testing_indices):
    """
    Creates pair test samples by comparing human data with
    machine-generated data.

    Args:
        data_item (dict): A dictionary containing 'human', 'single', and
            'pair' data.
        training_indices (list): A list of indices used for training.
        testing_indices (list): A list of indices used for testing.

    Returns:
        list: A list of dictionaries, each containing a 'text' array and a
            'label'.
    """
    # Initialize the result list
    result_samples = []

    # Check if there is any error in the data_item
    if check_error(data_item):
        return result_samples

    # Create machine samples based on testing indices
    for test_idx in testing_indices:
        if data_item["human"] != data_item["single"][test_idx]:
            text_array = []
            machine_text = data_item["single"][test_idx]
            text_array.append(machine_text)
            for train_idx in training_indices:
                text_array.append(data_item["pair"][test_idx][train_idx])
            sample = {
                "text": text_array,
                "label": MACHINE_LABEL,
            }
            result_samples.append(sample)

    # Create the human sample
    text_array = [data_item["human"]]
    for train_idx in training_indices:
        text_array.append(data_item["single"][train_idx])
    human_sample = {
        "text": text_array,
        "label": HUMAN_LABEL,
    }

    # Append the human sample once for each machine sample
    num_machine_samples = len(result_samples)
    for _ in range(num_machine_samples):
        result_samples.append(human_sample)

    return result_samples


def create_train_val_sample(data, training_indices):
    """
    Creates training and validation samples from the provided data.

    Args:
        data (list): A list of data items, each to be processed.
        training_indices (list): A list of indices used for training.

    Returns:
        list: A list of training and validation samples created from the data.
    """
    # Initialize the result list
    result_samples = []

    # Process each item in the data
    for data_item in data:
        # Create pair samples for the current item
        sub_samples = create_pair_sample(data_item, training_indices)
        # Extend the result list with the created sub-samples
        result_samples.extend(sub_samples)

    return result_samples


def create_test_sample(data, training_indices, testing_indices):
    """
    Creates test samples from the provided data by comparing human data with
    machine-generated data.

    Args:
        data (list): A list of data items, each to be processed.
        training_indices (list): A list of indices used for training.
        testing_indices (list): A list of indices used for testing.

    Returns:
        list: A list of test samples created from the data.
    """
    # Initialize the result list
    result_samples = []

    # Process each item in the data
    for data_item in data:
        # Create pair test samples for the current item
        sub_samples = create_pair_test_sample(
            data_item,
            training_indices,
            testing_indices,
        )
        # Extend the result list with the created sub-samples
        result_samples.extend(sub_samples)

    return result_samples


def distribute_data(data, train_indices, test_indices, train_ratio, val_ratio):
    """
    Distributes the data into training, validation, and test samples.

    Args:
        data (list): A list of data items to be split and processed.
        train_indices (list): A list of indices used for training.
        test_indices (list): A list of indices used for testing.
        train_ratio (float): The ratio of data to be used for training.
        val_ratio (float): The ratio of data to be used for validation.

    Returns:
        tuple: A tuple containing lists of training, validation,
            and test samples.
    """
    # Split the data into training, validation, and test sets
    train_data, val_data, test_data = split_train_val_test(
        data,
        train_ratio,
        val_ratio,
    )

    # Create training samples
    train_samples = create_train_val_sample(train_data, train_indices)
    write_to_file(OUTPUT_FILE, f"train samples = {len(train_samples)}\n")

    # Create validation samples
    val_samples = create_train_val_sample(val_data, train_indices)
    write_to_file(OUTPUT_FILE, f"val samples = {len(val_samples)}\n")

    # Create test samples
    test_samples = create_test_sample(test_data, train_indices, test_indices)
    write_to_file(OUTPUT_FILE, f"test samples = {len(test_samples)}\n")

    return train_samples, val_samples, test_samples


def convert_to_huggingface_with_multimodel(samples):
    """
    Converts a list of samples to the Hugging Face Dataset format.

    Args:
        samples (list): A list of samples to be converted.

    Returns:
        Dataset: A Hugging Face Dataset object created from the samples.
    """
    return Dataset.from_list(samples)


def train_by_transformer_with_multimodel_and_early_stop(
    train_samples,
    val_samples,
    input_type,
):
    """
    Trains a transformer model with multimodal data and early stopping.

    Args:
        train_samples (list): A list of training samples.
        val_samples (list): A list of validation samples.
        input_type (str): The type of input data (e.g., multimodal).

    Returns:
        object: The trained model with early stopping.
    """
    # Convert training and validation samples to Hugging Face Dataset format
    train_data = convert_to_huggingface_with_multimodel(train_samples)
    val_data = convert_to_huggingface_with_multimodel(val_samples)

    # Train the model with early stopping and return the trained model
    return train_only_by_transformer_with_test_evaluation_early_stop(
        train_data,
        val_data,
        input_type,
    )
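

# Illustrative only: a minimal call of the wrapper above with invented samples.
# Real samples come from distribute_data(); running this would download the
# configured RoBERTa checkpoint and launch an actual training run.
def _example_multimodel_training_run():
    toy_train = [
        {"text": ["machine text", "regen 1", "regen 2"], "label": MACHINE_LABEL},
        {"text": ["human text", "regen 1", "regen 2"], "label": HUMAN_LABEL},
    ]
    toy_val = list(toy_train)
    return train_by_transformer_with_multimodel_and_early_stop(
        toy_train,
        toy_val,
        MULTIMODEL,
    )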


def test_by_transformer_with_multimodel(detector, test_samples, input_type):
    """
    Tests a trained transformer model with multimodal data.

    Args:
        detector (object): The trained model to be evaluated.
        test_samples (list): A list of test samples.
        input_type (str): The type of input data (e.g., multimodal).

    Returns:
        None
    """
    # Convert test samples to Hugging Face Dataset format
    test_data = convert_to_huggingface_with_multimodel(test_samples)

    # Apply the appropriate preprocessing function based on the input type
    if input_type == MULTIMODEL:
        test_data = test_data.map(preprocess_function_multimodel, batched=True)
    elif input_type == SINGLE_FROM_MULTIMODEL:
        test_data = test_data.map(
            preprocess_function_single_from_multimodel,
            batched=True,
        )

    # Evaluate the model on the test data
    result = detector.evaluate(eval_dataset=test_data)

    # Extract and log the ROC AUC score
    roc_auc = result["eval_roc_auc"]
    write_to_file(OUTPUT_FILE, f"roc_auc: {roc_auc * 100.0:.1f}%\n")


def extract_by_feature_kind(samples, feature_type):
    """
    Extracts features from the given samples based on the specified feature
    type.

    Args:
        samples (list): A list of samples where each sample is a dictionary
            with 'text' and 'label' keys.
        feature_type (str): The type of feature to extract.

    Returns:
        tuple: A tuple containing the extracted features and corresponding
            labels.
    """
    text_1_list = []
    text_2_list = []
    labels = []
    for sample in samples:
        text_1_list.append(sample["text"][0])
        text_2_list.append(sample["text"][1])
        labels.append(sample["label"])

    # Extract features in batch based on the feature type
    features = extract_feature_in_batch(text_1_list, text_2_list, feature_type)
    return features, labels
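

# Illustrative only, with invented strings: the feature-kind path compares exactly
# two texts per sample (text[0] vs. text[1]), which is why
# general_process_multimodels_train_val_test() only enables it when each sample
# holds two texts.
def _example_feature_kind_extraction():
    toy_samples = [
        {"text": ["machine text", "its regeneration"], "label": MACHINE_LABEL},
        {"text": ["human text", "its regeneration"], "label": HUMAN_LABEL},
    ]
    return extract_by_feature_kind(toy_samples, BART)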


def train_by_feature_kind(train_samples, feature_type):
    """
    Trains a model using features extracted from the training samples based on
    the specified feature type.

    Args:
        train_samples (list): A list of training samples where each sample is
            a dictionary with 'text' and 'label' keys.
        feature_type (str): The type of feature to extract for training.

    Returns:
        object: The trained model.
    """
    # Extract features and labels from the training samples
    features, labels = extract_by_feature_kind(train_samples, feature_type)

    # Convert features to a numpy array and reshape into a single column
    features = np.array(features)
    features = features.reshape(-1, 1)

    # Train the model using the extracted features and labels
    model = abstract_train(features, labels)
    return model


def test_by_feature_kind(detector, samples, feature_type):
    """
    Tests a detector using features extracted from the provided samples based
    on the specified feature type.

    Args:
        detector (object): The detector model to be evaluated.
        samples (list): A list of samples where each sample is a dictionary
            with 'text' and 'label' keys.
        feature_type (str): The type of feature to extract for testing.

    Returns:
        None
    """
    # Extract features and labels from the samples
    features, labels = extract_by_feature_kind(samples, feature_type)

    # Convert features to a numpy array and reshape for evaluation
    features = np.array(features)
    features = features.reshape(-1, 1)

    # Evaluate the detector model using the extracted features and labels
    evaluate_model(detector, features, labels)


def general_process_multimodels_train_val_test(
    train_samples,
    val_samples,
    test_samples,
):
    """
    General process for training, validating, and testing models using
    multi-model and feature kind approaches.

    Args:
        train_samples (list): Training samples.
        val_samples (list): Validation samples.
        test_samples (list): Test samples.

    Returns:
        None
    """
    # Multi-model approach
    input_kind = MULTIMODEL
    write_to_file(OUTPUT_FILE, f"\nInput kind = {input_kind} \n")

    # Train detector using multi-model input with early stopping
    detector = train_by_transformer_with_multimodel_and_early_stop(
        train_samples,
        val_samples,
        input_kind,
    )
    # Evaluate on train set
    write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n")
    test_by_transformer_with_multimodel(detector, train_samples, input_kind)
    # Evaluate on validation set
    write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n")
    test_by_transformer_with_multimodel(detector, val_samples, input_kind)
    # Evaluate on test set
    write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n")
    test_by_transformer_with_multimodel(detector, test_samples, input_kind)

    # Single-text-from-multi-model approach
    input_kind = SINGLE_FROM_MULTIMODEL
    write_to_file(OUTPUT_FILE, f"\nInput kind = {input_kind} \n")

    # Train detector using only the first text of each sample, with early stopping
    detector = train_by_transformer_with_multimodel_and_early_stop(
        train_samples,
        val_samples,
        input_kind,
    )
    # Evaluate on train set
    write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n")
    test_by_transformer_with_multimodel(detector, train_samples, input_kind)
    # Evaluate on validation set
    write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n")
    test_by_transformer_with_multimodel(detector, val_samples, input_kind)
    # Evaluate on test set
    write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n")
    test_by_transformer_with_multimodel(detector, test_samples, input_kind)

    # Feature kind approach
    sample_length = len(train_samples[0]["text"])
    if sample_length == 2:  # Two texts per sample, so the BART feature applies
        feature_kind = BART
        write_to_file(OUTPUT_FILE, f"\nFeature kind = {feature_kind} \n")

        # Train detector using the feature kind
        detector = train_by_feature_kind(train_samples, feature_kind)
        # Evaluate on train set
        write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n")
        test_by_feature_kind(detector, train_samples, feature_kind)
        # Evaluate on validation set
        write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n")
        test_by_feature_kind(detector, val_samples, feature_kind)
        # Evaluate on test set
        write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n")
        test_by_feature_kind(detector, test_samples, feature_kind)


def process_multi_models_with_validation(
    multimodel_csv_file,
    train_indices,
    test_indices,
    num_samples,
):
    """
    Processes multi-model data with validation, training, and testing.

    Args:
        multimodel_csv_file (str): Path to the CSV file containing
            multi-model data.
        train_indices (list): Indices for the training data.
        test_indices (list): Indices for the testing data.
        num_samples (int): Number of samples to process.

    Returns:
        None
    """
    # Log the details of the process
    write_to_file(OUTPUT_FILE, f"PROCESSING FILE={multimodel_csv_file} \n")
    write_to_file(OUTPUT_FILE, f"EXPERIMENT WITH {MODEL_NAME} model \n")
    write_to_file(
        OUTPUT_FILE,
        "NUMBER OF MAX EPOCHS WITH EARLY STOPPING = "
        f"{NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING} \n",
    )
    write_to_file(OUTPUT_FILE, f"PATIENCE = {PATIENCE} \n")
    write_to_file(OUTPUT_FILE, f"OPTIMIZED METRIC = {OPTIMIZED_METRIC} \n")
    write_to_file(OUTPUT_FILE, f"BATCH SIZE = {BATCH_SIZE} \n")
    write_to_file(OUTPUT_FILE, f"Number of samples = {num_samples} \n")

    # Read multi-model data from the CSV file
    data = parse_multimodal_data(multimodel_csv_file)
    # Limit data to the specified number of samples
    data = data[:num_samples]

    # Distribute data into training, validation, and testing sets
    train_samples, val_samples, test_samples = distribute_data(
        data,
        train_indices,
        test_indices,
        TRAIN_RATIO,
        VAL_RATIO,
    )

    # Log the training and testing indices
    write_to_file(
        OUTPUT_FILE,
        f"Multimodel training with train indices {train_indices}, "
        f"test with test indices {test_indices} \n",
    )

    # Process the multi-models for training, validation, and testing
    general_process_multimodels_train_val_test(
        train_samples,
        val_samples,
        test_samples,
    )
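

# Illustrative only: one way the entry point above might be driven. The CSV path,
# indices, and sample count are placeholders, not values from this repository.
def _example_entry_point_call():
    process_multi_models_with_validation(
        "data/multimodel_samples.csv",  # hypothetical path
        train_indices=[0, 1],
        test_indices=[2],
        num_samples=1000,
    )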


def split_train_val_test(data, train_ratio, val_ratio):
    """
    Splits the dataset into training, validation, and test sets based on
    specified ratios.

    Args:
        data (list): The dataset to be split.
        train_ratio (float): The ratio of the dataset to be used for training.
        val_ratio (float): The ratio of the dataset to be used for validation.

    Returns:
        tuple: A tuple containing three lists
            (train_data, val_data, test_data).
    """
    # Calculate the number of samples for the training set
    num_train_samples = int(len(data) * train_ratio)
    # Calculate the number of samples for the validation set
    num_val_samples = int(len(data) * val_ratio)

    # Split the data into training, validation, and test sets
    train_data = data[:num_train_samples]
    val_data = data[num_train_samples : (num_train_samples + num_val_samples)]
    test_data = data[(num_train_samples + num_val_samples):]

    return train_data, val_data, test_data
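

# Illustrative only: with ratios 0.8 / 0.1, ten items split into 8 train,
# 1 validation, and 1 test item, preserving the original order (the real ratios
# come from TRAIN_RATIO and VAL_RATIO in the config module).
def _example_split():
    items = list(range(10))
    train, val, test = split_train_val_test(items, train_ratio=0.8, val_ratio=0.1)
    assert (len(train), len(val), len(test)) == (8, 1, 1)
    return train, val, test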