import os
import shutil
from copy import deepcopy

import numpy as np
from config import (
    BART,
    BATCH_SIZE,
    HUMAN_LABEL,
    LEARNING_RATES,
    MACHINE_LABEL,
    MODEL_NAME,
    MULTIMODEL,
    NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING,
    OPTIMIZED_METRIC,
    PATIENCE,
    ROBERTA_MODEL_PATHS,
    SINGLE_FROM_MULTIMODEL,
    TRAIN_RATIO,
    VAL_RATIO,
    tokenizer,
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.neural_network import MLPClassifier
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)

from texts.bart_score import (
    bart_score_in_batch,
    extract_feature_in_batch,
)
from texts.config import OUTPUT_FILE
from texts.evaluation import compute_metrics
from texts.utils import (
    check_error,
    combine_text_with_BERT_format,
    parse_multimodal_data,
    write_to_file,
)


class TextDetector:
    def __init__(self) -> None:
        self.model = None
        self.multimodel = None
        self.train_data = None
        self.val_data = None
        self.test_data = None
        self.train_features = None
        self.val_features = None
        self.test_features = None

    def text_analysis(self, text: str) -> float:
        # Placeholder score; no analysis is implemented yet.
        score = 0.0
        return score


class CustomCallback(TrainerCallback):
    """
    Custom callback to evaluate the training dataset at the end of each epoch.
    """

    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """
        At the end of each epoch, evaluate the training dataset.
        """
        if control.should_evaluate:
            control_copy = deepcopy(control)
            self._trainer.evaluate(
                eval_dataset=self._trainer.train_dataset,
                metric_key_prefix="train",
            )
            return control_copy


def abstract_train(features, labels):
    """
    Trains a model using the given features and labels.

    Args:
        features (list): The input features for training.
        labels (list): The target labels for training.

    Returns:
        object: The trained model.
    """
    model = MLPClassifier()
    model.fit(features, labels)
    return model


def evaluate_model(model, features, labels):
    """
    Evaluates the model's performance using accuracy and ROC AUC scores.

    Args:
        model (object): The trained model to evaluate.
        features (list): The input features for evaluation.
        labels (list): The target labels for evaluation.

    Returns:
        None
    """
    predictions = model.predict(features)
    rounded_predictions = [round(value) for value in predictions]

    accuracy = accuracy_score(labels, rounded_predictions)
    write_to_file(OUTPUT_FILE, f"Accuracy: {accuracy * 100.0:.1f}%\n")

    roc_auc = roc_auc_score(labels, rounded_predictions)
    write_to_file(OUTPUT_FILE, f"ROC AUC: {roc_auc * 100.0:.1f}%\n")


def preprocess_function_multimodel(sample):
    """
    Preprocesses a given sample for a multi-model setup by calculating
        BART scores and formatting the text for BERT input.

    Args:
        sample (dict): A dictionary containing a key "text", which is a list of
            lists of strings.

    Returns:
        dict: A dictionary containing tokenized and preprocessed text data.
    """
    num_texts = len(sample["text"][0])  # Number of texts in each sub-sample
    texts_grouped_by_index = [
        [] for _ in range(num_texts)
    ]  # Initialize empty lists for grouping texts by index

    # Group texts by their index across sub-samples
    for sub_sample in sample["text"]:
        for i in range(num_texts):
            texts_grouped_by_index[i].append(sub_sample[i])

    # Calculate BART scores for each text pair (text[0] with text[i])
    bart_scores = [
        bart_score_in_batch(
            texts_grouped_by_index[0],
            texts_grouped_by_index[i],
        )
        for i in range(1, num_texts)
    ]

    combined_texts = []

    # Process each sub-sample for BERT input
    for index, sub_sample in enumerate(sample["text"]):
        text_array = [sub_sample[0]]  # Start with the input text
        score_generation_pairs = []

        # Pair scores with their corresponding generations
        for i in range(1, num_texts):
            generation_text = sub_sample[i]
            generation_score = bart_scores[i - 1][index]
            score_generation_pairs.append((generation_score, generation_text))

        # Sort pairs by score in descending order
        sorted_pairs = sorted(score_generation_pairs, reverse=True)

        # Append sorted texts to text_array
        for _, sorted_text in sorted_pairs:
            text_array.append(sorted_text)

        # Combine texts into a single BERT-formatted string
        combined_text = combine_text_with_BERT_format(text_array)
        combined_texts.append(combined_text)

    # Tokenize the combined texts for BERT
    return tokenizer(combined_texts, add_special_tokens=False, truncation=True)
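

def _demo_preprocess_multimodel():
    """Hedged usage sketch with hypothetical data (not part of the pipeline).

    Assumes each sub-sample is [input_text, generation_1, generation_2, ...];
    the preprocessor scores every generation against the input text with
    BARTScore, sorts generations by descending score, and emits one
    BERT-formatted sequence per sub-sample.
    """
    sample = {
        "text": [
            ["the input text", "machine generation A", "machine generation B"],
            ["another input", "generation A2", "generation B2"],
        ],
    }
    # Returns tokenizer output (input_ids, attention_mask, ...) for the
    # combined, score-ordered texts.
    return preprocess_function_multimodel(sample)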


def preprocess_function_single_from_multimodel(sample):
    """
    Extracts the first text from each sub-sample in a multi-model sample and
        tokenizes it.

    Args:
        sample (dict): A dictionary containing a key "text", which is a list of
            lists of strings.

    Returns:
        dict: A dictionary containing tokenized text data.
    """
    combined_texts = []

    # Iterate through each sub-sample
    for sub_sample in sample["text"]:
        # Extract the first text from the sub-sample and collect it
        input_text = sub_sample[0]
        combined_texts.append(input_text)

    # Tokenize the combined texts
    return tokenizer(combined_texts, truncation=True)


def train_only_by_transformer_with_test_evaluation_early_stop(
    train_data,
    test_data,
    input_type,
    num_classes=2,
):
    """
    Trains a transformer model using the provided training and testing
        datasets with early stopping.

    Args:
        train_data (Dataset): The training dataset.
        test_data (Dataset): The testing dataset.
        input_type (str): The type of input data, either MULTIMODEL or
            SINGLE_FROM_MULTIMODEL.
        num_classes (int, optional): The number of classes for classification.
            Defaults to 2.

    Returns:
        Trainer: The trained model wrapped in a Trainer object.
    """
    # Preprocess datasets based on the input type
    if input_type == MULTIMODEL:
        train_data = train_data.map(
            preprocess_function_multimodel,
            batched=True,
        )
        test_data = test_data.map(preprocess_function_multimodel, batched=True)
    elif input_type == SINGLE_FROM_MULTIMODEL:
        train_data = train_data.map(
            preprocess_function_single_from_multimodel,
            batched=True,
        )
        test_data = test_data.map(
            preprocess_function_single_from_multimodel,
            batched=True,
        )

    # Data collator to pad inputs
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Load appropriate model based on number of classes
    if num_classes == 3:
        model = AutoModelForSequenceClassification.from_pretrained(
            "pretrained_model/roberta-base_num_labels_3",
            num_labels=num_classes,
        )
    else:
        model = AutoModelForSequenceClassification.from_pretrained(
            ROBERTA_MODEL_PATHS[MODEL_NAME],
            num_labels=num_classes,
        )

    learning_rate = LEARNING_RATES[MODEL_NAME]
    output_folder = "training_with_callbacks"

    # Remove the output folder if it already exists
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_folder,
        evaluation_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING,
        weight_decay=0.01,
        push_to_hub=False,
        metric_for_best_model=OPTIMIZED_METRIC,
        load_best_model_at_end=True,
    )

    # Create Trainer object
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)],
    )

    # Add custom callback
    trainer.add_callback(CustomCallback(trainer))

    # Start training
    trainer.train()

    return trainer
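

# Hedged usage sketch (hypothetical dataset variables): the Trainer returned
# above is reused later for evaluation, e.g.
#
#     trainer = train_only_by_transformer_with_test_evaluation_early_stop(
#         train_data, val_data, MULTIMODEL
#     )
#     metrics = trainer.evaluate(eval_dataset=val_data)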


def create_pair_sample(data_item, training_indices):
    """
    Creates pair samples for training by comparing human data with
        machine-generated data.

    Args:
        data_item (dict): A dictionary containing 'human', 'single',
            and 'pair' data.
        training_indices (list): A list of indices used for training.

    Returns:
        list: A list of dictionaries, each containing a 'text' array
            and a 'label'.
    """
    # Initialize the result list
    result_samples = []

    # Check if there is any error in the data_item
    if check_error(data_item):
        return result_samples

    # Create machine samples
    for train_idx in training_indices:
        if data_item["human"] != data_item["single"][train_idx]:
            text_array = []
            machine_text = data_item["single"][train_idx]
            text_array.append(machine_text)

            for sub_idx in training_indices:
                text_array.append(data_item["pair"][train_idx][sub_idx])

            sample = {
                "text": text_array,
                "label": MACHINE_LABEL,
            }
            result_samples.append(sample)

    # Create human samples
    text_array = [data_item["human"]]

    for train_idx in training_indices:
        text_array.append(data_item["single"][train_idx])

    human_sample = {
        "text": text_array,
        "label": HUMAN_LABEL,
    }

    # Append human samples for each machine sample
    num_machine_samples = len(result_samples)
    for _ in range(num_machine_samples):
        result_samples.append(human_sample)

    return result_samples
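

def _demo_create_pair_sample():
    """Hedged sketch of the data_item layout consumed above (hypothetical).

    Assumes 'single' holds one standalone generation per model and
    'pair'[i][j] the text model j produced when regenerating generation i.
    """
    data_item = {
        "human": "a human-written paragraph",
        "single": ["generation by model 0", "generation by model 1"],
        "pair": [
            ["model 0 re-gen of gen 0", "model 1 re-gen of gen 0"],
            ["model 0 re-gen of gen 1", "model 1 re-gen of gen 1"],
        ],
    }
    # Expect one MACHINE_LABEL sample per machine text differing from the
    # human text, plus one matching HUMAN_LABEL sample for each.
    return create_pair_sample(data_item, training_indices=[0, 1])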


def create_pair_test_sample(data_item, training_indices, testing_indices):
    """
    Creates pair test samples by comparing human data with
        machine-generated data.

    Args:
        data_item (dict): A dictionary containing 'human', 'single', and
            'pair' data.
        training_indices (list): A list of indices used for training.
        testing_indices (list): A list of indices used for testing.

    Returns:
        list: A list of dictionaries, each containing a 'text' array and a
            'label'.
    """
    # Initialize the result list
    result_samples = []

    # Check if there is any error in the data_item
    if check_error(data_item):
        return result_samples

    # Create machine samples based on testing indices
    for test_idx in testing_indices:
        if data_item["human"] != data_item["single"][test_idx]:
            text_array = []
            machine_text = data_item["single"][test_idx]
            text_array.append(machine_text)

            for train_idx in training_indices:
                text_array.append(data_item["pair"][test_idx][train_idx])

            sample = {
                "text": text_array,
                "label": MACHINE_LABEL,
            }
            result_samples.append(sample)

    # Create human sample
    text_array = [data_item["human"]]

    for train_idx in training_indices:
        text_array.append(data_item["single"][train_idx])

    human_sample = {
        "text": text_array,
        "label": HUMAN_LABEL,
    }

    # Append the human sample for each machine sample
    num_machine_samples = len(result_samples)
    for _ in range(num_machine_samples):
        result_samples.append(human_sample)

    return result_samples


def create_train_val_sample(data, training_indices):
    """
    Creates training and validation samples from the provided data.

    Args:
        data (list): A list of data items, each to be processed.
        training_indices (list): A list of indices used for training.

    Returns:
        list: A list of training and validation samples created from the data.
    """
    # Initialize the result list
    result_samples = []

    # Process each item in the data
    for data_item in data:
        # Create pair samples for the current item
        sub_samples = create_pair_sample(data_item, training_indices)

        # Extend the result list with the created sub-samples
        result_samples.extend(sub_samples)

    return result_samples


def create_test_sample(data, training_indices, testing_indices):
    """
    Creates test samples from the provided data by comparing human data with
        machine-generated data.

    Args:
        data (list): A list of data items, each to be processed.
        training_indices (list): A list of indices used for training.
        testing_indices (list): A list of indices used for testing.

    Returns:
        list: A list of test samples created from the data.
    """
    # Initialize the result list
    result_samples = []

    # Process each item in the data
    for data_item in data:
        # Create pair test samples for the current item
        sub_samples = create_pair_test_sample(
            data_item,
            training_indices,
            testing_indices,
        )

        # Extend the result list with the created sub-samples
        result_samples.extend(sub_samples)

    return result_samples


def distribute_data(data, train_indices, test_indices, train_ratio, val_ratio):
    """
    Distributes the data into training, validation, and test samples.

    Args:
        data (list): A list of data items to be split and processed.
        train_indices (list): A list of indices used for training.
        test_indices (list): A list of indices used for testing.
        train_ratio (float): The ratio of data to be used for training.
        val_ratio (float): The ratio of data to be used for validation.

    Returns:
        tuple: A tuple containing lists of training, validation,
            and test samples.
    """
    # Split the data into training, validation, and test sets
    train_data, val_data, test_data = split_train_val_test(
        data,
        train_ratio,
        val_ratio,
    )

    # Create training samples
    train_samples = create_train_val_sample(train_data, train_indices)
    write_to_file(OUTPUT_FILE, f"train samples = {len(train_samples)}\n")

    # Create validation samples
    val_samples = create_train_val_sample(val_data, train_indices)
    write_to_file(OUTPUT_FILE, f"val samples = {len(val_samples)}\n")

    # Create test samples
    test_samples = create_test_sample(test_data, train_indices, test_indices)
    write_to_file(OUTPUT_FILE, f"test samples = {len(test_samples)}\n")

    return train_samples, val_samples, test_samples


def convert_to_huggingface_with_multimodel(samples):
    """
    Converts a list of samples to the Hugging Face Dataset format.

    Args:
        samples (list): A list of samples to be converted.

    Returns:
        Dataset: A Hugging Face Dataset object created from the samples.
    """
    return Dataset.from_list(samples)


def train_by_transformer_with_multimodel_and_early_stop(
    train_samples,
    val_samples,
    input_type,
):
    """
    Trains a transformer model with multimodal data and early stopping.

    Args:
        train_samples (list): A list of training samples.
        val_samples (list): A list of validation samples.
        input_type (str): The type of input data (e.g., multimodal).

    Returns:
        object: The trained model with early stopping.
    """
    # Convert training and validation samples to Hugging Face Dataset format
    train_data = convert_to_huggingface_with_multimodel(train_samples)
    val_data = convert_to_huggingface_with_multimodel(val_samples)

    # Train the model with early stopping and return the trained model
    return train_only_by_transformer_with_test_evaluation_early_stop(
        train_data,
        val_data,
        input_type,
    )


def test_by_transformer_with_multimodel(detector, test_samples, input_type):
    """
    Tests a trained transformer model with multimodal data.

    Args:
        detector (object): The trained model to be evaluated.
        test_samples (list): A list of test samples.
        input_type (str): The type of input data (e.g., multimodal).

    Returns:
        None
    """
    # Convert test samples to Hugging Face Dataset format
    test_data = convert_to_huggingface_with_multimodel(test_samples)

    # Apply the appropriate preprocessing function based on the input type
    if input_type == MULTIMODEL:
        test_data = test_data.map(preprocess_function_multimodel, batched=True)
    elif input_type == SINGLE_FROM_MULTIMODEL:
        test_data = test_data.map(
            preprocess_function_single_from_multimodel,
            batched=True,
        )

    # Evaluate the model on the test data
    result = detector.evaluate(eval_dataset=test_data)

    # Extract and log the ROC AUC score
    roc_auc = result["eval_roc_auc"]
    write_to_file(OUTPUT_FILE, "roc_auc: %.1f%%" % (roc_auc * 100.0) + "\n")


def extract_by_feature_kind(samples, feature_type):
    """
    Extracts features from the given samples based on the specified feature
        type.

    Args:
        samples (list): A list of samples where each sample is a dictionary
            with 'text' and 'label' keys.
        feature_type (str): The type of feature to extract.

    Returns:
        tuple: A tuple containing the extracted features and corresponding
            labels.
    """
    text_1_list = []
    text_2_list = []
    labels = []

    for sample in samples:
        text_1_list.append(sample["text"][0])
        text_2_list.append(sample["text"][1])
        labels.append(sample["label"])

    # Extract features in batch based on the feature type
    features = extract_feature_in_batch(text_1_list, text_2_list, feature_type)

    return features, labels


def train_by_feature_kind(train_samples, feature_type):
    """
    Trains a model using features extracted from the training samples based on
        the specified feature type.

    Args:
        train_samples (list): A list of training samples where each sample is
            a dictionary with 'text' and 'label' keys.
        feature_type (str): The type of feature to extract for training.

    Returns:
        object: The trained model.
    """
    # Extract features and labels from the training samples
    features, labels = extract_by_feature_kind(train_samples, feature_type)

    # Convert features to a numpy array and reshape for training
    features = np.array(features)
    features = features.reshape(-1, 1)

    # Train the model using the extracted features and labels
    model = abstract_train(features, labels)

    return model
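

# Hedged note with hypothetical numbers: each BART feature is a single scalar
# per text pair, so the reshape above turns the flat score list into the
# (n_samples, 1) matrix scikit-learn expects, e.g.
#
#     np.array([0.3, 0.7]).reshape(-1, 1)  # -> [[0.3], [0.7]]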


def test_by_feature_kind(detector, samples, feature_type):
    """
    Tests a detector using features extracted from the provided samples based
        on the specified feature type.

    Args:
        detector (object): The detector model to be evaluated.
        samples (list): A list of samples where each sample is a dictionary
            with 'text' and 'label' keys.
        feature_type (str): The type of feature to extract for testing.

    Returns:
        None
    """
    # Extract features and labels from the samples
    features, labels = extract_by_feature_kind(samples, feature_type)

    # Convert features to a numpy array and reshape for evaluation
    features = np.array(features)
    features = features.reshape(-1, 1)

    # Evaluate the detector model using the extracted features and labels
    evaluate_model(detector, features, labels)


def general_process_multimodels_train_val_test(
    train_samples,
    val_samples,
    test_samples,
):
    """
    General process for training, validating, and testing models using
        multi-model and feature kind approaches.

    Args:
        train_samples (list): Training samples.
        val_samples (list): Validation samples.
        test_samples (list): Test samples.

    Returns:
        None
    """
    # Multi-model approach
    input_kind = MULTIMODEL
    write_to_file(OUTPUT_FILE, "\nInput kind = {input_kind} \n")

    # Train detector using multi-model with early stopping
    detector = train_by_transformer_with_multimodel_and_early_stop(
        train_samples,
        val_samples,
        input_kind,
    )

    # Evaluate on train set
    write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n")
    test_by_transformer_with_multimodel(detector, train_samples, input_kind)

    # Evaluate on validation set
    write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n")
    test_by_transformer_with_multimodel(detector, val_samples, input_kind)

    # Evaluate on test set
    write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n")
    test_by_transformer_with_multimodel(detector, test_samples, input_kind)

    # Single from multi-model approach
    input_kind = SINGLE_FROM_MULTIMODEL
    write_to_file(OUTPUT_FILE, "\nInput kind = {input_kind} \n")

    # Train detector using single from multi-model with early stopping
    detector = train_by_transformer_with_multimodel_and_early_stop(
        train_samples,
        val_samples,
        input_kind,
    )

    # Evaluate on train set
    write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n")
    test_by_transformer_with_multimodel(detector, train_samples, input_kind)

    # Evaluate on validation set
    write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n")
    test_by_transformer_with_multimodel(detector, val_samples, input_kind)

    # Evaluate on test set
    write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n")
    test_by_transformer_with_multimodel(detector, test_samples, input_kind)

    # Feature kind approach
    sample_length = len(train_samples[0]["text"])
    if (
        sample_length == 2
    ):  # Check if the sample length is 2, indicating BART feature kind
        feature_kind = BART
        write_to_file(OUTPUT_FILE, "\nFeature kind = {feature_kind} \n")

        # Train detector using feature kind
        detector = train_by_feature_kind(train_samples, feature_kind)

        # Evaluate on train set
        write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n")
        test_by_feature_kind(detector, train_samples, feature_kind)

        # Evaluate on validation set
        write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n")
        test_by_feature_kind(detector, val_samples, feature_kind)

        # Evaluate on test set
        write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n")
        test_by_feature_kind(detector, test_samples, feature_kind)


def process_multi_models_with_validation(
    multimodel_csv_file,
    train_indices,
    test_indices,
    num_samples,
):
    """
    Processes multi-model data with validation, training, and testing.

    Args:
        multimodel_csv_file (str): Path to the CSV file containing
            multi-model data.
        train_indices (list): Indices for the training data.
        test_indices (list): Indices for the testing data.
        num_samples (int): Number of samples to process.

    Returns:
        None
    """
    # Log the details of the process
    write_to_file(OUTPUT_FILE, f"PROCESSING FILE={multimodel_csv_file} \n")
    write_to_file(OUTPUT_FILE, f"EXPERIMENT WITH {MODEL_NAME} model \n")
    write_to_file(
        OUTPUT_FILE,
        "NUMBER OF MAX EPOCHS WITH EARLY STOPPING = "
        f"{NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING} \n",
    )
    write_to_file(OUTPUT_FILE, f"PATIENCE = {PATIENCE} \n")
    write_to_file(OUTPUT_FILE, f"OPTIMIZED METRIC = {OPTIMIZED_METRIC} \n")
    write_to_file(OUTPUT_FILE, f"BATCH SIZE = {BATCH_SIZE} \n")
    write_to_file(OUTPUT_FILE, f"Number of samples = {num_samples} \n")

    # Read multi-model data from the CSV file
    data = parse_multimodal_data(multimodel_csv_file)

    # Limit data to the specified number of samples
    data = data[:num_samples]

    # Distribute data into training, validation, and testing sets
    train_samples, val_samples, test_samples = distribute_data(
        data,
        train_indices,
        test_indices,
        TRAIN_RATIO,
        VAL_RATIO,
    )

    # Log the training and testing indices
    write_to_file(
        OUTPUT_FILE,
        f"Multimodel training with train indices {train_indices}, "
        f"test with test indices {test_indices} \n",
    )

    # Process the multi-models for training, validation, and testing
    general_process_multimodels_train_val_test(
        train_samples,
        val_samples,
        test_samples,
    )


def split_train_val_test(data, train_ratio, val_ratio):
    """
    Splits the dataset into training, validation, and test sets based on
        specified ratios.

    Args:
        data (list): The dataset to be split.
        train_ratio (float): The ratio of the dataset to be used for training.
        val_ratio (float): The ratio of the dataset to be used for validation.

    Returns:
        tuple: A tuple containing three lists
            (train_data, val_data, test_data).
    """
    # Calculate the number of samples for the training set
    num_train_samples = int(len(data) * train_ratio)

    # Calculate the number of samples for the validation set
    num_val_samples = int(len(data) * val_ratio)

    # Split the data into training, validation, and test sets
    train_data = data[:num_train_samples]
    val_data = data[num_train_samples : (num_train_samples + num_val_samples)]
    test_data = data[(num_train_samples + num_val_samples) :]

    return train_data, val_data, test_data
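

def _demo_split_ratios():
    """Worked example with hypothetical numbers.

    A 10-item list split with train_ratio=0.8 and val_ratio=0.1 yields an
    8/1/1 train/val/test partition.
    """
    data = list(range(10))
    train_data, val_data, test_data = split_train_val_test(data, 0.8, 0.1)
    assert (len(train_data), len(val_data), len(test_data)) == (8, 1, 1)
    return train_data, val_data, test_data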