import argparse

from texts.config import CHATGPT
from texts.models import process_multi_models_with_validation
from texts.proofreading import generate_new_data_with_best_similarity
from texts.utils import generate_file_name


def main():
    """
    Main function to handle argument parsing and execute the sequence of
        operations including data generation and processing with multiple
        models.
    """
    parser = argparse.ArgumentParser(description="SimLLM.")

    # Argument for specifying the list of large language models
    parser.add_argument(
        "--LLMs",
        nargs="+",
        default=[CHATGPT, "Yi", "OpenChat"],
        help="List of large language models",
    )

    # Argument for specifying the list of training indexes
    parser.add_argument(
        "--train_indexes",
        type=int,
        default=[0, 1, 2],
        nargs="+",
        help="List of training indexes",
    )

    # Argument for specifying the list of testing indexes
    parser.add_argument(
        "--test_indexes",
        type=int,
        default=[0],
        nargs="+",
        help="List of testing indexes",
    )

    # Argument for specifying the number of samples
    parser.add_argument(
        "--num_samples",
        type=int,
        default=5000,
        help="Number of samples",
    )

    # Parse the command-line arguments
    args = parser.parse_args()
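
    # With no flags given, the defaults declared above yield:
    #   args.LLMs          -> [CHATGPT, "Yi", "OpenChat"]
    #   args.train_indexes -> [0, 1, 2]
    #   args.test_indexes  -> [0]
    #   args.num_samples   -> 5000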

    # Static dataset parameters
    # dataset_name = "xsum"
    # column_name = "document"
    # num_samples = args.num_samples
    output_file = "data/human.csv"

    # Generate human data with shuffle
    # generate_human_with_shuffle(
    #     dataset_name,
    #     column_name,
    #     num_samples,
    #     output_file,
    # )

    # Existing data parameters
    existing_data_file = output_file
    existing_kinds = []

    # New kinds of models to generate data with
    new_kinds = args.LLMs

    # Generate new data with best similarity
    generate_new_data_with_best_similarity(
        existing_data_file,
        existing_kinds,
        new_kinds,
    )

    # Generate a filename for the multimodel CSV file
    multimodel_csv_file = generate_file_name(
        existing_data_file,
        existing_kinds,
        new_kinds,
    )

    # Number of samples to process (-1 means process all samples)
    num_samples_to_process = -1

    # Training and testing indexes from arguments
    training_indexes = args.train_indexes
    testing_indexes = args.test_indexes

    # Process multiple models with validation
    process_multi_models_with_validation(
        multimodel_csv_file,
        training_indexes,
        testing_indexes,
        num_samples_to_process,
    )


if __name__ == "__main__":
    main()
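
# Example invocations (a sketch, assuming this script is saved as main.py, the
# texts package is importable, and data/human.csv already exists; the model
# names passed to --LLMs are illustrative and must match what texts.config and
# texts.models expect):
#
#   python main.py
#   python main.py --LLMs ChatGPT Yi OpenChat --train_indexes 0 1 2 --test_indexes 0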