import argparse

from texts.config import CHATGPT
from texts.models import process_multi_models_with_validation
from texts.proofreading import generate_new_data_with_best_similarity
from texts.utils import generate_file_name


def main():
    """
    Main function to handle argument parsing and execute the sequence of
    operations, including data generation and processing with multiple models.
    """
    parser = argparse.ArgumentParser(description="SimLLM.")

    # Argument for specifying the list of large language models
    parser.add_argument(
        "--LLMs",
        nargs="+",
        default=[CHATGPT, "Yi", "OpenChat"],
        help="List of large language models",
    )

    # Argument for specifying the list of training indexes
    parser.add_argument(
        "--train_indexes",
        type=int,
        default=[0, 1, 2],
        nargs="+",
        help="List of training indexes",
    )

    # Argument for specifying the list of testing indexes
    parser.add_argument(
        "--test_indexes",
        type=int,
        default=[0],
        nargs="+",
        help="List of testing indexes",
    )

    # Argument for specifying the number of samples
    parser.add_argument(
        "--num_samples",
        type=int,
        default=5000,
        help="Number of samples",
    )

    # Parse the command-line arguments
    args = parser.parse_args()

    # Static dataset parameters
    # dataset_name = "xsum"
    # column_name = "document"
    # num_samples = args.num_samples
    output_file = "data/human.csv"

    # Generate human data with shuffle
    # generate_human_with_shuffle(
    #     dataset_name,
    #     column_name,
    #     num_samples,
    #     output_file,
    # )

    # Existing data parameters
    existing_data_file = output_file
    existing_kinds = []

    # New kinds of models to generate data with
    new_kinds = args.LLMs

    # Generate new data with best similarity
    generate_new_data_with_best_similarity(
        existing_data_file,
        existing_kinds,
        new_kinds,
    )

    # Generate a filename for the multimodel CSV file
    multimodel_csv_file = generate_file_name(
        existing_data_file,
        existing_kinds,
        new_kinds,
    )

    # Number of samples to process (-1 means process all samples)
    num_samples_to_process = -1

    # Training and testing indexes from arguments
    training_indexes = args.train_indexes
    testing_indexes = args.test_indexes

    # Process multiple models with validation
    process_multi_models_with_validation(
        multimodel_csv_file,
        training_indexes,
        testing_indexes,
        num_samples_to_process,
    )


if __name__ == "__main__":
    main()
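# Example invocation (a sketch, assuming this script is saved as main.py at the
# repository root; the script name and the "ChatGPT" label for the CHATGPT
# constant are assumptions, while the flags and defaults come from the argparse
# definitions above):
#
#   python main.py --LLMs ChatGPT Yi OpenChat --train_indexes 0 1 2 \
#       --test_indexes 0 --num_samples 5000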