import argparse

from texts.config import CHATGPT
from texts.models import process_multi_models_with_validation
from texts.proofreading import generate_new_data_with_best_similarity
from texts.utils import generate_file_name
def main():
    """Run the SimLLM pipeline from the command line.

    Parses the CLI arguments, generates model-written data matched to an
    existing human dataset, then trains/evaluates across the selected
    model combinations via ``process_multi_models_with_validation``.
    """
    parser = argparse.ArgumentParser(description="SimLLM.")
    # Large language models whose generations are added to the dataset.
    parser.add_argument(
        "--LLMs",
        nargs="+",
        default=[CHATGPT, "Yi", "OpenChat"],
        help="List of large language models",
    )
    # Column indexes of the models used for training.
    parser.add_argument(
        "--train_indexes",
        type=int,
        default=[0, 1, 2],
        nargs="+",
        help="List of training indexes",
    )
    # Column indexes of the models used for testing.
    parser.add_argument(
        "--test_indexes",
        type=int,
        default=[0],
        nargs="+",
        help="List of testing indexes",
    )
    # NOTE(review): --num_samples is parsed but never used below (the
    # processing step passes a hard-coded -1 = "all samples"). Kept for
    # backward compatibility; confirm whether it should feed
    # num_samples_to_process or the (removed) human-data generation step.
    parser.add_argument(
        "--num_samples",
        type=int,
        default=5000,
        help="Number of samples",
    )
    args = parser.parse_args()

    # Pre-generated human-written texts; assumed to already exist on disk.
    output_file = "data/human.csv"

    # Start from the human data with no previously generated model kinds.
    existing_data_file = output_file
    existing_kinds = []
    new_kinds = args.LLMs

    # Generate texts from each new model, keeping the most similar outputs.
    generate_new_data_with_best_similarity(
        existing_data_file,
        existing_kinds,
        new_kinds,
    )

    # Derive the combined multi-model CSV path from the same inputs.
    multimodel_csv_file = generate_file_name(
        existing_data_file,
        existing_kinds,
        new_kinds,
    )

    # -1 means process every available sample.
    num_samples_to_process = -1
    process_multi_models_with_validation(
        multimodel_csv_file,
        args.train_indexes,
        args.test_indexes,
        num_samples_to_process,
    )
if __name__ == "__main__": | |
main() | |