import argparse

from texts.config import CHATGPT
from texts.models import process_multi_models_with_validation
from texts.proofreading import generate_new_data_with_best_similarity
from texts.utils import generate_file_name


def main():
    """
    Main function to handle argument parsing and execute the sequence of
        operations including data generation and processing with multiple
        models.
    """
    parser = argparse.ArgumentParser(description="SimLLM.")

    # Argument for specifying the list of large language models
    parser.add_argument(
        "--LLMs",
        nargs="+",
        default=[CHATGPT, "Yi", "OpenChat"],
        help="List of large language models",
    )

    # Argument for specifying the list of training indexes
    parser.add_argument(
        "--train_indexes",
        type=int,
        default=[0, 1, 2],
        nargs="+",
        help="List of training indexes",
    )

    # Argument for specifying the list of testing indexes
    parser.add_argument(
        "--test_indexes",
        type=int,
        default=[0],
        nargs="+",
        help="List of testing indexes",
    )

    # Argument for specifying the number of samples
    parser.add_argument(
        "--num_samples",
        type=int,
        default=5000,
        help="Number of samples",
    )

    # Parse the command-line arguments
    args = parser.parse_args()
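
    # With no flags given, the defaults declared above yield:
    #   args.LLMs          -> [CHATGPT, "Yi", "OpenChat"]
    #   args.train_indexes -> [0, 1, 2]
    #   args.test_indexes  -> [0]
    #   args.num_samples   -> 5000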

    # Static dataset parameters
    # dataset_name = "xsum"
    # column_name = "document"
    # num_samples = args.num_samples
    output_file = "data/human.csv"

    # Generate human data with shuffle
    # generate_human_with_shuffle(
    #     dataset_name,
    #     column_name,
    #     num_samples,
    #     output_file,
    # )

    # Existing data parameters
    existing_data_file = output_file
    existing_kinds = []

    # New kinds of models to generate data with
    new_kinds = args.LLMs

    # Generate new data with best similarity
    generate_new_data_with_best_similarity(
        existing_data_file,
        existing_kinds,
        new_kinds,
    )

    # Generate a filename for the multimodel CSV file
    multimodel_csv_file = generate_file_name(
        existing_data_file,
        existing_kinds,
        new_kinds,
    )

    # Number of samples to process (-1 means process all samples)
    num_samples_to_process = -1

    # Training and testing indexes from arguments
    training_indexes = args.train_indexes
    testing_indexes = args.test_indexes

    # Process multiple models with validation
    process_multi_models_with_validation(
        multimodel_csv_file,
        training_indexes,
        testing_indexes,
        num_samples_to_process,
    )


if __name__ == "__main__":
    main()
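
# Example invocations (a sketch, assuming this script is saved as main.py, the
# texts package is importable, and data/human.csv already exists; the model
# names passed to --LLMs are illustrative and must match what texts.config and
# texts.models expect):
#
#   python main.py
#   python main.py --LLMs ChatGPT Yi OpenChat --train_indexes 0 1 2 --test_indexes 0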