import argparse

from texts.config import CHATGPT
from texts.models import process_multi_models_with_validation
from texts.proofreading import generate_new_data_with_best_similarity
from texts.utils import generate_file_name
def main():
    """Run the SimLLM pipeline from the command line.

    Parses the CLI arguments, generates model-written data matched to an
    existing human dataset, then trains/evaluates across the selected
    model combinations via ``process_multi_models_with_validation``.
    """
    parser = argparse.ArgumentParser(description="SimLLM.")
    # Large language models whose generations are added to the dataset.
    parser.add_argument(
        "--LLMs",
        nargs="+",
        default=[CHATGPT, "Yi", "OpenChat"],
        help="List of large language models",
    )
    # Column indexes of the models used for training.
    parser.add_argument(
        "--train_indexes",
        type=int,
        default=[0, 1, 2],
        nargs="+",
        help="List of training indexes",
    )
    # Column indexes of the models used for testing.
    parser.add_argument(
        "--test_indexes",
        type=int,
        default=[0],
        nargs="+",
        help="List of testing indexes",
    )
    # NOTE(review): --num_samples is parsed but never used below (the
    # processing step passes a hard-coded -1 = "all samples"). Kept for
    # backward compatibility; confirm whether it should feed
    # num_samples_to_process or the (removed) human-data generation step.
    parser.add_argument(
        "--num_samples",
        type=int,
        default=5000,
        help="Number of samples",
    )
    args = parser.parse_args()

    # Pre-generated human-written texts; assumed to already exist on disk.
    output_file = "data/human.csv"

    # Start from the human data with no previously generated model kinds.
    existing_data_file = output_file
    existing_kinds = []
    new_kinds = args.LLMs

    # Generate texts from each new model, keeping the most similar outputs.
    generate_new_data_with_best_similarity(
        existing_data_file,
        existing_kinds,
        new_kinds,
    )

    # Derive the combined multi-model CSV path from the same inputs.
    multimodel_csv_file = generate_file_name(
        existing_data_file,
        existing_kinds,
        new_kinds,
    )

    # -1 means process every available sample.
    num_samples_to_process = -1
    process_multi_models_with_validation(
        multimodel_csv_file,
        args.train_indexes,
        args.test_indexes,
        num_samples_to_process,
    )
if __name__ == "__main__": | |
main() | |