import os
import shutil
from copy import deepcopy
import numpy as np
from config import (
BART,
BATCH_SIZE,
HUMAN_LABEL,
LEARNING_RATES,
MACHINE_LABEL,
MODEL_NAME,
MULTIMODEL,
NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING,
OPTIMIZED_METRIC,
PATIENCE,
ROBERTA_MODEL_PATHS,
SINGLE_FROM_MULTIMODEL,
TRAIN_RATIO,
VAL_RATIO,
tokenizer,
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.neural_network import MLPClassifier
from transformers import (
AutoModelForSequenceClassification,
DataCollatorWithPadding,
EarlyStoppingCallback,
Trainer,
TrainerCallback,
TrainingArguments,
)
from texts.bart_score import (
bart_score_in_batch,
extract_feature_in_batch,
)
from texts.config import OUTPUT_FILE
from texts.evaluation import compute_metrics
from texts.utils import (
check_error,
combine_text_with_BERT_format,
parse_multimodal_data,
write_to_file,
)
class TextDetector:
def __init__(self) -> None:
self.model = None
self.multimodel = None
self.train_data = None
self.val_data = None
self.test_data = None
self.train_features = None
self.val_features = None
        self.test_features = None

    def text_analysis(self, text: str) -> float:
        """Placeholder scoring hook; always returns 0.0 for now."""
        score = 0.0
        return score
class CustomCallback(TrainerCallback):
"""
Custom callback to evaluate the training dataset at the end of each epoch.
"""
def __init__(self, trainer) -> None:
super().__init__()
self._trainer = trainer
def on_epoch_end(self, args, state, control, **kwargs):
"""
At the end of each epoch, evaluate the training dataset.
"""
if control.should_evaluate:
control_copy = deepcopy(control)
self._trainer.evaluate(
eval_dataset=self._trainer.train_dataset,
metric_key_prefix="train",
)
return control_copy
def abstract_train(features, labels):
"""
Trains a model using the given features and labels.
Args:
features (list): The input features for training.
labels (list): The target labels for training.
Returns:
object: The trained model.
"""
model = MLPClassifier()
model.fit(features, labels)
return model
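# Usage sketch (hypothetical toy data): abstract_train expects a 2-D
# array-like of features; elsewhere in this module a single BARTScore value
# per sample is reshaped to (-1, 1) before fitting.
#
#     clf = abstract_train([[0.1], [0.9], [0.2], [0.8]], [0, 1, 0, 1])
#     clf.predict([[0.85]])  # e.g. array([1])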
def evaluate_model(model, features, labels):
"""
Evaluates the model's performance using accuracy and ROC AUC scores.
Args:
model (object): The trained model to evaluate.
features (list): The input features for evaluation.
labels (list): The target labels for evaluation.
Returns:
None
"""
    # MLPClassifier.predict returns class labels, so rounding is effectively
    # a no-op kept for safety with regressor-style outputs.
    predictions = model.predict(features)
    rounded_predictions = [round(value) for value in predictions]
    accuracy = accuracy_score(labels, rounded_predictions)
    write_to_file(OUTPUT_FILE, f"Accuracy: {accuracy * 100.0:.1f}%\n")
    # Note: ROC AUC is computed on hard label predictions here; scores from
    # predict_proba would give a finer-grained curve.
    roc_auc = roc_auc_score(labels, rounded_predictions)
    write_to_file(OUTPUT_FILE, f"ROC AUC: {roc_auc * 100.0:.1f}%\n")
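# Usage sketch (hypothetical values): evaluates a trained classifier on
# held-out features; metrics are appended to OUTPUT_FILE, nothing returned.
#
#     evaluate_model(clf, [[0.15], [0.75]], [0, 1])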
def preprocess_function_multimodel(sample):
"""
Preprocesses a given sample for a multi-model setup by calculating
BART scores and formatting the text for BERT input.
Args:
sample (dict): A dictionary containing a key "text", which is a list of
lists of strings.
Returns:
dict: A dictionary containing tokenized and preprocessed text data.
"""
num_texts = len(sample["text"][0]) # Number of texts in each sub-sample
texts_grouped_by_index = [
[] for _ in range(num_texts)
] # Initialize empty lists for grouping texts by index
# Group texts by their index across sub-samples
for sub_sample in sample["text"]:
for i in range(num_texts):
texts_grouped_by_index[i].append(sub_sample[i])
# Calculate BART scores for each text pair (text[0] with text[i])
bart_scores = [
bart_score_in_batch(
texts_grouped_by_index[0],
texts_grouped_by_index[i],
)
for i in range(1, num_texts)
]
combined_texts = []
# Process each sub-sample for BERT input
for index, sub_sample in enumerate(sample["text"]):
text_array = [sub_sample[0]] # Start with the input text
score_generation_pairs = []
# Pair scores with their corresponding generations
for i in range(1, num_texts):
generation_text = sub_sample[i]
generation_score = bart_scores[i - 1][index]
score_generation_pairs.append((generation_score, generation_text))
        # Sort pairs by score in descending order (sort on the score only,
        # to avoid tie-breaking on the generation text)
        sorted_pairs = sorted(
            score_generation_pairs,
            key=lambda pair: pair[0],
            reverse=True,
        )
# Append sorted texts to text_array
for _, sorted_text in sorted_pairs:
text_array.append(sorted_text)
# Combine texts into a single BERT-formatted string
combined_text = combine_text_with_BERT_format(text_array)
combined_texts.append(combined_text)
# Tokenize the combined texts for BERT
return tokenizer(combined_texts, add_special_tokens=False, truncation=True)
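# Shape sketch (hypothetical strings): for sample = {"text": [[t0, g1, g2]]},
# BART scores are computed in batch for (t0, g1) and (t0, g2), the
# generations are re-ordered by descending score, and each row is collapsed
# into one BERT-formatted string before tokenization:
#
#     preprocess_function_multimodel({"text": [["input", "gen_a", "gen_b"]]})
#     # -> {"input_ids": [...], "attention_mask": [...]}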
def preprocess_function_single_from_multimodel(sample):
"""
Extracts the first text from each sub-sample in a multi-model sample and
tokenizes it.
Args:
sample (dict): A dictionary containing a key "text", which is a list of
lists of strings.
Returns:
dict: A dictionary containing tokenized text data.
"""
combined_texts = []
# Iterate through each sub-sample
for sub_sample in sample["text"]:
        # Take the first (input) text from the sub-sample and collect it
        input_text = sub_sample[0]
        combined_texts.append(input_text)
# Tokenize the combined texts
return tokenizer(combined_texts, truncation=True)
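# Sketch: only the first (input) text of each row survives, so
# {"text": [["input", "gen_a", "gen_b"]]} tokenizes just "input".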
def train_only_by_transformer_with_test_evaluation_early_stop(
train_data,
test_data,
input_type,
num_classes=2,
):
"""
Trains a transformer model using the provided training and testing
datasets with early stopping.
Args:
train_data (Dataset): The training dataset.
test_data (Dataset): The testing dataset.
input_type (str): The type of input data, either MULTIMODEL or
SINGLE_FROM_MULTIMODEL.
num_classes (int, optional): The number of classes for classification.
Defaults to 2.
Returns:
Trainer: The trained model wrapped in a Trainer object.
"""
# Preprocess datasets based on the input type
if input_type == MULTIMODEL:
train_data = train_data.map(
preprocess_function_multimodel,
batched=True,
)
test_data = test_data.map(preprocess_function_multimodel, batched=True)
elif input_type == SINGLE_FROM_MULTIMODEL:
train_data = train_data.map(
preprocess_function_single_from_multimodel,
batched=True,
)
test_data = test_data.map(
preprocess_function_single_from_multimodel,
batched=True,
)
# Data collator to pad inputs
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Load appropriate model based on number of classes
if num_classes == 3:
model = AutoModelForSequenceClassification.from_pretrained(
"pretrained_model/roberta-base_num_labels_3",
num_labels=num_classes,
)
else:
model = AutoModelForSequenceClassification.from_pretrained(
ROBERTA_MODEL_PATHS[MODEL_NAME],
num_labels=num_classes,
)
learning_rate = LEARNING_RATES[MODEL_NAME]
output_folder = "training_with_callbacks"
# Remove the output folder if it already exists
if os.path.exists(output_folder):
shutil.rmtree(output_folder)
# Training arguments
training_args = TrainingArguments(
output_dir=output_folder,
evaluation_strategy="epoch",
logging_strategy="epoch",
save_strategy="epoch",
learning_rate=learning_rate,
per_device_train_batch_size=BATCH_SIZE,
per_device_eval_batch_size=BATCH_SIZE,
num_train_epochs=NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING,
weight_decay=0.01,
push_to_hub=False,
metric_for_best_model=OPTIMIZED_METRIC,
load_best_model_at_end=True,
)
# Create Trainer object
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_data,
eval_dataset=test_data,
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)],
)
# Add custom callback
trainer.add_callback(CustomCallback(trainer))
# Start training
trainer.train()
return trainer
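# Usage sketch (assuming train_data/val_data are Hugging Face Datasets with
# "text" and "label" columns, as produced by
# convert_to_huggingface_with_multimodel below):
#
#     trainer = train_only_by_transformer_with_test_evaluation_early_stop(
#         train_data,
#         val_data,
#         MULTIMODEL,
#     )
#     trainer.evaluate()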
def create_pair_sample(data_item, training_indices):
"""
Creates pair samples for training by comparing human data with
machine-generated data.
Args:
data_item (dict): A dictionary containing 'human', 'single',
and 'pair' data.
training_indices (list): A list of indices used for training.
Returns:
list: A list of dictionaries, each containing a 'text' array
and a 'label'.
"""
# Initialize the result list
result_samples = []
# Check if there is any error in the data_item
if check_error(data_item):
return result_samples
# Create machine samples
for train_idx in training_indices:
if data_item["human"] != data_item["single"][train_idx]:
text_array = []
machine_text = data_item["single"][train_idx]
text_array.append(machine_text)
for sub_idx in training_indices:
text_array.append(data_item["pair"][train_idx][sub_idx])
sample = {
"text": text_array,
"label": MACHINE_LABEL,
}
result_samples.append(sample)
# Create human samples
text_array = [data_item["human"]]
for train_idx in training_indices:
text_array.append(data_item["single"][train_idx])
human_sample = {
"text": text_array,
"label": HUMAN_LABEL,
}
# Append human samples for each machine sample
num_machine_samples = len(result_samples)
for _ in range(num_machine_samples):
result_samples.append(human_sample)
return result_samples
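# Data-shape sketch (hypothetical strings): with training_indices = [0, 1]
# and
#
#     data_item = {
#         "human": "h",
#         "single": ["m0", "m1"],
#         "pair": [["p00", "p01"],
#                  ["p10", "p11"]],   # pair[i][j] as indexed by the code
#     }
#
# each machine sample is ["m_i", pair[i][0], pair[i][1]] with MACHINE_LABEL,
# and the human sample ["h", "m0", "m1"] with HUMAN_LABEL is appended once
# per machine sample to keep the two classes balanced.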
def create_pair_test_sample(data_item, training_indices, testing_indices):
"""
Creates pair test samples by comparing human data with
machine-generated data.
Args:
data_item (dict): A dictionary containing 'human', 'single', and
'pair' data.
training_indices (list): A list of indices used for training.
testing_indices (list): A list of indices used for testing.
Returns:
list: A list of dictionaries, each containing a 'text' array and a
'label'.
"""
# Initialize the result list
result_samples = []
# Check if there is any error in the data_item
if check_error(data_item):
return result_samples
# Create machine samples based on testing indices
for test_idx in testing_indices:
if data_item["human"] != data_item["single"][test_idx]:
text_array = []
machine_text = data_item["single"][test_idx]
text_array.append(machine_text)
for train_idx in training_indices:
text_array.append(data_item["pair"][test_idx][train_idx])
sample = {
"text": text_array,
"label": MACHINE_LABEL,
}
result_samples.append(sample)
# Create human sample
text_array = [data_item["human"]]
for train_idx in training_indices:
text_array.append(data_item["single"][train_idx])
human_sample = {
"text": text_array,
"label": HUMAN_LABEL,
}
# Append the human sample for each machine sample
num_machine_samples = len(result_samples)
for _ in range(num_machine_samples):
result_samples.append(human_sample)
return result_samples
def create_train_val_sample(data, training_indices):
"""
Creates training and validation samples from the provided data.
Args:
data (list): A list of data items, each to be processed.
training_indices (list): A list of indices used for training.
Returns:
list: A list of training and validation samples created from the data.
"""
# Initialize the result list
result_samples = []
# Process each item in the data
for data_item in data:
# Create pair samples for the current item
sub_samples = create_pair_sample(data_item, training_indices)
# Extend the result list with the created sub-samples
result_samples.extend(sub_samples)
return result_samples
def create_test_sample(data, training_indices, testing_indices):
"""
Creates test samples from the provided data by comparing human data with
machine-generated data.
Args:
data (list): A list of data items, each to be processed.
training_indices (list): A list of indices used for training.
testing_indices (list): A list of indices used for testing.
Returns:
list: A list of test samples created from the data.
"""
# Initialize the result list
result_samples = []
# Process each item in the data
for data_item in data:
# Create pair test samples for the current item
sub_samples = create_pair_test_sample(
data_item,
training_indices,
testing_indices,
)
# Extend the result list with the created sub-samples
result_samples.extend(sub_samples)
return result_samples
def distribute_data(data, train_indices, test_indices, train_ratio, val_ratio):
"""
Distributes the data into training, validation, and test samples.
Args:
data (list): A list of data items to be split and processed.
train_indices (list): A list of indices used for training.
test_indices (list): A list of indices used for testing.
train_ratio (float): The ratio of data to be used for training.
val_ratio (float): The ratio of data to be used for validation.
Returns:
tuple: A tuple containing lists of training, validation,
and test samples.
"""
# Split the data into training, validation, and test sets
train_data, val_data, test_data = split_train_val_test(
data,
train_ratio,
val_ratio,
)
# Create training samples
train_samples = create_train_val_sample(train_data, train_indices)
write_to_file(OUTPUT_FILE, f"train samples = {len(train_samples)}\n")
# Create validation samples
val_samples = create_train_val_sample(val_data, train_indices)
write_to_file(OUTPUT_FILE, f"val samples = {len(val_samples)}\n")
# Create test samples
test_samples = create_test_sample(test_data, train_indices, test_indices)
write_to_file(OUTPUT_FILE, f"test samples = {len(test_samples)}\n")
return train_samples, val_samples, test_samples
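# Usage sketch (hypothetical indices/ratios): with train_ratio=0.8 and
# val_ratio=0.1, 100 parsed rows split roughly 80/10/10 before pair
# expansion:
#
#     train_s, val_s, test_s = distribute_data(data, [0, 1], [2], 0.8, 0.1)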
def convert_to_huggingface_with_multimodel(samples):
"""
Converts a list of samples to the Hugging Face Dataset format.
Args:
samples (list): A list of samples to be converted.
Returns:
Dataset: A Hugging Face Dataset object created from the samples.
"""
return Dataset.from_list(samples)
def train_by_transformer_with_multimodel_and_early_stop(
train_samples,
val_samples,
input_type,
):
"""
Trains a transformer model with multimodal data and early stopping.
Args:
train_samples (list): A list of training samples.
val_samples (list): A list of validation samples.
input_type (str): The type of input data (e.g., multimodal).
Returns:
object: The trained model with early stopping.
"""
# Convert training and validation samples to Hugging Face Dataset format
train_data = convert_to_huggingface_with_multimodel(train_samples)
val_data = convert_to_huggingface_with_multimodel(val_samples)
# Train the model with early stopping and return the trained model
return train_only_by_transformer_with_test_evaluation_early_stop(
train_data,
val_data,
input_type,
)
def test_by_transformer_with_multimodel(detector, test_samples, input_type):
"""
Tests a trained transformer model with multimodal data.
Args:
detector (object): The trained model to be evaluated.
test_samples (list): A list of test samples.
input_type (str): The type of input data (e.g., multimodal).
Returns:
None
"""
# Convert test samples to Hugging Face Dataset format
test_data = convert_to_huggingface_with_multimodel(test_samples)
# Apply the appropriate preprocessing function based on the input type
if input_type == MULTIMODEL:
test_data = test_data.map(preprocess_function_multimodel, batched=True)
elif input_type == SINGLE_FROM_MULTIMODEL:
test_data = test_data.map(
preprocess_function_single_from_multimodel,
batched=True,
)
# Evaluate the model on the test data
result = detector.evaluate(eval_dataset=test_data)
# Extract and log the ROC AUC score
roc_auc = result["eval_roc_auc"]
    write_to_file(OUTPUT_FILE, f"roc_auc: {roc_auc * 100.0:.1f}%\n")
def extract_by_feature_kind(samples, feature_type):
"""
Extracts features from the given samples based on the specified feature
type.
Args:
samples (list): A list of samples where each sample is a dictionary
with 'text' and 'label' keys.
feature_type (str): The type of feature to extract.
Returns:
tuple: A tuple containing the extracted features and corresponding
labels.
"""
text_1_list = []
text_2_list = []
labels = []
for sample in samples:
text_1_list.append(sample["text"][0])
text_2_list.append(sample["text"][1])
labels.append(sample["label"])
# Extract features in batch based on the feature type
features = extract_feature_in_batch(text_1_list, text_2_list, feature_type)
return features, labels
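# Sketch: only the first two texts of each sample are consumed
# (sample["text"][0] and sample["text"][1]), so this path effectively
# assumes samples built with a single training index:
#
#     features, labels = extract_by_feature_kind(samples, BART)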
def train_by_feature_kind(train_samples, feature_type):
"""
Trains a model using features extracted from the training samples based on
the specified feature type.
Args:
train_samples (list): A list of training samples where each sample is
a dictionary with 'text' and 'label' keys.
feature_type (str): The type of feature to extract for training.
Returns:
object: The trained model.
"""
# Extract features and labels from the training samples
features, labels = extract_by_feature_kind(train_samples, feature_type)
# Convert features to a numpy array and reshape for training
features = np.array(features)
features = features.reshape(-1, 1)
# Train the model using the extracted features and labels
model = abstract_train(features, labels)
return model
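# Usage sketch: BART features are one scalar per sample, hence the reshape
# to (-1, 1) before fitting the MLP:
#
#     detector = train_by_feature_kind(train_samples, BART)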
def test_by_feature_kind(detector, samples, feature_type):
"""
Tests a detector using features extracted from the provided samples based
on the specified feature type.
Args:
detector (object): The detector model to be evaluated.
samples (list): A list of samples where each sample is a dictionary
with 'text' and 'label' keys.
feature_type (str): The type of feature to extract for testing.
Returns:
None
"""
# Extract features and labels from the samples
features, labels = extract_by_feature_kind(samples, feature_type)
# Convert features to a numpy array and reshape for evaluation
features = np.array(features)
features = features.reshape(-1, 1)
# Evaluate the detector model using the extracted features and labels
evaluate_model(detector, features, labels)
def general_process_multimodels_train_val_test(
train_samples,
val_samples,
test_samples,
):
"""
General process for training, validating, and testing models using
multi-model and feature kind approaches.
Args:
train_samples (list): Training samples.
val_samples (list): Validation samples.
test_samples (list): Test samples.
Returns:
None
"""
# Multi-model approach
input_kind = MULTIMODEL
    write_to_file(OUTPUT_FILE, f"\nInput kind = {input_kind} \n")
# Train detector using multi-model with early stopping
detector = train_by_transformer_with_multimodel_and_early_stop(
train_samples,
val_samples,
input_kind,
)
# Evaluate on train set
write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n")
test_by_transformer_with_multimodel(detector, train_samples, input_kind)
# Evaluate on validation set
write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n")
test_by_transformer_with_multimodel(detector, val_samples, input_kind)
# Evaluate on test set
write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n")
test_by_transformer_with_multimodel(detector, test_samples, input_kind)
# Single from multi-model approach
input_kind = SINGLE_FROM_MULTIMODEL
    write_to_file(OUTPUT_FILE, f"\nInput kind = {input_kind} \n")
# Train detector using single from multi-model with early stopping
detector = train_by_transformer_with_multimodel_and_early_stop(
train_samples,
val_samples,
input_kind,
)
# Evaluate on train set
write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n")
test_by_transformer_with_multimodel(detector, train_samples, input_kind)
# Evaluate on validation set
write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n")
test_by_transformer_with_multimodel(detector, val_samples, input_kind)
# Evaluate on test set
write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n")
test_by_transformer_with_multimodel(detector, test_samples, input_kind)
# Feature kind approach
sample_length = len(train_samples[0]["text"])
    # A sample length of 2 (input text + one generation) indicates the
    # BART-score feature kind applies.
    if sample_length == 2:
        feature_kind = BART
        write_to_file(OUTPUT_FILE, f"\nFeature kind = {feature_kind} \n")
# Train detector using feature kind
detector = train_by_feature_kind(train_samples, feature_kind)
# Evaluate on train set
write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n")
test_by_feature_kind(detector, train_samples, feature_kind)
# Evaluate on validation set
write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n")
test_by_feature_kind(detector, val_samples, feature_kind)
# Evaluate on test set
write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n")
test_by_feature_kind(detector, test_samples, feature_kind)
def process_multi_models_with_validation(
multimodel_csv_file,
train_indices,
test_indices,
num_samples,
):
"""
Processes multi-model data with validation, training, and testing.
Args:
multimodel_csv_file (str): Path to the CSV file containing
multi-model data.
train_indices (list): Indices for the training data.
test_indices (list): Indices for the testing data.
num_samples (int): Number of samples to process.
Returns:
None
"""
# Log the details of the process
write_to_file(OUTPUT_FILE, f"PROCESSING FILE={multimodel_csv_file} \n")
write_to_file(OUTPUT_FILE, f"EXPERIMENT WITH {MODEL_NAME} model \n")
    write_to_file(
        OUTPUT_FILE,
        f"NUMBER OF MAX EPOCHS WITH EARLY STOPPING = "
        f"{NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING} \n",
    )
write_to_file(OUTPUT_FILE, f"PATIENCE = {PATIENCE} \n")
write_to_file(OUTPUT_FILE, f"OPTIMIZED METRIC = {OPTIMIZED_METRIC} \n")
write_to_file(OUTPUT_FILE, f"BATCH SIZE = {BATCH_SIZE} \n")
write_to_file(OUTPUT_FILE, f"Number of samples = {num_samples} \n")
# Read multi-model data from the CSV file
data = parse_multimodal_data(multimodel_csv_file)
# Limit data to the specified number of samples
data = data[:num_samples]
# Distribute data into training, validation, and testing sets
train_samples, val_samples, test_samples = distribute_data(
data,
train_indices,
test_indices,
TRAIN_RATIO,
VAL_RATIO,
)
# Log the training and testing indices
    write_to_file(
        OUTPUT_FILE,
        f"Multimodel training with train indices {train_indices}, "
        f"test with test indices {test_indices} \n",
    )
# Process the multi-models for training, validation, and testing
general_process_multimodels_train_val_test(
train_samples,
val_samples,
test_samples,
)
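# Usage sketch (hypothetical path and indices):
#
#     process_multi_models_with_validation(
#         "data/multimodel.csv",   # hypothetical CSV path
#         train_indices=[0, 1],
#         test_indices=[2],
#         num_samples=1000,
#     )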
def split_train_val_test(data, train_ratio, val_ratio):
"""
Splits the dataset into training, validation, and test sets based on
specified ratios.
Args:
data (list): The dataset to be split.
train_ratio (float): The ratio of the dataset to be used for training.
val_ratio (float): The ratio of the dataset to be used for validation.
Returns:
tuple: A tuple containing three lists
(train_data, val_data, test_data).
"""
# Calculate the number of samples for the training set
num_train_samples = int(len(data) * train_ratio)
# Calculate the number of samples for the validation set
num_val_samples = int(len(data) * val_ratio)
# Split the data into training, validation, and test sets
train_data = data[:num_train_samples]
val_data = data[num_train_samples : (num_train_samples + num_val_samples)]
test_data = data[(num_train_samples + num_val_samples) :]
return train_data, val_data, test_data
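# Worked example: len(data) == 10 with train_ratio=0.8 and val_ratio=0.1
# gives num_train_samples=8 and num_val_samples=1, so the splits are
# data[:8], data[8:9], and data[9:].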