from typing import Dict, Optional

import cudf
import cupy as cp
import numpy as np
import torch
import torch.nn.functional as F

from cuml.metrics import accuracy_score, precision_recall_curve, roc_auc_score
from cuml.model_selection import train_test_split
from datasets import Dataset as HFDataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
)

from base_trainer import BaseTrainer
from config import Config


def compute_metrics(p: EvalPrediction) -> Dict[str, float]:
    """Compute evaluation metrics on GPU from a Trainer EvalPrediction.

    Assumes binary classification: the positive-class probability is read
    from column 1 of the softmax output.
    """
    logits = p.predictions

    labels_cp = cp.asarray(p.label_ids)
    preds_cp = cp.argmax(cp.asarray(logits), axis=1)

    # Softmax on CPU with torch, then move to the GPU via numpy:
    # cp.asarray cannot ingest a CPU torch tensor directly.
    probas_torch = F.softmax(torch.tensor(logits), dim=-1)
    probas_cp = cp.asarray(probas_torch.numpy())

    # Probability of the positive class (binary classification).
    proba_pos_class = probas_cp[:, 1]

    acc = accuracy_score(labels_cp, preds_cp)
    auc = roc_auc_score(labels_cp.astype(cp.int32), proba_pos_class.astype(cp.float32))

    precision, recall, thresholds = precision_recall_curve(
        labels_cp.astype(cp.int32), proba_pos_class.astype(cp.float32)
    )

    optimal_precision, optimal_recall, optimal_f1, optimal_threshold = calculate_optimal_f1(
        precision, recall, thresholds
    )

    metrics = {
        "accuracy": float(acc),
        "precision": float(optimal_precision),
        "recall": float(optimal_recall),
        "f1": float(optimal_f1),
        "optimal_threshold": float(optimal_threshold),
        "auc_roc": float(auc)
    }

    return metrics
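
# Minimal sketch of compute_metrics on hand-made inputs (values invented for
# illustration; real EvalPrediction objects come from the HF Trainer):
#
#   p = EvalPrediction(predictions=np.array([[2.0, -1.0], [-0.5, 1.5]]),
#                      label_ids=np.array([0, 1]))
#   compute_metrics(p)  # both predictions correct -> accuracy 1.0, auc_roc 1.0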


def calculate_optimal_f1(precision: cp.ndarray, recall: cp.ndarray, thresholds: cp.ndarray):
    """Compute the optimal F1 score from the precision/recall curve.

    Args:
        precision: Array of precision values at the different thresholds.
        recall: Array of recall values at the different thresholds.
        thresholds: Array of the corresponding thresholds.

    Returns:
        Tuple of (optimal precision, optimal recall, optimal F1 score,
        optimal threshold).
    """
    # precision_recall_curve returns one more precision/recall point than
    # thresholds; append 1.0 so all arrays align index for index.
    thresholds_with_one = cp.append(thresholds, cp.array([1.0]))

    # nan_to_num guards against 0/0 when precision + recall == 0.
    f1_scores = cp.nan_to_num(2 * (precision * recall) / (precision + recall))

    best_idx = cp.argmax(f1_scores)
    best_precision = float(precision[best_idx])
    best_recall = float(recall[best_idx])
    best_f1 = float(f1_scores[best_idx])

    best_threshold = float(thresholds_with_one[best_idx])

    return best_precision, best_recall, best_f1, best_threshold
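
# Illustrative alignment check (invented values): with
#   precision  = cp.array([0.5, 1.0, 1.0])
#   recall     = cp.array([1.0, 0.5, 0.0])
#   thresholds = cp.array([0.3, 0.7])
# F1 peaks first at index 0 (~0.667, argmax breaks the tie with index 1),
# so the function returns (0.5, 1.0, ~0.667, 0.3).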


class HuggingFaceTransformerTrainer(BaseTrainer):
    """Hugging Face-specific trainer built on a tokenizer, an
    AutoModelForSequenceClassification model and an HF Trainer.
    Per the UML, it does not depend on a cuML vectorizer.
    """

    def __init__(self, config: Config, data_path: str,
                 target_column: str) -> None:
        """Initialize a HuggingFaceTransformerTrainer with the configuration
        and the parameters of the BaseTrainer parent.

        :param config: Global system configuration.
                       (config.vectorizer is not used here.)
        :param data_path: Path to the data file.
        :param target_column: Name of the target column in the data.
        """
        super().__init__(config, data_path, target_column)
        self.tokenizer: Optional[AutoTokenizer] = None
        self.model: Optional[AutoModelForSequenceClassification] = None
        self.hf_trainer: Optional[Trainer] = None
        self.train_dataset: Optional[HFDataset] = None
        self.eval_dataset: Optional[HFDataset] = None
        self.test_dataset: Optional[HFDataset] = None

    def build_components(self) -> None:
        """Instantiate the Hugging Face tokenizer and the
        AutoModelForSequenceClassification model, then create a Trainer
        with default TrainingArguments.
        """
        model_name = self.config.model.params.get("model_name")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name)
        training_args = self._prepare_training_args()

        # The datasets are still None here; train() builds them and
        # reattaches them to the Trainer before training starts.
        self.hf_trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            compute_metrics=compute_metrics,
            tokenizer=self.tokenizer,
            callbacks=[]
        )
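
    # build_components() only reads "model_name" from the model params.
    # Config shape sketch (illustrative; the real schema lives in config.py):
    #
    #   model:
    #     params:
    #       model_name: distilbert-base-uncased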

    def train(self) -> None:
        """Train the Hugging Face model on the dataset."""
        df = cudf.read_csv(self.data_path)

        labels = cp.asarray(df[self.target_column].astype(int))

        # Concatenate every feature column into a single text column.
        features_df = df.drop(columns=[self.target_column]).astype(str)
        texts = features_df[features_df.columns[0]]
        for col in features_df.columns[1:]:
            texts = texts.str.cat(features_df[col], sep=' ')

        texts_for_storage = texts.copy()

        # Split on row indices so the cudf text series can be sliced
        # afterwards: 80% train, 10% validation, 10% test.
        indices = cp.arange(len(texts))

        train_indices, temp_indices, y_train, y_temp = train_test_split(
            indices, labels, test_size=0.2, random_state=42, stratify=labels
        )

        val_indices, test_indices, y_val, y_test = train_test_split(
            temp_indices, y_temp, test_size=0.5, random_state=42, stratify=y_temp
        )

        X_train_text = texts_for_storage.iloc[train_indices.get()]
        X_val_text = texts_for_storage.iloc[val_indices.get()]
        X_test_text = texts_for_storage.iloc[test_indices.get()]

        def create_hf_dataset(text_series: cudf.Series, label_array: cp.ndarray) -> HFDataset:
            # Move texts and labels back to host memory for the tokenizer.
            texts_list = text_series.to_arrow().to_pylist()
            labels_list = cp.asnumpy(label_array).tolist()

            encodings = self.tokenizer(texts_list, padding=True, truncation=True)

            data_dict = {
                "input_ids": encodings["input_ids"],
                "attention_mask": encodings["attention_mask"],
                "labels": labels_list
            }
            return HFDataset.from_dict(data_dict)

        self.train_dataset = create_hf_dataset(X_train_text, y_train)
        self.eval_dataset = create_hf_dataset(X_val_text, y_val)
        self.test_dataset = create_hf_dataset(X_test_text, y_test)

        # Reattach the freshly built datasets to the Trainer created in
        # build_components().
        self.hf_trainer.train_dataset = self.train_dataset
        self.hf_trainer.eval_dataset = self.eval_dataset

        print(f"Starting training with {len(self.train_dataset)} samples.")
        print(f"Validation during training with {len(self.eval_dataset)} samples.")
        print(f"Test set prepared with {len(self.test_dataset)} samples.")
        self.hf_trainer.train()
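
    # Split arithmetic sketch: with 1,000 rows, test_size=0.2 followed by
    # test_size=0.5 on the remainder yields 800 train / 100 validation /
    # 100 test samples, i.e. an 80/10/10 split.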

    def evaluate(self) -> dict:
        """Evaluate the Hugging Face model; the metric computation is
        partly delegated to the HF Trainer.

        :return: Dictionary of metrics computed on the test set.
        """
        if self.hf_trainer is None or self.test_dataset is None:
            raise ValueError("Trainer or test dataset not initialized. Run train() first.")

        print(f"Evaluating on the test set ({len(self.test_dataset)} samples)...")

        results = self.hf_trainer.predict(self.test_dataset)

        print("Evaluation results:", results.metrics)
        return results.metrics
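
    # Note: Trainer.predict() prefixes metric names with "test_" by default,
    # so the returned dict holds keys such as "test_accuracy" and "test_f1".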

    def _create_torch_dataset(self, texts: cudf.Series,
                              labels: cp.ndarray) -> torch.utils.data.Dataset:
        """Convert a cudf.Series of texts and a cupy array of labels
        into a PyTorch Dataset.

        :param texts: cudf Series containing the texts.
        :param labels: cupy vector of labels (e.g. binary or multiclass
                       classification).
        :return: A PyTorch Dataset usable by the Trainer.
        """
        raise NotImplementedError(
            "_create_torch_dataset is no longer used directly."
        )

    def _prepare_training_args(self) -> TrainingArguments:
        """Build a Hugging Face TrainingArguments object, e.g. to set
        the output_dir, the batch_size, etc.

        :return: Configured TrainingArguments instance.
        """
        params = self.config.model.params
        return TrainingArguments(
            output_dir="./results",
            num_train_epochs=float(params.get("epochs")),
            per_device_train_batch_size=int(params.get("batch_size")),
            per_device_eval_batch_size=int(params.get("batch_size")),
            learning_rate=float(params.get("learning_rate")),
            warmup_steps=int(params.get("warmup_steps")),
            weight_decay=float(params.get("weight_decay")),
            save_steps=50,
            logging_dir="./logs",
            logging_strategy="no",
            save_strategy="epoch",
            report_to="mlflow"
        )
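
    # _prepare_training_args() assumes params carries numeric entries named
    # epochs, batch_size, learning_rate, warmup_steps and weight_decay, e.g.
    # (illustrative values only):
    #
    #   {"model_name": "...", "epochs": 3, "batch_size": 16,
    #    "learning_rate": 2e-5, "warmup_steps": 100, "weight_decay": 0.01}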

    def optimize_if_needed(self) -> None:
        """Override BaseTrainer.optimize_if_needed to disable
        hyperparameter optimization for transformer models.
        """
        return
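

# Minimal end-to-end usage sketch. Config construction is project specific
# (the constructor call below is a placeholder); the call order mirrors how
# BaseTrainer subclasses are meant to be driven.
#
#   config = Config(...)  # however config.py builds it
#   trainer = HuggingFaceTransformerTrainer(config, "data.csv", "label")
#   trainer.build_components()
#   trainer.train()
#   metrics = trainer.evaluate()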