|
@startuml
' Class diagram: trainers, optimizers, configuration and MLflow helpers.
!theme plain

' Global rendering options.
' classAttributeIconSize 0 shows the textual +/-/# visibility markers
' instead of the coloured icons.
skinparam classAttributeIconSize 0
skinparam defaultFontName Arial

' Colours for concrete classes.
skinparam class {
  BackgroundColor PaleTurquoise
  BorderColor DarkSlateGray
}

' Colours intended for abstract classes.
' NOTE(review): confirm that the PlantUML version in use recognises the
' `abstractClass` scope.
skinparam abstractClass {
  BackgroundColor LightYellow
  BorderColor DarkSlateGray
}

' Colours intended for interfaces.
' NOTE(review): confirm that this scope applies to class-diagram interfaces.
skinparam interface {
  BackgroundColor White
  BorderColor Black
}

' ----------- Type ParamGrid -----------
' ParamGrid is the type used for the `param_grid` argument of the optimizers.
' It declares no members; its expected structure is described by note N1.
class ParamGrid

' The note is declared outside the class body: inside the braces PlantUML
' reads every line as a member, so the note would never be drawn as a note.
note as N1
  Exemples de clés et valeurs :
  - "C": [0.1, 1, 10, 100]
  - "kernel": ["linear", "rbf"]
  - "gamma":
      low: 0.001
      high: 0.1
      log: true

  Ce type sert à décrire la structure attendue pour le param_grid
  dans les Optimizers.
end note
ParamGrid .. N1

' ----------- Interfaces -----------
' Contracts the trainers depend on: metrics computation, text vectorization
' and hyperparameter search.
package "Interfaces" {

  ' Computes and logs metrics for binary and multiclass predictions.
  interface MetricsCalculator {
    + calculate_and_log(y_true: cp.ndarray, y_pred: cp.ndarray, prefix: str): dict
    + calculate_and_log_multiclass(y_true: cp.ndarray, y_pred: cp.ndarray, prefix: str): dict
  }

  ' Notes are attached from outside the body; inside the braces they would
  ' be parsed as member lines.
  note bottom of MetricsCalculator
    Génère toujours : accuracy, f1, precision, recall, auc-roc
  end note

  ' Turns a text series into a dense (cp.ndarray) or sparse (csr_matrix) matrix.
  interface Vectorizer {
    + fit_transform(texts: cudf.Series): cp.ndarray | csr_matrix
    + transform(texts: cudf.Series): cp.ndarray | csr_matrix
  }

  ' Searches the given ParamGrid for a trainer and returns a dict.
  interface HyperparameterOptimizer {
    + optimize(trainer: BaseTrainer, param_grid: ParamGrid): dict
  }

  note bottom of HyperparameterOptimizer
    Renvoie un dict correspondant \nà la best config trouvée
    ex: {"C": 1, "kernel": "linear", "gamma": 0.01}
  end note
}

' ----------- Base Classes -----------
' BaseTrainer is the abstract root of all trainers. CuMLTrainer extends it
' with a vectorizer and a cuML classifier.
package "Base Classes" {

  abstract class BaseTrainer {
    - config: Config
    - classifier: object
    - metrics_calculator: MetricsCalculator
    --
    + __init__(config: Config, data_path: str, target_column: str)
    + build_components(): void
    + train(): void
    + evaluate(): void
    + log_parameters_to_mlflow(): void
    + optimize_if_needed(): void
    ' Private helpers: the visibility marker and the signature must share a
    ' line, otherwise the lone "-" is drawn as a member of its own.
    - _prepare_input_for_fit(X: cp.ndarray | csr_matrix): cp.ndarray | csr_matrix
    - _prepare_input_for_predict(X: cp.ndarray | csr_matrix): cp.ndarray | csr_matrix
    - _get_binary_predictions(X: cp.ndarray): cp.ndarray
    - _get_positive_probabilities(X: cp.ndarray): cp.ndarray | None
    - _get_label_dtype(): cp.dtype
  }

  ' Attached from outside the class body so it renders as a note, not as
  ' member text.
  note right of BaseTrainer
    log_parameters_to_mlflow():
    appelle la fonction singledispatch
    get_relevant_params_for_logging(self).

    optimize_if_needed():
    Vérifie dans self.config \nsi un optimizer est défini, \npuis appelle optimize() \nsur ce dernier si besoin.
  end note

  ' Overrides the input-preparation helpers with a narrower return type
  ' (cp.ndarray only).
  abstract class CuMLTrainer extends BaseTrainer {
    - vectorizer: Vectorizer
    - classifier: cuML.Base
    --
    + build_components(): void
    + train(): void
    + evaluate(): void
    - _prepare_input_for_fit(X: cp.ndarray | csr_matrix): cp.ndarray
    - _prepare_input_for_predict(X: cp.ndarray | csr_matrix): cp.ndarray
  }
}

' ----------- Concrete Trainers (cuML) -----------
' One CuMLTrainer subclass per model. Each narrows the `classifier` field
' type and declares _build_classifier().
package "Concrete Trainers (cuML)" {

  class SvmTrainer extends CuMLTrainer {
    - classifier: SVC
    --
    + _build_classifier(): void
  }

  ' Declared outside the class body so it renders as a note rather than
  ' as member text.
  note bottom of SvmTrainer
    SvmTrainer est affecté \npar les paramètres C, kernel, \net gamma (pour RBF).
  end note

  class RandomForestTrainer extends CuMLTrainer {
    - classifier: RandomForestClassifier
    --
    + _build_classifier(): void
  }

  class LogisticRegressionTrainer extends CuMLTrainer {
    - classifier: LogisticRegression
    --
    + _build_classifier(): void
  }

  class LinearRegressionTrainer extends CuMLTrainer {
    - classifier: LinearRegression
    --
    + _build_classifier(): void
  }
}

' ----------- Concrete Trainers (Hugging Face) -----------
' Trainer that extends BaseTrainer directly and holds a tokenizer, a
' sequence-classification model and a Hugging Face Trainer.
package "Concrete Trainers (Hugging Face)" {

  class HuggingFaceTransformerTrainer extends BaseTrainer {
    - tokenizer: AutoTokenizer
    - model: AutoModelForSequenceClassification
    - hf_trainer: Trainer
    --
    + build_components(): void
    + train(): void
    + evaluate(): void
    ' Visibility marker and signature share one line; a lone "-" would be
    ' drawn as a member of its own.
    - _create_torch_dataset(texts: cudf.Series, labels: cp.ndarray): torch.utils.data.Dataset
    - _prepare_training_args(): TrainingArguments
  }

  ' Declared outside the class body so it renders as a note rather than
  ' as member text.
  note right of HuggingFaceTransformerTrainer
    Ce trainer n'utilise pas
    la config vectorizer
  end note
}

' ----------- Hyperparameter Optimizers -----------
' Two implementations of the HyperparameterOptimizer interface. Both expose
' the same optimize(trainer, param_grid) signature.
package "Hyperparameter Optimizers" {

  class OptunaOptimizer {
    - study: optuna.study.Study
    - objective: function
    --
    + optimize(trainer: BaseTrainer, param_grid: ParamGrid): dict
  }

  ' Notes are attached from outside the class bodies; inside the braces
  ' they would be parsed as member lines.
  note bottom of OptunaOptimizer
    Implementation:
    1) Crée/récupère un study Optuna.
    2) Définit l'objective (fonction de coût).
    Ex: Utilise param_grid["C"] \npour suggérer \ntrial.suggest_float("C",...)
    3) Applique les hyperparams au trainer \n(e.g. trainer.classifier = SVC(**params)).
    4) study.optimize(..., n_trials=...)
    5) Retourne la meilleure config sous forme d'un dict
  end note

  class RayTuneOptimizer {
    - param_space: dict
    - search_alg: object
    - scheduler: object
    --
    + optimize(trainer: BaseTrainer, param_grid: ParamGrid): dict
  }

  note bottom of RayTuneOptimizer
    Implementation:
    1) Convertit param_grid \nen param_space pour Ray Tune.
    (ex: "C": tune.grid_search([...]))
    2) Lance tune.run(...).
    3) Utilise search_alg/scheduler.
    4) Retourne la meilleure config \nsous forme d'un dict
  end note

  ' Interface realization uses the dashed line with a hollow triangle
  ' (..|>); the plain dependency arrow (..>) only expresses "uses".
  OptunaOptimizer ..|> HyperparameterOptimizer : «implements»
  RayTuneOptimizer ..|> HyperparameterOptimizer : «implements»
}

' ----------- MLflow Integration -----------
' Callable decorator object: __call__ takes a function and returns a
' function. It also declares helpers for starting/ending a run and for
' logging params, metrics and artifacts.
package "MLflow Integration" {

  class MLflowDecorator {
    - experiment_name: str
    - tracking_uri: str
    --
    + __init__(experiment_name: str, tracking_uri: str): void
    + __call__(func: function): function
    + _start_run(): void
    + _log_params(params: dict): void
    + _log_metrics(metrics: dict): void
    + _log_artifacts(artifacts: dict): void
    + _end_run(status: str): void
  }
}

' ----------- Utilities -----------
' Wrapper that bundles a vectorizer with a classifier and exposes
' load_context() / predict().
' NOTE(review): the method names suggest the mlflow.pyfunc model interface;
' confirm against the implementation.
package "Utilities" {

  class CuMLPyFuncWrapper {
    - vectorizer: Vectorizer
    - classifier: object
    --
    + load_context(context): void
    + predict(context, model_input: pd.DataFrame): np.ndarray
  }
}

' ----------- Configuration -----------
' Pydantic configuration models. Config aggregates one sub-config per
' concern: model, vectorization, data, hyperparameters.
package "Configuration" {

  class Config <<PydanticModel>> {
    + model: ModelConfig
    + vectorization: VectorizationConfig
    + data: DataConfig
    + hyperparameters: HyperparameterConfig
  }

  class ModelConfig <<PydanticModel>> {
    + type: str
    + params: dict
  }

  class VectorizationConfig <<PydanticModel>> {
    + method: str
    + tfidf: dict
    + bow: dict
  }

  class DataConfig <<PydanticModel>> {
    + path: str
    + target_column: str
  }

  class HyperparameterConfig <<PydanticModel>> {
    + optimizer: str
    + param_grid: dict
    + n_trials: int
  }

  ' Attached from outside the class body so it renders as a note. Braces
  ' need no backslash escape here because the text is no longer parsed as
  ' class members.
  note bottom of HyperparameterConfig
    Exemple de param_grid pour SVM:
    {
      "C": [0.1, 1, 10, 100],
      "kernel": ["linear", "rbf"],
      "gamma": {
        "low": 0.001,
        "high": 0.1,
        "log": true
      }
    }
    n_trials: 50
  end note

  ' Config holds each sub-config as a field, so the relation is
  ' composition (*--). The previous <|-- arrow declared the sub-configs as
  ' subclasses of Config.
  Config *-- ModelConfig
  Config *-- VectorizationConfig
  Config *-- DataConfig
  Config *-- HyperparameterConfig

  ' YAML excerpt matching the HyperparameterConfig fields. The "===="
  ' line is a Creole horizontal separator.
  note left of Config
    Extrait YAML:
    hyperparameters:
      optimizer: "optuna"
      param_grid:
        C: [0.1, 1, 10, 100]
        kernel:
          - "linear"
          - "rbf"
        gamma:
          low: 0.001
          high: 0.1
          log: true
      n_trials: 50
    ====
    Hydra -> DictConfig -> Config(Pydantic)
  end note
}

' ----------- singledispatch function -----------
' Module-level function modelled as an object. The note sketches a base
' implementation for BaseTrainer plus overloads registered per trainer type.
package "Parameter Logging (singledispatch)" {

  object get_relevant_params_for_logging <<Function>>

  ' The target is named explicitly so the note does not depend on being
  ' placed right after the object declaration.
  note bottom of get_relevant_params_for_logging
    @singledispatch
    def get_relevant_params_for_logging(trainer: BaseTrainer) -> dict:
        ...

    @get_relevant_params_for_logging.register
    def _(trainer: HuggingFaceTransformerTrainer) -> dict:
        ...

    @get_relevant_params_for_logging.register
    def _(trainer: SvmTrainer) -> dict:
        ...

    etc.
  end note
}

' ----------- Relations -----------
' Cross-package dependencies (dashed "uses" arrows).

' What every trainer relies on through BaseTrainer.
BaseTrainer ..> MetricsCalculator : «uses»
BaseTrainer ..> HyperparameterOptimizer : «may use»
BaseTrainer ..> MLflowDecorator : «may be decorated by»
BaseTrainer ..> get_relevant_params_for_logging : «calls singledispatch function»

' cuML-specific dependencies.
CuMLTrainer ..> cuML.Base : «uses»
CuMLTrainer ..> CuMLPyFuncWrapper : «for saving model»

' Hugging Face types that are referenced only here, not declared as
' classes elsewhere in this diagram.
HuggingFaceTransformerTrainer ..> AutoTokenizer : «uses»
HuggingFaceTransformerTrainer ..> AutoModelForSequenceClassification : «uses»
HuggingFaceTransformerTrainer ..> Trainer : «uses»
HuggingFaceTransformerTrainer ..> TrainingArguments : «uses»

MLflowDecorator ..> mlflow : «uses»

@enduml