@startuml
!theme plain
skinparam classAttributeIconSize 0
skinparam defaultFontName Arial
skinparam class {
  BackgroundColor PaleTurquoise
  BorderColor DarkSlateGray
}
skinparam abstractClass {
  BackgroundColor LightYellow
  BorderColor DarkSlateGray
}
skinparam interface {
  BackgroundColor White
  BorderColor Black
}
' ----------- Type ParamGrid -----------
class ParamGrid
note as N1
  Example keys and values:
  - "C": [0.1, 1, 10, 100]
  - "kernel": ["linear", "rbf"]
  - "gamma":
      low: 0.001
      high: 0.1
      log: true
  This type describes the structure expected for the param_grid
  used by the Optimizers.
end note
ParamGrid .. N1
' ----------- Interfaces -----------
package "Interfaces" {
  interface MetricsCalculator {
    + calculate_and_log(y_true: cp.ndarray, y_pred: cp.ndarray, prefix: str): dict
    + calculate_and_log_multiclass(y_true: cp.ndarray, y_pred: cp.ndarray, prefix: str): dict
  }
  note bottom of MetricsCalculator
    Always produces: accuracy, f1, precision, recall, auc-roc
  end note
  interface Vectorizer {
    + fit_transform(texts: cudf.Series): cp.ndarray | csr_matrix
    + transform(texts: cudf.Series): cp.ndarray | csr_matrix
  }
  interface HyperparameterOptimizer {
    + optimize(trainer: BaseTrainer, param_grid: ParamGrid): dict
  }
  note bottom of HyperparameterOptimizer
    Returns a dict with the best configuration found,
    e.g. {"C": 1, "kernel": "linear", "gamma": 0.01}
  end note
}
' ----------- Base Classes -----------
package "Base Classes" {
  abstract class BaseTrainer {
    - config: Config
    - classifier: object
    - metrics_calculator: MetricsCalculator
    --
    + __init__(config: Config, data_path: str, target_column: str)
    + build_components(): void
    + train(): void
    + evaluate(): void
    + log_parameters_to_mlflow(): void
    + optimize_if_needed(): void
    - _prepare_input_for_fit(X: cp.ndarray | csr_matrix): cp.ndarray | csr_matrix
    - _prepare_input_for_predict(X: cp.ndarray | csr_matrix): cp.ndarray | csr_matrix
    - _get_binary_predictions(X: cp.ndarray): cp.ndarray
    - _get_positive_probabilities(X: cp.ndarray): cp.ndarray | None
    - _get_label_dtype(): cp.dtype
  }
  note right of BaseTrainer
    log_parameters_to_mlflow():
      calls the singledispatch function
      get_relevant_params_for_logging(self).
    optimize_if_needed():
      checks self.config for a configured optimizer,
      then calls optimize() on it if needed.
  end note
  abstract class CuMLTrainer extends BaseTrainer {
    - vectorizer: Vectorizer
    - classifier: cuML.Base
    --
    + build_components(): void
    + train(): void
    + evaluate(): void
    - _prepare_input_for_fit(X: cp.ndarray | csr_matrix): cp.ndarray
    - _prepare_input_for_predict(X: cp.ndarray | csr_matrix): cp.ndarray
  }
}
' ----------- Concrete Trainers (cuML) -----------
package "Concrete Trainers (cuML)" {
  class SvmTrainer extends CuMLTrainer {
    - classifier: SVC
    --
    + _build_classifier(): void
  }
  note bottom of SvmTrainer
    SvmTrainer is affected by the
    C, kernel, and gamma (for RBF) parameters.
  end note
  class RandomForestTrainer extends CuMLTrainer {
    - classifier: RandomForestClassifier
    --
    + _build_classifier(): void
  }
  class LogisticRegressionTrainer extends CuMLTrainer {
    - classifier: LogisticRegression
    --
    + _build_classifier(): void
  }
  class LinearRegressionTrainer extends CuMLTrainer {
    - classifier: LinearRegression
    --
    + _build_classifier(): void
  }
}
' ----------- Concrete Trainers (Hugging Face) -----------
package "Concrete Trainers (Hugging Face)" {
  class HuggingFaceTransformerTrainer extends BaseTrainer {
    - tokenizer: AutoTokenizer
    - model: AutoModelForSequenceClassification
    - hf_trainer: Trainer
    --
    + build_components(): void
    + train(): void
    + evaluate(): void
    - _create_torch_dataset(texts: cudf.Series, labels: cp.ndarray): torch.utils.data.Dataset
    - _prepare_training_args(): TrainingArguments
  }
  note right of HuggingFaceTransformerTrainer
    This trainer does not use
    the vectorizer config.
  end note
}
' ----------- Hyperparameter Optimizers -----------
package "Hyperparameter Optimizers" {
  class OptunaOptimizer {
    - study: optuna.study.Study
    - objective: function
    --
    + optimize(trainer: BaseTrainer, param_grid: ParamGrid): dict
  }
  note bottom of OptunaOptimizer
    Implementation:
    1) Creates or retrieves an Optuna study.
    2) Defines the objective (cost function),
       e.g. uses param_grid["C"] to call
       trial.suggest_float("C", ...).
    3) Applies the hyperparameters to the trainer
       (e.g. trainer.classifier = SVC(**params)).
    4) Runs study.optimize(..., n_trials=...).
    5) Returns the best configuration as a dict.
  end note
  class RayTuneOptimizer {
    - param_space: dict
    - search_alg: object
    - scheduler: object
    --
    + optimize(trainer: BaseTrainer, param_grid: ParamGrid): dict
  }
  note bottom of RayTuneOptimizer
    Implementation:
    1) Converts param_grid into a param_space for Ray Tune
       (e.g. "C": tune.grid_search([...])).
    2) Launches tune.run(...).
    3) Uses search_alg / scheduler.
    4) Returns the best configuration as a dict.
  end note
  OptunaOptimizer ..|> HyperparameterOptimizer
  RayTuneOptimizer ..|> HyperparameterOptimizer
}
' ----------- MLflow Integration -----------
package "MLflow Integration" {
  class MLflowDecorator {
    - experiment_name: str
    - tracking_uri: str
    --
    + __init__(experiment_name: str, tracking_uri: str): void
    + __call__(func: function): function
    + _start_run(): void
    + _log_params(params: dict): void
    + _log_metrics(metrics: dict): void
    + _log_artifacts(artifacts: dict): void
    + _end_run(status: str): void
  }
}
' ----------- Utilities -----------
package "Utilities" {
  class CuMLPyFuncWrapper {
    - vectorizer: Vectorizer
    - classifier: object
    --
    + load_context(context): void
    + predict(context, model_input: pd.DataFrame): np.ndarray
  }
}
' ----------- Configuration -----------
package "Configuration" {
  class Config <<PydanticModel>> {
    + model: ModelConfig
    + vectorization: VectorizationConfig
    + data: DataConfig
    + hyperparameters: HyperparameterConfig
  }
  class ModelConfig <<PydanticModel>> {
    + type: str
    + params: dict
  }
  class VectorizationConfig <<PydanticModel>> {
    + method: str
    + tfidf: dict
    + bow: dict
  }
  class DataConfig <<PydanticModel>> {
    + path: str
    + target_column: str
  }
  class HyperparameterConfig <<PydanticModel>> {
    + optimizer: str
    + param_grid: dict
    + n_trials: int
  }
  note bottom of HyperparameterConfig
    Example param_grid for an SVM:
    {
      "C": [0.1, 1, 10, 100],
      "kernel": ["linear", "rbf"],
      "gamma": {
        "low": 0.001,
        "high": 0.1,
        "log": true
      }
    }
    n_trials: 50
  end note
  Config *-- ModelConfig
  Config *-- VectorizationConfig
  Config *-- DataConfig
  Config *-- HyperparameterConfig
  note left of Config
    YAML excerpt:
    hyperparameters:
      optimizer: "optuna"
      param_grid:
        C: [0.1, 1, 10, 100]
        kernel:
          - "linear"
          - "rbf"
        gamma:
          low: 0.001
          high: 0.1
          log: true
      n_trials: 50
    ====
    Hydra -> DictConfig -> Config (Pydantic)
  end note
}
' ----------- singledispatch function -----------
package "Parameter Logging (singledispatch)" {
  object get_relevant_params_for_logging <<Function>>
  note bottom of get_relevant_params_for_logging
    @singledispatch
    def get_relevant_params_for_logging(trainer: BaseTrainer) -> dict:
        ...
    @get_relevant_params_for_logging.register
    def _(trainer: HuggingFaceTransformerTrainer) -> dict:
        ...
    @get_relevant_params_for_logging.register
    def _(trainer: SvmTrainer) -> dict:
        ...
    etc.
  end note
}
' ----------- Relations -----------
BaseTrainer ..> MetricsCalculator : «uses»
BaseTrainer ..> HyperparameterOptimizer : «may use»
BaseTrainer ..> MLflowDecorator : «may be decorated by»
BaseTrainer ..> get_relevant_params_for_logging : «calls singledispatch function»
CuMLTrainer ..> cuML.Base : «uses»
CuMLTrainer ..> CuMLPyFuncWrapper : «for saving model»
HuggingFaceTransformerTrainer ..> AutoTokenizer : «uses»
HuggingFaceTransformerTrainer ..> AutoModelForSequenceClassification : «uses»
HuggingFaceTransformerTrainer ..> Trainer : «uses»
HuggingFaceTransformerTrainer ..> TrainingArguments : «uses»
MLflowDecorator ..> mlflow : «uses»
@enduml
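
The Configuration note sketches a Hydra -> DictConfig -> Config (Pydantic) flow. The following is a minimal Python sketch of that conversion, assuming only the field names shown in the diagram; the actual project code may differ.

from omegaconf import DictConfig, OmegaConf
from pydantic import BaseModel


class ModelConfig(BaseModel):
    type: str
    params: dict = {}


class VectorizationConfig(BaseModel):
    method: str
    tfidf: dict = {}
    bow: dict = {}


class DataConfig(BaseModel):
    path: str
    target_column: str


class HyperparameterConfig(BaseModel):
    optimizer: str
    param_grid: dict = {}
    n_trials: int = 50


class Config(BaseModel):
    model: ModelConfig
    vectorization: VectorizationConfig
    data: DataConfig
    hyperparameters: HyperparameterConfig


def to_config(cfg: DictConfig) -> Config:
    # Resolve interpolations, convert the DictConfig to plain containers,
    # then let Pydantic validate the result against the schema above.
    return Config(**OmegaConf.to_container(cfg, resolve=True))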
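
The OptunaOptimizer note lists five steps. Below is a hedged sketch of how optimize() could map the ParamGrid structure onto Optuna's API; trainer.set_params(), trainer.train(), and the returned metrics dict are assumed hooks for illustration, not the project's actual trainer interface.

import optuna


class OptunaOptimizer:
    """Illustrative only: maps the ParamGrid structure onto Optuna suggestions."""

    def optimize(self, trainer, param_grid: dict) -> dict:
        n_trials = trainer.config.hyperparameters.n_trials  # per the Config classes in the diagram

        def objective(trial: optuna.Trial) -> float:
            params = {}
            for name, spec in param_grid.items():
                if isinstance(spec, list):
                    # e.g. "C": [0.1, 1, 10, 100] or "kernel": ["linear", "rbf"]
                    params[name] = trial.suggest_categorical(name, spec)
                elif isinstance(spec, dict):
                    # e.g. "gamma": {"low": 0.001, "high": 0.1, "log": true}
                    params[name] = trial.suggest_float(
                        name, spec["low"], spec["high"], log=spec.get("log", False)
                    )
            trainer.set_params(**params)     # hypothetical helper that rebuilds the classifier
            trainer.train()
            return trainer.evaluate()["f1"]  # hypothetical metrics dict; score to maximize

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=n_trials)
        return study.best_params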
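
The MetricsCalculator interface is specified only by the metrics it always produces. The sketch below stands in with scikit-learn metrics computed on CPU copies of the arrays; a real implementation would presumably use cuML's GPU metrics, and passing hard predictions to roc_auc_score is a simplification.

import cupy as cp
import mlflow
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)


class SklearnMetricsCalculator:
    """Illustrative stand-in for the MetricsCalculator interface."""

    def calculate_and_log(self, y_true: cp.ndarray, y_pred: cp.ndarray, prefix: str) -> dict:
        yt, yp = cp.asnumpy(y_true), cp.asnumpy(y_pred)
        metrics = {
            f"{prefix}_accuracy": accuracy_score(yt, yp),
            f"{prefix}_f1": f1_score(yt, yp),
            f"{prefix}_precision": precision_score(yt, yp),
            f"{prefix}_recall": recall_score(yt, yp),
            f"{prefix}_auc_roc": roc_auc_score(yt, yp),  # ideally probabilities, not hard labels
        }
        mlflow.log_metrics(metrics)  # logged to the active MLflow run
        return metrics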
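
MLflowDecorator is modeled as a callable class. Here is a minimal sketch of how it could wrap a training function in a single MLflow run; the convention that the wrapped function returns {"params": ..., "metrics": ...} is an assumption made for illustration.

import functools

import mlflow


class MLflowDecorator:
    """Illustrative only: wraps a training function in an MLflow run."""

    def __init__(self, experiment_name: str, tracking_uri: str):
        self.experiment_name = experiment_name
        self.tracking_uri = tracking_uri

    def __call__(self, func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            mlflow.set_tracking_uri(self.tracking_uri)
            mlflow.set_experiment(self.experiment_name)
            with mlflow.start_run():
                result = func(*args, **kwargs)
                # Assumed convention: the wrapped function returns
                # {"params": {...}, "metrics": {...}}.
                mlflow.log_params(result.get("params", {}))
                mlflow.log_metrics(result.get("metrics", {}))
            return result

        return wrapper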
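
CuMLPyFuncWrapper pairs a fitted Vectorizer with a cuML classifier so the model can be served through mlflow.pyfunc. A hedged sketch follows; the "text" input column name is an assumption.

import cupy as cp
import mlflow.pyfunc
import pandas as pd


class CuMLPyFuncWrapper(mlflow.pyfunc.PythonModel):
    """Illustrative only: serve a fitted Vectorizer + cuML classifier as a pyfunc model."""

    def __init__(self, vectorizer, classifier):
        self.vectorizer = vectorizer
        self.classifier = classifier

    def load_context(self, context):
        # If the fitted components were logged as separate artifacts instead of being
        # pickled with the wrapper, they could be reloaded here from context.artifacts.
        pass

    def predict(self, context, model_input: pd.DataFrame):
        import cudf  # GPU DataFrame library used by the vectorizers

        texts = cudf.Series(model_input["text"])    # "text" column name is an assumption
        features = self.vectorizer.transform(texts)  # Vectorizer interface from the diagram
        preds = self.classifier.predict(features)
        # cuML may return a CuPy array or a cuDF Series; normalize to NumPy for pyfunc.
        return cp.asnumpy(cp.asarray(preds))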
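
The singledispatch note only shows the skeleton. One way the registered overloads could look is sketched below; the import path and the attribute names read from each trainer are hypothetical.

from functools import singledispatch

# Hypothetical import path; the concrete trainer classes come from the project itself.
from trainers import HuggingFaceTransformerTrainer, SvmTrainer


@singledispatch
def get_relevant_params_for_logging(trainer) -> dict:
    # Generic fallback for any BaseTrainer subclass without a dedicated overload.
    return {"trainer_class": type(trainer).__name__}


@get_relevant_params_for_logging.register
def _(trainer: SvmTrainer) -> dict:
    clf = trainer.classifier
    # C / kernel / gamma mirror the SVM parameters mentioned in the diagram.
    return {"C": clf.C, "kernel": clf.kernel, "gamma": clf.gamma}


@get_relevant_params_for_logging.register
def _(trainer: HuggingFaceTransformerTrainer) -> dict:
    # Attribute names below are assumptions about the Hugging Face model object.
    return {
        "model_name": trainer.model.name_or_path,
        "num_labels": trainer.model.config.num_labels,
    }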