Spaces:
Sleeping
Sleeping
Commit
·
43d4438
1
Parent(s):
fa9c546
correction bug config
Browse files
- src/base_trainer.py +12 -12
- src/conf/config.yaml +1 -1
- src/config.py +17 -1
- src/cuml_trainer.py +11 -12
- src/interfaces/metrics_calculator.py +28 -3
- src/main.py +11 -11
- src/trainers/cuml/linear_regression_trainer.py +4 -4
- src/trainers/cuml/logistic_regression_trainer.py +4 -4
- src/trainers/cuml/random_forest_trainer.py +4 -4
- src/trainers/cuml/svm_trainer.py +4 -4
- src/trainers/huggingface/huggingface_transformer_trainer.py +2 -2
src/base_trainer.py
CHANGED
@@ -6,9 +6,11 @@ from abc import ABC, abstractmethod
|
|
6 |
from typing import Union, Optional
|
7 |
import cupy as cp
|
8 |
from scipy.sparse import csr_matrix
|
|
|
|
|
9 |
|
10 |
-
from config import Config
|
11 |
-
from interfaces.metrics_calculator import MetricsCalculator
|
12 |
|
13 |
|
14 |
class BaseTrainer(ABC):
|
@@ -73,34 +75,32 @@ class BaseTrainer(ABC):
|
|
73 |
logger = logging.getLogger(__name__)
|
74 |
|
75 |
# Vérifier si l'optimisation est configurée
|
76 |
-
if (self.config.hyperparameters.optimizer and
|
77 |
-
self.config.hyperparameters.param_grid and
|
78 |
self.config.hyperparameters.n_trials > 0):
|
79 |
-
|
80 |
logger.info("Démarrage de l'optimisation des hyperparamètres")
|
81 |
-
|
82 |
# Importation et instanciation de l'optimiseur
|
83 |
optimizer_type = self.config.hyperparameters.optimizer.lower()
|
84 |
if optimizer_type == "optuna":
|
85 |
-
from optimizers.optuna_optimizer import OptunaOptimizer
|
86 |
optimizer = OptunaOptimizer()
|
87 |
elif optimizer_type == "raytune":
|
88 |
-
from optimizers.ray_tune_optimizer import RayTuneOptimizer
|
89 |
optimizer = RayTuneOptimizer()
|
90 |
else:
|
91 |
raise ValueError(f"Type d'optimizer non supporté: {optimizer_type}")
|
92 |
-
|
93 |
# Lancement de l'optimisation
|
94 |
best_params = optimizer.optimize(
|
95 |
trainer=self, # Passe l'instance actuelle du trainer
|
96 |
param_grid=self.config.hyperparameters.param_grid
|
97 |
)
|
98 |
-
|
99 |
logger.info(f"Meilleurs hyperparamètres trouvés: {best_params}")
|
100 |
-
|
101 |
# Mise à jour de la configuration du modèle avec les meilleurs paramètres
|
102 |
self.config.model.params.update(best_params)
|
103 |
-
|
104 |
# Reconstruire les composants avec les nouveaux paramètres
|
105 |
logger.info("Reconstruction des composants avec les hyperparamètres optimisés.")
|
106 |
self.build_components()
|
|
|
6 |
from typing import Union, Optional
|
7 |
import cupy as cp
|
8 |
from scipy.sparse import csr_matrix
|
9 |
+
from src.optimizers.optuna_optimizer import OptunaOptimizer
|
10 |
+
from src.optimizers.ray_tune_optimizer import RayTuneOptimizer
|
11 |
|
12 |
+
from src.config import Config
|
13 |
+
from src.interfaces.metrics_calculator import MetricsCalculator
|
14 |
|
15 |
|
16 |
class BaseTrainer(ABC):
|
|
|
75 |
logger = logging.getLogger(__name__)
|
76 |
|
77 |
# Vérifier si l'optimisation est configurée
|
78 |
+
if (self.config.hyperparameters.optimizer and
|
79 |
+
self.config.hyperparameters.param_grid and
|
80 |
self.config.hyperparameters.n_trials > 0):
|
81 |
+
|
82 |
logger.info("Démarrage de l'optimisation des hyperparamètres")
|
83 |
+
|
84 |
# Importation et instanciation de l'optimiseur
|
85 |
optimizer_type = self.config.hyperparameters.optimizer.lower()
|
86 |
if optimizer_type == "optuna":
|
|
|
87 |
optimizer = OptunaOptimizer()
|
88 |
elif optimizer_type == "raytune":
|
|
|
89 |
optimizer = RayTuneOptimizer()
|
90 |
else:
|
91 |
raise ValueError(f"Type d'optimizer non supporté: {optimizer_type}")
|
92 |
+
|
93 |
# Lancement de l'optimisation
|
94 |
best_params = optimizer.optimize(
|
95 |
trainer=self, # Passe l'instance actuelle du trainer
|
96 |
param_grid=self.config.hyperparameters.param_grid
|
97 |
)
|
98 |
+
|
99 |
logger.info(f"Meilleurs hyperparamètres trouvés: {best_params}")
|
100 |
+
|
101 |
# Mise à jour de la configuration du modèle avec les meilleurs paramètres
|
102 |
self.config.model.params.update(best_params)
|
103 |
+
|
104 |
# Reconstruire les composants avec les nouveaux paramètres
|
105 |
logger.info("Reconstruction des composants avec les hyperparamètres optimisés.")
|
106 |
self.build_components()
|
src/conf/config.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
# conf/config.yaml
|
2 |
|
3 |
defaults:
|
4 |
-
-
|
5 |
- _self_
|
6 |
|
7 |
model:
|
|
|
1 |
# conf/config.yaml
|
2 |
|
3 |
defaults:
|
4 |
+
- model: model
|
5 |
- _self_
|
6 |
|
7 |
model:
|
src/config.py
CHANGED
@@ -68,12 +68,28 @@ class HyperparameterConfig(BaseModel):
|
|
68 |
description="Nombre d'essais pour la recherche d'hyperparamètres.")
|
69 |
|
70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
class Config(BaseModel):
|
72 |
"""
|
73 |
Objet de configuration global combinant la section modèle, vectorisation,
|
74 |
-
données et
|
75 |
"""
|
76 |
model: ModelConfig
|
77 |
vectorization: VectorizationConfig
|
78 |
data: DataConfig
|
79 |
hyperparameters: HyperparameterConfig
|
|
|
|
68 |
description="Nombre d'essais pour la recherche d'hyperparamètres.")
|
69 |
|
70 |
|
71 |
+
class MLflowConfig(BaseModel):
|
72 |
+
"""
|
73 |
+
Représente la configuration pour MLflow, incluant le nom de l'expérience
|
74 |
+
et l'URI de tracking.
|
75 |
+
"""
|
76 |
+
experiment_name: str = Field(
|
77 |
+
...,
|
78 |
+
description="Nom de l'expérience MLflow."
|
79 |
+
)
|
80 |
+
tracking_uri: str = Field(
|
81 |
+
...,
|
82 |
+
description="URI de tracking MLflow."
|
83 |
+
)
|
84 |
+
|
85 |
+
|
86 |
class Config(BaseModel):
|
87 |
"""
|
88 |
Objet de configuration global combinant la section modèle, vectorisation,
|
89 |
+
données, hyperparamètres et MLflow.
|
90 |
"""
|
91 |
model: ModelConfig
|
92 |
vectorization: VectorizationConfig
|
93 |
data: DataConfig
|
94 |
hyperparameters: HyperparameterConfig
|
95 |
+
mlflow: MLflowConfig
|
src/cuml_trainer.py
CHANGED
@@ -6,10 +6,11 @@ from abc import ABC, abstractmethod
|
|
6 |
from typing import Union
|
7 |
import cupy as cp
|
8 |
from scipy.sparse import csr_matrix
|
|
|
9 |
|
10 |
-
from config import Config
|
11 |
-
from base_trainer import BaseTrainer
|
12 |
-
from interfaces.vectorizer import Vectorizer
|
13 |
|
14 |
|
15 |
class CuMLTrainer(BaseTrainer, ABC):
|
@@ -52,17 +53,16 @@ class CuMLTrainer(BaseTrainer, ABC):
|
|
52 |
Cette implémentation générique fonctionne pour tous les trainers cuML.
|
53 |
"""
|
54 |
# Chargement des données
|
55 |
-
|
56 |
data = cudf.read_csv(self.data_path)
|
57 |
-
|
58 |
# Séparation des textes et des étiquettes
|
59 |
texts = data.drop(columns=[self.target_column])
|
60 |
labels = data[self.target_column].values
|
61 |
-
|
62 |
# Vectorisation des textes
|
63 |
X = self.vectorizer.fit_transform(texts)
|
64 |
X_prepared = self._prepare_input_for_fit(X)
|
65 |
-
|
66 |
# Entraînement du modèle
|
67 |
self.classifier.fit(X_prepared, labels)
|
68 |
|
@@ -72,18 +72,17 @@ class CuMLTrainer(BaseTrainer, ABC):
|
|
72 |
Cette implémentation générique fonctionne pour tous les trainers cuML.
|
73 |
"""
|
74 |
# Chargement des données (idéalement un jeu de test séparé)
|
75 |
-
import cudf
|
76 |
data = cudf.read_csv(self.data_path)
|
77 |
-
|
78 |
# Séparation des textes et des étiquettes
|
79 |
texts = data.drop(columns=[self.target_column])
|
80 |
y_true = data[self.target_column].values
|
81 |
-
|
82 |
# Vectorisation et prédiction
|
83 |
X = self.vectorizer.transform(texts)
|
84 |
X_prepared = self._prepare_input_for_predict(X)
|
85 |
y_pred = self.classifier.predict(X_prepared)
|
86 |
-
|
87 |
# Calcul et logging des métriques
|
88 |
prefix = self.config.model.type.lower()
|
89 |
metrics = self.metrics_calculator.calculate_and_log(
|
@@ -91,7 +90,7 @@ class CuMLTrainer(BaseTrainer, ABC):
|
|
91 |
y_pred=y_pred,
|
92 |
prefix=prefix
|
93 |
)
|
94 |
-
|
95 |
# Afficher les résultats
|
96 |
print(f"Métriques d'évaluation {prefix}: {metrics}")
|
97 |
|
|
|
6 |
from typing import Union
|
7 |
import cupy as cp
|
8 |
from scipy.sparse import csr_matrix
|
9 |
+
import cudf
|
10 |
|
11 |
+
from src.config import Config
|
12 |
+
from src.base_trainer import BaseTrainer
|
13 |
+
from src.interfaces.vectorizer import Vectorizer
|
14 |
|
15 |
|
16 |
class CuMLTrainer(BaseTrainer, ABC):
|
|
|
53 |
Cette implémentation générique fonctionne pour tous les trainers cuML.
|
54 |
"""
|
55 |
# Chargement des données
|
56 |
+
|
57 |
data = cudf.read_csv(self.data_path)
|
58 |
+
|
59 |
# Séparation des textes et des étiquettes
|
60 |
texts = data.drop(columns=[self.target_column])
|
61 |
labels = data[self.target_column].values
|
62 |
+
|
63 |
# Vectorisation des textes
|
64 |
X = self.vectorizer.fit_transform(texts)
|
65 |
X_prepared = self._prepare_input_for_fit(X)
|
|
|
66 |
# Entraînement du modèle
|
67 |
self.classifier.fit(X_prepared, labels)
|
68 |
|
|
|
72 |
Cette implémentation générique fonctionne pour tous les trainers cuML.
|
73 |
"""
|
74 |
# Chargement des données (idéalement un jeu de test séparé)
|
|
|
75 |
data = cudf.read_csv(self.data_path)
|
76 |
+
|
77 |
# Séparation des textes et des étiquettes
|
78 |
texts = data.drop(columns=[self.target_column])
|
79 |
y_true = data[self.target_column].values
|
80 |
+
|
81 |
# Vectorisation et prédiction
|
82 |
X = self.vectorizer.transform(texts)
|
83 |
X_prepared = self._prepare_input_for_predict(X)
|
84 |
y_pred = self.classifier.predict(X_prepared)
|
85 |
+
|
86 |
# Calcul et logging des métriques
|
87 |
prefix = self.config.model.type.lower()
|
88 |
metrics = self.metrics_calculator.calculate_and_log(
|
|
|
90 |
y_pred=y_pred,
|
91 |
prefix=prefix
|
92 |
)
|
93 |
+
|
94 |
# Afficher les résultats
|
95 |
print(f"Métriques d'évaluation {prefix}: {metrics}")
|
96 |
|
src/interfaces/metrics_calculator.py
CHANGED
@@ -1,9 +1,34 @@
|
|
1 |
import cupy as cp
|
2 |
import numpy as np
|
3 |
-
from typing import Dict
|
4 |
import logging
|
5 |
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
logger = logging.getLogger(__name__)
|
9 |
|
@@ -81,4 +106,4 @@ class DefaultMetricsCalculator(MetricsCalculator):
|
|
81 |
f"{prefix}_auc_roc" : auc
|
82 |
}
|
83 |
logger.info(f"[{prefix}] Multiclass metrics: {metrics}")
|
84 |
-
return metrics
|
|
|
1 |
import cupy as cp
|
2 |
import numpy as np
|
3 |
+
from typing import Dict, Protocol
|
4 |
import logging
|
5 |
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
|
6 |
+
|
7 |
+
class MetricsCalculator(Protocol):
|
8 |
+
"""
|
9 |
+
Interface pour les calculateurs de métriques.
|
10 |
+
"""
|
11 |
+
def calculate_and_log(
|
12 |
+
self,
|
13 |
+
y_true: cp.ndarray,
|
14 |
+
y_pred: cp.ndarray,
|
15 |
+
prefix: str
|
16 |
+
) -> Dict[str, float]:
|
17 |
+
"""
|
18 |
+
Calcule et log les métriques pour un problème binaire.
|
19 |
+
"""
|
20 |
+
pass
|
21 |
+
|
22 |
+
def calculate_and_log_multiclass(
|
23 |
+
self,
|
24 |
+
y_true: cp.ndarray,
|
25 |
+
y_pred: cp.ndarray,
|
26 |
+
prefix: str
|
27 |
+
) -> Dict[str, float]:
|
28 |
+
"""
|
29 |
+
Calcule et log les métriques pour un problème multiclasses.
|
30 |
+
"""
|
31 |
+
pass
|
32 |
|
33 |
logger = logging.getLogger(__name__)
|
34 |
|
|
|
106 |
f"{prefix}_auc_roc" : auc
|
107 |
}
|
108 |
logger.info(f"[{prefix}] Multiclass metrics: {metrics}")
|
109 |
+
return metrics
|
src/main.py
CHANGED
@@ -10,21 +10,21 @@ from omegaconf import DictConfig, OmegaConf
|
|
10 |
import mlflow
|
11 |
|
12 |
# Import des trainers
|
13 |
-
from trainers.cuml.svm_trainer import SvmTrainer
|
14 |
-
from trainers.cuml.random_forest_trainer import RandomForestTrainer
|
15 |
-
from trainers.cuml.logistic_regression_trainer import LogisticRegressionTrainer
|
16 |
-
from trainers.cuml.linear_regression_trainer import LinearRegressionTrainer
|
17 |
-
from trainers.huggingface.huggingface_transformer_trainer import HuggingFaceTransformerTrainer
|
18 |
|
19 |
# Import des optimizers
|
20 |
-
from optimizers.optuna_optimizer import OptunaOptimizer
|
21 |
-
from optimizers.ray_tune_optimizer import RayTuneOptimizer
|
22 |
|
23 |
# Import du décorateur MLflow
|
24 |
-
from mlflow_integration.mlflow_decorator import MLflowDecorator
|
25 |
|
26 |
# Import de la configuration
|
27 |
-
from config import Config
|
28 |
|
29 |
# Configuration du logging
|
30 |
logger = logging.getLogger(__name__)
|
@@ -89,7 +89,7 @@ def get_optimizer(config: Config):
|
|
89 |
return optimizer_class()
|
90 |
|
91 |
|
92 |
-
@hydra.main(config_path="conf", config_name="config")
|
93 |
def main(cfg: DictConfig) -> None:
|
94 |
"""
|
95 |
Point d'entrée principal de l'application.
|
@@ -139,4 +139,4 @@ def main(cfg: DictConfig) -> None:
|
|
139 |
|
140 |
|
141 |
if __name__ == "__main__":
|
142 |
-
main()
|
|
|
10 |
import mlflow
|
11 |
|
12 |
# Import des trainers
|
13 |
+
from src.trainers.cuml.svm_trainer import SvmTrainer
|
14 |
+
from src.trainers.cuml.random_forest_trainer import RandomForestTrainer
|
15 |
+
from src.trainers.cuml.logistic_regression_trainer import LogisticRegressionTrainer
|
16 |
+
from src.trainers.cuml.linear_regression_trainer import LinearRegressionTrainer
|
17 |
+
from src.trainers.huggingface.huggingface_transformer_trainer import HuggingFaceTransformerTrainer
|
18 |
|
19 |
# Import des optimizers
|
20 |
+
from src.optimizers.optuna_optimizer import OptunaOptimizer
|
21 |
+
from src.optimizers.ray_tune_optimizer import RayTuneOptimizer
|
22 |
|
23 |
# Import du décorateur MLflow
|
24 |
+
from src.mlflow_integration.mlflow_decorator import MLflowDecorator
|
25 |
|
26 |
# Import de la configuration
|
27 |
+
from src.config import Config
|
28 |
|
29 |
# Configuration du logging
|
30 |
logger = logging.getLogger(__name__)
|
|
|
89 |
return optimizer_class()
|
90 |
|
91 |
|
92 |
+
@hydra.main(config_path="conf", config_name="config", version_base=None)
|
93 |
def main(cfg: DictConfig) -> None:
|
94 |
"""
|
95 |
Point d'entrée principal de l'application.
|
|
|
139 |
|
140 |
|
141 |
if __name__ == "__main__":
|
142 |
+
main()
|
src/trainers/cuml/linear_regression_trainer.py
CHANGED
@@ -4,10 +4,10 @@
|
|
4 |
|
5 |
from cuml.linear_model import LinearRegression
|
6 |
|
7 |
-
from cuml_trainer import CuMLTrainer
|
8 |
-
from config import Config
|
9 |
-
from interfaces.cuml_tfidf_vectorizer import CuMLTfidfVectorizer
|
10 |
-
from interfaces.metrics_calculator import DefaultMetricsCalculator
|
11 |
|
12 |
class LinearRegressionTrainer(CuMLTrainer):
|
13 |
"""
|
|
|
4 |
|
5 |
from cuml.linear_model import LinearRegression
|
6 |
|
7 |
+
from src.cuml_trainer import CuMLTrainer
|
8 |
+
from src.config import Config
|
9 |
+
from src.interfaces.cuml_tfidf_vectorizer import CuMLTfidfVectorizer
|
10 |
+
from src.interfaces.metrics_calculator import DefaultMetricsCalculator
|
11 |
|
12 |
class LinearRegressionTrainer(CuMLTrainer):
|
13 |
"""
|
src/trainers/cuml/logistic_regression_trainer.py
CHANGED
@@ -4,10 +4,10 @@
|
|
4 |
|
5 |
from cuml.linear_model import LogisticRegression
|
6 |
|
7 |
-
from cuml_trainer import CuMLTrainer
|
8 |
-
from config import Config
|
9 |
-
from interfaces.cuml_tfidf_vectorizer import CuMLTfidfVectorizer
|
10 |
-
from interfaces.metrics_calculator import DefaultMetricsCalculator
|
11 |
|
12 |
class LogisticRegressionTrainer(CuMLTrainer):
|
13 |
"""
|
|
|
4 |
|
5 |
from cuml.linear_model import LogisticRegression
|
6 |
|
7 |
+
from src.cuml_trainer import CuMLTrainer
|
8 |
+
from src.config import Config
|
9 |
+
from src.interfaces.cuml_tfidf_vectorizer import CuMLTfidfVectorizer
|
10 |
+
from src.interfaces.metrics_calculator import DefaultMetricsCalculator
|
11 |
|
12 |
class LogisticRegressionTrainer(CuMLTrainer):
|
13 |
"""
|
src/trainers/cuml/random_forest_trainer.py
CHANGED
@@ -6,10 +6,10 @@ from typing import Optional
|
|
6 |
from cuml.ensemble import RandomForestClassifier
|
7 |
import cupy as cp
|
8 |
|
9 |
-
from cuml_trainer import CuMLTrainer
|
10 |
-
from config import Config
|
11 |
-
from interfaces.cuml_tfidf_vectorizer import CuMLTfidfVectorizer
|
12 |
-
from interfaces.metrics_calculator import DefaultMetricsCalculator
|
13 |
|
14 |
class RandomForestTrainer(CuMLTrainer):
|
15 |
"""
|
|
|
6 |
from cuml.ensemble import RandomForestClassifier
|
7 |
import cupy as cp
|
8 |
|
9 |
+
from src.cuml_trainer import CuMLTrainer
|
10 |
+
from src.config import Config
|
11 |
+
from src.interfaces.cuml_tfidf_vectorizer import CuMLTfidfVectorizer
|
12 |
+
from src.interfaces.metrics_calculator import DefaultMetricsCalculator
|
13 |
|
14 |
class RandomForestTrainer(CuMLTrainer):
|
15 |
"""
|
src/trainers/cuml/svm_trainer.py
CHANGED
@@ -6,10 +6,10 @@ from cuml.svm import SVC
|
|
6 |
import cupy as cp
|
7 |
from typing import Optional
|
8 |
|
9 |
-
from cuml_trainer import CuMLTrainer
|
10 |
-
from config import Config
|
11 |
-
from interfaces.cuml_tfidf_vectorizer import CuMLTfidfVectorizer
|
12 |
-
from interfaces.metrics_calculator import DefaultMetricsCalculator
|
13 |
|
14 |
class SvmTrainer(CuMLTrainer):
|
15 |
"""
|
|
|
6 |
import cupy as cp
|
7 |
from typing import Optional
|
8 |
|
9 |
+
from src.cuml_trainer import CuMLTrainer
|
10 |
+
from src.config import Config
|
11 |
+
from src.interfaces.cuml_tfidf_vectorizer import CuMLTfidfVectorizer
|
12 |
+
from src.interfaces.metrics_calculator import DefaultMetricsCalculator
|
13 |
|
14 |
class SvmTrainer(CuMLTrainer):
|
15 |
"""
|
src/trainers/huggingface/huggingface_transformer_trainer.py
CHANGED
@@ -8,8 +8,8 @@ import cudf
|
|
8 |
import torch
|
9 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
|
10 |
|
11 |
-
from base_trainer import BaseTrainer
|
12 |
-
from config import Config
|
13 |
|
14 |
|
15 |
class HuggingFaceTransformerTrainer(BaseTrainer):
|
|
|
8 |
import torch
|
9 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
|
10 |
|
11 |
+
from src.base_trainer import BaseTrainer
|
12 |
+
from src.config import Config
|
13 |
|
14 |
|
15 |
class HuggingFaceTransformerTrainer(BaseTrainer):
|