# ====
# File: interfaces/cuml_tfidf_vectorizer.py
# ====
from typing import Union

import cupy as cp
import cudf
from scipy.sparse import csr_matrix
from cuml.feature_extraction.text import TfidfVectorizer as CuMLTfidf

from interfaces.vectorizer import Vectorizer


class CuMLTfidfVectorizer(Vectorizer):
    """Concrete TF-IDF vectorizer backed by cuML's ``TfidfVectorizer``.

    Construction parameters are meant to come from a dict (e.g. the
    ``config.vectorization.tfidf`` section) and are forwarded unchanged
    to the underlying cuML vectorizer.
    """

    def __init__(self, **kwargs):
        """Build the wrapped cuML vectorizer.

        Args:
            **kwargs: Keyword arguments passed straight through to the
                cuML ``TfidfVectorizer`` constructor.
        """
        # Tracks whether fit_transform has run, so transform can refuse
        # to operate before a vocabulary has been learned.
        self._fitted = False
        self._vectorizer = CuMLTfidf(**kwargs)

    def fit_transform(self, texts: cudf.Series) -> Union[cp.ndarray, csr_matrix]:
        """Fit the TF-IDF model on ``texts`` and return the resulting matrix.

        Args:
            texts: Documents to learn from and vectorize.

        Returns:
            Whatever the wrapped vectorizer's ``fit_transform`` returns.
            NOTE(review): the annotation names SciPy's ``csr_matrix``;
            cuML presumably returns a GPU sparse type -- confirm.
        """
        tfidf_matrix = self._vectorizer.fit_transform(texts)
        # Only mark as fitted once the underlying fit has succeeded.
        self._fitted = True
        return tfidf_matrix

    def transform(self, texts: cudf.Series) -> Union[cp.ndarray, csr_matrix]:
        """Vectorize ``texts`` with the already-fitted TF-IDF model.

        Args:
            texts: Documents to vectorize.

        Returns:
            Whatever the wrapped vectorizer's ``transform`` returns.

        Raises:
            ValueError: If ``fit_transform`` has not been called on this
                instance yet.
        """
        if self._fitted:
            return self._vectorizer.transform(texts)
        raise ValueError("Vectorizer not yet fitted. Call fit_transform first.")