|
|
|
|
|
|
|
|
|
from typing import Union |
|
import cupy as cp |
|
import cudf |
|
from scipy.sparse import csr_matrix |
|
from cuml.feature_extraction.text import TfidfVectorizer as CuMLTfidf |
|
from interfaces.vectorizer import Vectorizer |
|
|
|
|
|
class CuMLTfidfVectorizer(Vectorizer):
    """TF-IDF vectorizer backed by cuML's ``TfidfVectorizer``.

    Keyword arguments passed to the constructor are forwarded unchanged to
    the cuML vectorizer; they are typically taken from a configuration dict
    (e.g. ``config.vectorization.tfidf``).
    """

    def __init__(self, **kwargs):
        """Build the wrapped cuML vectorizer from ``kwargs``."""
        self._vectorizer = CuMLTfidf(**kwargs)
        # Flipped to True once fit_transform has completed successfully;
        # transform() refuses to run before that.
        self._fitted = False

    def fit_transform(self, texts: cudf.Series) -> Union[cp.ndarray, csr_matrix]:
        """Fit the TF-IDF model on ``texts`` and return the resulting matrix.

        Args:
            texts: Documents to learn the TF-IDF model from and to vectorize.

        Returns:
            Whatever the wrapped vectorizer's ``fit_transform`` returns.
            NOTE(review): the annotation names scipy's ``csr_matrix``; the
            concrete type produced by cuML should be confirmed.
        """
        tfidf_matrix = self._vectorizer.fit_transform(texts)
        # Only mark as fitted after the underlying call succeeded, so a
        # failed fit leaves the instance in the "not fitted" state.
        self._fitted = True
        return tfidf_matrix

    def transform(self, texts: cudf.Series) -> Union[cp.ndarray, csr_matrix]:
        """Vectorize ``texts`` with the already-fitted TF-IDF model.

        Args:
            texts: Documents to vectorize.

        Returns:
            Whatever the wrapped vectorizer's ``transform`` returns.

        Raises:
            ValueError: If ``fit_transform`` has not been called successfully
                on this instance beforehand.
        """
        if self._fitted:
            return self._vectorizer.transform(texts)
        raise ValueError("Vectorizer not yet fitted. Call fit_transform first.")