Spaces:
Sleeping
Sleeping
# ====
# File: interfaces/cuml_tfidf_vectorizer.py
# ====
from typing import Union | |
import cupy as cp | |
import cudf | |
from scipy.sparse import csr_matrix | |
from cuml.feature_extraction.text import TfidfVectorizer as CuMLTfidf | |
from interfaces.vectorizer import Vectorizer | |
class CuMLTfidfVectorizer(Vectorizer):
    """
    Concrete TF-IDF vectorizer backed by cuML.

    Construction options arrive as keyword arguments (e.g. a dict coming
    from config.vectorization.tfidf) and are handed unchanged to the
    wrapped cuML ``TfidfVectorizer``.
    """

    def __init__(self, **kwargs):
        """Create the wrapped cuML vectorizer from ``kwargs``, initially unfitted."""
        self._vectorizer = CuMLTfidf(**kwargs)
        # Tracks whether fit_transform() has completed at least once.
        self._fitted = False

    def fit_transform(self, texts: cudf.Series) -> Union[cp.ndarray, csr_matrix]:
        """
        Fit the TF-IDF model on ``texts`` and return the resulting matrix.

        The instance is marked as fitted only after the underlying
        ``fit_transform`` call has returned.
        """
        # NOTE(review): the annotated return type names scipy's csr_matrix;
        # confirm it matches what the cuML vectorizer actually returns.
        tfidf_matrix = self._vectorizer.fit_transform(texts)
        self._fitted = True
        return tfidf_matrix

    def transform(self, texts: cudf.Series) -> Union[cp.ndarray, csr_matrix]:
        """
        Apply the already-fitted TF-IDF model to ``texts`` and return the matrix.

        Raises:
            ValueError: if ``fit_transform`` has not been called on this instance.
        """
        if self._fitted:
            return self._vectorizer.transform(texts)
        raise ValueError("Vectorizer not yet fitted. Call fit_transform first.")