Spaces:
Sleeping
Sleeping
File size: 1,223 Bytes
bf5fb5f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# ====
# Fichier: interfaces/cuml_tfidf_vectorizer.py
# ====
from typing import Union
import cupy as cp
import cudf
from scipy.sparse import csr_matrix
from cuml.feature_extraction.text import TfidfVectorizer as CuMLTfidf
from interfaces.vectorizer import Vectorizer
class CuMLTfidfVectorizer(Vectorizer):
    """Concrete TF-IDF vectorizer that delegates to cuML's ``TfidfVectorizer``.

    Construction options are supplied as keyword arguments, typically
    unpacked from a dict (e.g. one coming from ``config.vectorization.tfidf``).
    """

    def __init__(self, **kwargs):
        """Build the wrapped cuML vectorizer.

        Args:
            **kwargs: Forwarded unchanged to the cuML ``TfidfVectorizer``
                constructor.
        """
        # Nothing has been learned yet; only fit_transform flips this flag.
        self._fitted = False
        self._vectorizer = CuMLTfidf(**kwargs)

    def fit_transform(self, texts: cudf.Series) -> Union[cp.ndarray, csr_matrix]:
        """Fit the TF-IDF model on ``texts`` and return the resulting matrix.

        Args:
            texts: The documents to learn from and to vectorize.

        Returns:
            The matrix produced by the wrapped vectorizer's ``fit_transform``.
        """
        tfidf_matrix = self._vectorizer.fit_transform(texts)
        # Mark as fitted only once the underlying fit has completed without
        # raising, so a failed fit leaves the instance unfitted.
        self._fitted = True
        return tfidf_matrix

    def transform(self, texts: cudf.Series) -> Union[cp.ndarray, csr_matrix]:
        """Vectorize ``texts`` with the already-fitted TF-IDF model.

        Args:
            texts: The documents to vectorize.

        Returns:
            The matrix produced by the wrapped vectorizer's ``transform``.

        Raises:
            ValueError: If ``fit_transform`` has not been called successfully
                on this instance beforehand.
        """
        if not self._fitted:
            raise ValueError("Vectorizer not yet fitted. Call fit_transform first.")
        return self._vectorizer.transform(texts)