# Source: emotion_classifier/src/interfaces/cuml_tfidf_vectorizer.py (author: fioriclass, commit bf5fb5f, "initialisation")
# ====
# Fichier: interfaces/cuml_tfidf_vectorizer.py
# ====
from typing import Union
import cupy as cp
import cudf
from scipy.sparse import csr_matrix
from cuml.feature_extraction.text import TfidfVectorizer as CuMLTfidf
from interfaces.vectorizer import Vectorizer
class CuMLTfidfVectorizer(Vectorizer):
    """
    TF-IDF vectorizer backed by cuML's ``TfidfVectorizer``.

    Constructor keyword arguments are passed straight through to the cuML
    vectorizer (for example, values read from ``config.vectorization.tfidf``).
    """

    def __init__(self, **kwargs):
        # Starts unfitted; fit_transform flips this flag, and transform
        # refuses to run until it has been set.
        self._fitted = False
        # Wrapped cuML vectorizer, configured with the caller's settings.
        self._vectorizer = CuMLTfidf(**kwargs)

    def fit_transform(self, texts: cudf.Series) -> Union[cp.ndarray, csr_matrix]:
        """
        Fit the TF-IDF model on ``texts`` and return the resulting matrix.

        The value returned is whatever the wrapped cuML vectorizer produces.
        NOTE(review): the annotation names scipy's ``csr_matrix``; confirm
        that this matches the type cuML actually returns.
        """
        tfidf_matrix = self._vectorizer.fit_transform(texts)
        # Only mark as fitted once the underlying fit has returned normally.
        self._fitted = True
        return tfidf_matrix

    def transform(self, texts: cudf.Series) -> Union[cp.ndarray, csr_matrix]:
        """
        Vectorize ``texts`` with the TF-IDF model learned by ``fit_transform``.

        Raises:
            ValueError: if ``fit_transform`` has not yet completed on this
                instance.
        """
        if self._fitted:
            return self._vectorizer.transform(texts)
        raise ValueError("Vectorizer not yet fitted. Call fit_transform first.")