File size: 1,223 Bytes
bf5fb5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# ====
# File: interfaces/cuml_tfidf_vectorizer.py
# ====

from typing import Union
import cupy as cp
import cudf
from scipy.sparse import csr_matrix
from cuml.feature_extraction.text import TfidfVectorizer as CuMLTfidf
from interfaces.vectorizer import Vectorizer


class CuMLTfidfVectorizer(Vectorizer):
    """Concrete TF-IDF vectorizer backed by cuML.

    All keyword arguments are forwarded unchanged to cuML's
    ``TfidfVectorizer``; they are typically unpacked from a dict
    (e.g. the one found under ``config.vectorization.tfidf``).

    The instance remembers whether ``fit_transform`` has completed, and
    ``transform`` refuses to run before that.
    """

    def __init__(self, **kwargs):
        """Build the underlying cuML vectorizer from ``kwargs``."""
        self._vectorizer = CuMLTfidf(**kwargs)
        # Flipped to True only after a successful fit_transform call.
        self._fitted = False

    def fit_transform(self, texts: cudf.Series) -> Union[cp.ndarray, csr_matrix]:
        """Fit the TF-IDF model on ``texts`` and return the resulting matrix.

        Args:
            texts: Documents to learn the vocabulary/weights from.

        Returns:
            Whatever the underlying cuML vectorizer's ``fit_transform``
            returns for ``texts``.
            NOTE(review): the annotation lists scipy's ``csr_matrix``; cuML
            presumably returns a GPU sparse matrix instead -- confirm.
        """
        tfidf_matrix = self._vectorizer.fit_transform(texts)
        # Mark as fitted only once the call above has not raised.
        self._fitted = True
        return tfidf_matrix

    def transform(self, texts: cudf.Series) -> Union[cp.ndarray, csr_matrix]:
        """Apply the already-fitted TF-IDF model to ``texts``.

        Args:
            texts: Documents to vectorize with the learned model.

        Returns:
            Whatever the underlying cuML vectorizer's ``transform`` returns
            for ``texts``.

        Raises:
            ValueError: If ``fit_transform`` has not been called successfully
                on this instance beforehand.
        """
        if self._fitted:
            return self._vectorizer.transform(texts)
        raise ValueError("Vectorizer not yet fitted. Call fit_transform first.")