File size: 5,441 Bytes
2ce0b48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import chromadb
from chromadb.utils import embedding_functions
from tqdm import tqdm
import time


####################################################################################################################################
############################################# GLOBAL INGESTION #####################################################################
####################################################################################################################################
def prepare_chunks_for_ingestion(df):
    """
    Build parallel lists of documents and metadata for Chroma ingestion.
    Specialized for RSE (CSR) report files.

    Parameters:
    - df: dataframe with columns `full_chunk`, `source` and `chunk_size`

    Returns:
    - chunks: list of chunk texts to embed
    - metadatas: list of {"source": ..., "chunk_size": ...} dicts, one per chunk
    """
    chunks = df.full_chunk.tolist()
    metadatas = [
        {"source": str(src), "chunk_size": str(size)}
        for src, size in zip(df.source, df.chunk_size)
    ]
    return chunks, metadatas


###################################################################################################################################
def ingest_chunks(df=None, batch_size=100, create_collection=False, chroma_data_path="./chroma_data/", embedding_model="intfloat/multilingual-e5-large", collection_name=None):
    """
    Add pre-chunked documents from a dataframe to a Chroma vector store.

    Documents are already chunked; `prepare_chunks_for_ingestion` extracts
    their text and metadata. Chunks are added in batches for speed, and
    per-batch timings are appended to "ingesting.log".

    Parameters:
    - df: dataframe of chunked docs with their metadata and text
    - batch_size: number of chunks per `collection.add` call
    - create_collection: create a fresh collection instead of reusing one
    - chroma_data_path: path of the persistent Chroma store
    - embedding_model: SentenceTransformer model name used for embeddings
    - collection_name: name of the target collection

    Returns:
    - collection: the resulting chroma collection
    - durations: list of per-batch ingestion durations (seconds)
    """
    print("Modèle d'embedding choisi: ", embedding_model)
    print("Collection où ingérer: ", collection_name)
    client = chromadb.PersistentClient(path=chroma_data_path)
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
                 model_name=embedding_model)

    if create_collection:
        collection = client.create_collection(
            name=collection_name,
            embedding_function=embedding_func,
            metadata={"hnsw:space": "cosine"},
        )
        next_id = 0
    else:
        # The collection is expected to already exist in the vector store;
        # resume ids after the highest existing one so nothing is overwritten.
        collection = client.get_collection(name=collection_name, embedding_function=embedding_func)
        print("Computing next chroma id. Please wait a few minutes...")
        next_id = compute_next_id_chroma(chroma_data_path, collection_name)
    print("Préparation des métadatas des chunks :")
    documents, metadatas = prepare_chunks_for_ingestion(df)
    # batch adding to do it faster
    durations = []
    # Integer ceiling: the original float division made the progress log
    # report fractional batch counts (e.g. "batch 0.0 of 3.21").
    total_batches = (len(documents) + batch_size - 1) // batch_size
    for i in tqdm(range(0, len(documents), batch_size)):
        if i == 0:
            print(f"Processing first batch of {total_batches}.")
            print("This can take 10-15 mins if this is the first time the model is loaded. Please wait...")
        with open("ingesting.log", "a") as file:
            file.write(f"Processing batch number {i // batch_size + 1} of {total_batches}..." + "\n")
        batch_documents = documents[i:i+batch_size]
        # Ids continue after `next_id` so re-ingestion appends rather than clobbers.
        batch_ids = [f"id{j}" for j in range(next_id+i, next_id+i+len(batch_documents))]
        batch_metadatas = metadatas[i:i+batch_size]
        start_time = time.time()  # start measuring execution time
        collection.add(
            documents=batch_documents,
            ids=batch_ids,
            metadatas=batch_metadatas
        )
        end_time = time.time()  # end measuring execution time
        with open("ingesting.log", "a") as file:
            file.write(f"Done. Collection adding time: {end_time-start_time}" + "\n")
        durations.append(end_time - start_time)  # store execution times per batch
    return collection, durations


###################################################################################################################################
def clean_rag_collection(collname, chroma_data_path):
    """Delete an existing collection so the RAG data can be ingested anew.

    Parameters:
    - collname: name of the collection to drop
    - chroma_data_path: path of the persistent Chroma store

    Returns whatever chromadb's `delete_collection` returns.
    """
    store_client = chromadb.PersistentClient(path=chroma_data_path)
    return store_client.delete_collection(name=collname)


###################################################################################################################################
def retrieve_info_from_db(prompt: str, entreprise=None):
    """
    Query the RSE/CSRD Chroma collection for the 3 chunks closest to `prompt`.

    Parameters:
    - prompt: user question used as the semantic query text
    - entreprise: optional company name; when given, restricts results to
      chunks whose `source` metadata equals it

    Returns:
    - the chromadb query result dict (ids, documents, metadatas, distances)
    """
    EMBED_MODEL = 'intfloat/multilingual-e5-large'
    collection_name = "RSE_CSRD_REPORTS_TEST"
    # Persistent client over the on-disk vector store
    client = chromadb.PersistentClient(path="./data/chroma_data/")
    # Embedding model used to compute semantic proximity with stored chunks
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=EMBED_MODEL
    )
    collection = client.get_collection(name=collection_name, embedding_function=embedding_func)
    # Build the query once; the original duplicated the whole query call in
    # both branches — only the optional metadata filter actually differs.
    query_kwargs = {"query_texts": [prompt], "n_results": 3}
    if entreprise is not None:
        query_kwargs["where"] = {'source': entreprise}
    return collection.query(**query_kwargs)