File size: 1,150 Bytes
f52dfc6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1dd505d
2ebb694
 
f52dfc6
 
 
 
 
 
 
 
3c03a29
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from sentence_transformers import SentenceTransformer
import chromadb

def split_list(list_, chunk_size):
    """Split a sequence into consecutive slices of at most ``chunk_size`` items.

    Args:
        list_: Any sliceable object that supports ``len()`` (list, tuple,
            str, ...).
        chunk_size: Maximum number of items per slice; must be at least 1.

    Returns:
        A list of slices in the original order. Every slice holds
        ``chunk_size`` items except possibly the last, which holds the
        remainder. An empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # A negative step makes range() empty, which would silently drop every
    # item, and a zero step only fails with an opaque range() message.
    # Reject both up front with a clear error.
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    return [
        list_[start:start + chunk_size]
        for start in range(0, len(list_), chunk_size)
    ]

def create_database(
    txt,
    *,
    model_name="sentence-transformers/all-mpnet-base-v2",
    collection_name="chat-with-docs",
    batch_size=5000,
):
    """Build a fresh Chroma collection holding the given documents.

    Any existing collection named ``collection_name`` is deleted first, so
    every call starts from an empty collection. Documents are embedded with
    a SentenceTransformer model and receive string ids ``"0"``, ``"1"``, ...
    matching their position in ``txt``.

    Args:
        txt: Sliceable sequence of document strings (supports ``len()``).
        model_name: SentenceTransformer model used to embed documents.
        collection_name: Name of the collection to (re)create.
        batch_size: Maximum number of documents passed to a single
            ``collection.add`` call; must be at least 1.

    Returns:
        The newly created collection, populated with all documents.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # Validate before loading the model so a bad argument fails fast.
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    class EmbeddingFn:
        """Callable adapter that embeds texts with a SentenceTransformer."""

        def __init__(self, model_name):
            self.model = SentenceTransformer(model_name)

        # NOTE(review): the parameter is named ``input`` (shadowing the
        # builtin) on purpose; Chroma appears to validate the embedding
        # function's signature. Confirm before renaming.
        def __call__(self, input):
            return self.model.encode(input).tolist()

    embedding_fn = EmbeddingFn(model_name)

    # NOTE(review): presumably resets chromadb's shared client state so that
    # repeated calls within one process start clean; confirm against the
    # chromadb version in use.
    chromadb.api.client.SharedSystemClient.clear_system_cache()
    chroma_cli = chromadb.Client()

    # Depending on the chromadb release, list_collections() yields either
    # collection objects or bare names; accept both.
    existing_names = [
        entry if isinstance(entry, str) else entry.name
        for entry in chroma_cli.list_collections()
    ]
    if collection_name in existing_names:
        chroma_cli.delete_collection(name=collection_name)
    collection = chroma_cli.create_collection(
        collection_name, embedding_function=embedding_fn
    )

    ids = [str(i) for i in range(len(txt))]

    # Add in bounded batches rather than one call for the whole corpus.
    doc_batches = split_list(txt, batch_size)
    id_batches = split_list(ids, batch_size)
    for doc_batch, id_batch in zip(doc_batches, id_batches):
        collection.add(documents=doc_batch, ids=id_batch)

    return collection