from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, BSHTMLLoader
from langchain.chains import RetrievalQA

# Load the HTML documents from ./source
loader = DirectoryLoader('.', glob="./source/*.html", loader_cls=BSHTMLLoader)
docs = loader.load()

# Split the documents into chunks; 1000 characters with 200 overlap as a first attempt
print("Splitting into chunks")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(docs)

# Initialise the embedding model and build a persistent Chroma index
print("Creating embeddings")
persist_directory = "./index/chroma"
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
print("Storing in db")
vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embeddings,
                                 persist_directory=persist_directory)
vectordb.persist()  # flush the index to disk so later runs can reuse it
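
# On later runs the saved index can be reloaded instead of rebuilt; a minimal
# sketch, assuming the same embedding model is used for load and query:
# vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)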
                                 
sentences = ["This is an example sentence", "Each sentence is converted","A monkey in zoo","Shark in the park","Boss on loose","a quiet date"]   
query="This sentence is an example"  
query2="Each sentence is converted"   
id=["a1","a2","a3","a4","a5","a6"] 
meta=[{"n":1},{"z":2},{},{},{"n":3},{"n":4}] 

#docsearch=FAISS.from_texts(sentences,embeddings,meta,id)   
#m=docsearch.similarity_search_with_score(query2,filter={"n":2}) 
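
# The same kind of filtered search can be run against the Chroma store built
# above; a minimal sketch, assuming a hypothetical "./source/page.html" file
# (BSHTMLLoader records each file's path under the "source" metadata key):
# scored = vectordb.similarity_search_with_score("example query",
#                                                filter={"source": "./source/page.html"})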
print("Querying db")
query="How to Increase Flexibility Without Losing Productivity"
docs = vectordb.similarity_search(query)
for i in range(0,len(docs)):
    print("\n")
    print(docs[i].page_content)
print("Done")