# chroma/indexer.py
# Author: anubhav77 — "Adding sentence transformers and some logs" (commit 3a51e33)
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import BSHTMLLoader
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
# --- Build the vector index -------------------------------------------------
# Loads every HTML file under ./source, splits it into overlapping chunks,
# embeds the chunks with a sentence-transformers model, and stores them in a
# persistent Chroma collection under ./index/chroma.

# Load the documents (BSHTMLLoader parses each HTML file via BeautifulSoup).
loader = DirectoryLoader('.', glob="./source/*.html", loader_cls=BSHTMLLoader)
docs = loader.load()

# Split the text into chunks; 1000 chars with 200 overlap keeps neighboring
# chunks sharing context so answers spanning a boundary are still retrievable.
print("splitting to chunks")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(docs)

# Init embeddings and the persistent vector store.
print("Creating embeddings")
persist_directory = "./index/chroma"
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
print("Storing in db")
vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embeddings,
                                 persist_directory=persist_directory)
# Older langchain versions required an explicit persist() to flush the index
# to disk; newer ones auto-persist and no longer expose the method, so guard it.
if hasattr(vectordb, "persist"):
    vectordb.persist()
# --- Scratch data for a FAISS similarity-search experiment (currently unused) ---
# NOTE(review): only referenced by the commented-out FAISS lines below; kept
# for future experimentation with metadata-filtered search.
sentences = ["This is an example sentence", "Each sentence is converted","A monkey in zoo","Shark in the park","Boss on loose","a quiet date"]
query="This sentence is an example"
query2="Each sentence is converted"
# Renamed from `id` — that name shadows the builtin id() function.
doc_ids=["a1","a2","a3","a4","a5","a6"]
meta=[{"n":1},{"z":2},{},{},{"n":3},{"n":4}]
#docsearch=FAISS.from_texts(sentences,embeddings,meta,doc_ids)
#m=docsearch.similarity_search_with_score(query2,filter={"n":2})
# --- Smoke-test the index with a sample query -------------------------------
print("Querying db")
query="How to Increase Flexibility Without Losing Productivity"
docs = vectordb.similarity_search(query)
# Iterate the results directly instead of indexing via range(len(...)).
for doc in docs:
    print("\n")
    print(doc.page_content)
print("Done")