|
from langchain.embeddings import HuggingFaceEmbeddings |
|
from langchain.vectorstores import Chroma |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain.document_loaders import BSHTMLLoader |
|
from langchain.chains import RetrievalQA |
|
from langchain.document_loaders import TextLoader |
|
from langchain.document_loaders import DirectoryLoader |
|
|
|
|
|
# Load every HTML file under ./source as langchain Documents, parsing each
# file with BeautifulSoup via BSHTMLLoader.
loader = DirectoryLoader('.', glob="./source/*.html", loader_cls=BSHTMLLoader)

docs=loader.load()


print("splitting to chunks")

# Split documents into ~1000-character chunks with 200 characters of overlap
# so that context spanning a chunk boundary is not lost.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

texts = text_splitter.split_documents(docs)
|
|
|
|
|
print("Creating embeddings")

# On-disk location where Chroma persists its index.
persist_directory="./index/chroma"

# Local HuggingFace sentence-transformers model used to embed each chunk.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

print("Storing in db")

# Embed all chunks and store them in a persistent Chroma collection.
# NOTE(review): older chromadb/langchain versions require an explicit
# vectordb.persist() call to flush the index to disk — verify against the
# installed version.
vectordb = Chroma.from_documents(documents=texts,

embedding=embeddings,

persist_directory=persist_directory)
|
|
|
# NOTE(review): removed an unused block of sample data (`sentences`, `query`,
# `query2`, `id`, `meta`) — none of these names were read anywhere below
# (`query` was immediately reassigned before the search), and `id` shadowed
# the builtin of the same name.
|
|
|
|
|
|
|
print("Querying db")

query = "How to Increase Flexibility Without Losing Productivity"

# Retrieve the chunks most similar to the query from the Chroma index.
# NOTE: this rebinds `docs` from the loaded documents to the search results.
docs = vectordb.similarity_search(query)

# Iterate over the results directly instead of indexing with
# range(0, len(docs)) — same output, idiomatic Python.
for doc in docs:
    print("\n")
    print(doc.page_content)

print("Done")