import json
from uuid import uuid4

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
print("Loading embedding model...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)
# Load the metadata.jsonl file (one JSON object per line)
with open('metadata.jsonl', 'r') as jsonl_file:
    json_list = list(jsonl_file)

json_QA = []
for json_str in json_list:
    json_data = json.loads(json_str)
    json_QA.append(json_data)
# Build one Document per Q/A pair, keeping the task_id as the source metadata
docs = []
for sample in json_QA:
    content = f"Question: {sample['Question']}\n\nFinal answer: {sample['Final answer']}"
    doc = Document(
        page_content=content,
        metadata={"source": sample['task_id']},
        id=str(uuid4()),
    )
    docs.append(doc)
# Add documents to the vector store
print("Adding documents to the vector store...")
vector_store.add_documents(documents=docs)
# Free the intermediate lists; the documents now live in the Chroma store
del docs
del json_QA
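
# Optional sanity check (a minimal sketch, not part of the original script):
# run a similarity search against the freshly populated store to confirm the
# documents were embedded and persisted. The query string is only illustrative.
results = vector_store.similarity_search("What is the final answer?", k=2)
for res in results:
    print(f"* {res.page_content[:80]}... [source: {res.metadata['source']}]")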