# Spaces: Sleeping  (page-capture header; not part of the script)
import json
from uuid import uuid4

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
# Announce the step, then build the embedding model that the vector store
# uses as its embedding function.
print("Loading embedding model...")
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

# Chroma collection backed by the embedding model created above.
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)
# Load the metadata.jsonl file: each non-empty line is parsed as one JSON value.
# The encoding is given explicitly so the read does not depend on the platform
# default, and blank lines are skipped because json.loads() rejects them.
with open("metadata.jsonl", "r", encoding="utf-8") as jsonl_file:
    json_QA = [json.loads(line) for line in jsonl_file if line.strip()]
# Build one Document per loaded record. The page content combines the
# record's 'Question' and 'Final answer' fields; the record's 'task_id' is
# kept as the "source" metadata entry.
# NOTE(review): ids are freshly generated with uuid4 on every run, so running
# this script again against the same persist_directory presumably stores the
# same records a second time -- confirm whether deterministic ids are wanted.
docs = [
    Document(
        page_content=(
            f"Question: {sample['Question']}\n\n"
            f"Final answer: {sample['Final answer']}"
        ),
        metadata={
            "source": sample["task_id"],
        },
        id=str(uuid4()),
    )
    for sample in json_QA
]
# Add documents to the vector store
print("Adding documents to the vector store...")
vector_store.add_documents(documents=docs)

# The intermediate lists are no longer needed once the documents have been
# handed to the store; remove their module-level names.
del docs
del json_QA