"""Populate a persistent Chroma collection with the Q/A records in metadata.jsonl.

Each JSON line is turned into one ``Document`` whose text combines the
record's "Question" and "Final answer" fields and whose metadata stores the
record's "task_id" under the key "source".
"""

import json
from uuid import uuid4

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

print("Loading embedding model...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

# Load the metadata.jsonl file: one JSON object per line.
# The encoding is pinned so decoding does not depend on the platform default,
# and blank lines are skipped because json.loads() rejects empty input.
with open("metadata.jsonl", "r", encoding="utf-8") as jsonl_file:
    json_QA = [json.loads(line) for line in jsonl_file if line.strip()]

# Build one Document per record.
# NOTE(review): ids are fresh random UUIDs on every run, so re-running this
# script against the same persist_directory presumably stores the records
# again under new ids -- confirm whether that is intended.
docs = [
    Document(
        page_content=(
            f"Question: {sample['Question']}\n\nFinal answer: {sample['Final answer']}"
        ),
        metadata={
            "source": sample["task_id"],
        },
        id=str(uuid4()),
    )
    for sample in json_QA
]

# Add documents to the vector store
print("Adding documents to the vector store...")
if docs:  # nothing to embed or store when the input file had no records
    vector_store.add_documents(documents=docs)

# Drop the in-memory copies once they have been handed to the store.
del docs
del json_QA