File size: 1,229 Bytes
f224484
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
import json

from uuid import uuid4


# Progress message printed before the embedding wrapper is constructed.
print("Loading embedding model...")
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

# Chroma collection that uses the embedding function created above.
vector_store = Chroma(
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
    embedding_function=embeddings,
    collection_name="example_collection",
)

# Load the metadata.jsonl file: one JSON object per line, parsed into a list
# of records.  The encoding is pinned to UTF-8 so decoding does not depend on
# the platform's locale default, and whitespace-only lines (for example a
# trailing blank line) are skipped instead of making json.loads raise.
with open('metadata.jsonl', 'r', encoding='utf-8') as jsonl_file:
    json_QA = [json.loads(line) for line in jsonl_file if line.strip()]
    
# Build one Document per record.  The page content combines the record's
# 'Question' and 'Final answer' fields, the record's 'task_id' is kept as the
# "source" metadata entry, and each document gets a freshly generated UUID
# string as its id.
docs = [
    Document(
        page_content=(
            f"Question: {sample['Question']}\n\n"
            f"Final answer: {sample['Final answer']}"
        ),
        metadata={
            "source": sample['task_id'],
        },
        id=str(uuid4()),
    )
    for sample in json_QA
]

# Hand the prepared documents over to the vector store.
print("Adding documents to the vector store...")
vector_store.add_documents(documents=docs)

# Drop the module-level references to the intermediate lists.
del docs, json_QA