Spaces:
Running
Running
File size: 1,355 Bytes
4363820 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# src/retrieval/vector_store.py
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
import numpy as np
from typing import List, Dict, Any
class VectorStore:
    """Thin wrapper around a persisted LangChain ``Chroma`` collection.

    Documents are stored with their caller-supplied id in ``metadata["id"]``
    so that :meth:`query` can report hits by id.
    """

    def __init__(self, embedding_function, persist_directory: str = "./chroma_db"):
        """Remember the embedding function and where the index is persisted.

        Args:
            embedding_function: Embeddings object handed to Chroma; it is
                used to embed documents when they are added.
            persist_directory: Directory passed to Chroma for on-disk
                storage. Defaults to the previously hard-coded location.
        """
        self.embedding_function = embedding_function
        self.persist_directory = persist_directory
        # Set by add_documents(); None means nothing has been indexed
        # through this instance yet.
        self.collection = None

    def add_documents(self, documents: List[str], embeddings: List[np.ndarray], ids: List[str]):
        """Index ``documents`` under ``ids`` and persist the collection.

        Args:
            documents: Raw text of each document.
            embeddings: Accepted for interface compatibility but NOT used:
                ``Chroma.from_documents`` embeds the texts itself through
                ``self.embedding_function``.
            ids: One unique id per document, in the same order.

        Raises:
            ValueError: If ``documents`` and ``ids`` differ in length
                (``zip`` would otherwise silently drop the extras while the
                full ``ids`` list is still handed to Chroma).
        """
        if len(documents) != len(ids):
            raise ValueError(
                f"documents and ids must have the same length "
                f"(got {len(documents)} and {len(ids)})"
            )
        if not documents:
            # Nothing to index; avoid sending empty batches to Chroma.
            return

        langchain_docs = [
            Document(page_content=text, metadata={"id": doc_id})
            for text, doc_id in zip(documents, ids)
        ]
        self.collection = Chroma.from_documents(
            documents=langchain_docs,
            embedding=self.embedding_function,
            ids=ids,
            persist_directory=self.persist_directory,
        )
        # NOTE(review): explicit persist() is presumably only required by
        # older Chroma releases; kept so behavior there is unchanged.
        self.collection.persist()

    def query(self, query_embedding: np.ndarray, top_k: int = 5) -> Dict[str, Any]:
        """Return the ids and distances of the ``top_k`` nearest documents.

        Args:
            query_embedding: Embedding vector of the query.
            top_k: Maximum number of hits to return.

        Returns:
            ``{"ids": [[...]], "distances": [[...]]}`` -- each value is a
            single-element list wrapping the per-hit list. Both inner lists
            are empty when nothing has been indexed yet or ``top_k <= 0``.
            Distances are the scores reported by Chroma (lower means more
            similar; the metric is whatever the collection is configured
            with).
        """
        if self.collection is None or top_k <= 0:
            return {"ids": [[]], "distances": [[]]}

        # The vector-store API expects a flat list of floats, not an ndarray.
        vector = np.asarray(query_embedding, dtype=float).ravel().tolist()

        # Returned Document objects do not carry their embedding, so the
        # distance cannot be recomputed client-side; ask Chroma for the
        # (document, distance) pairs it already computed.
        scored = self.collection.similarity_search_by_vector_with_relevance_scores(
            embedding=vector,
            k=top_k,
        )
        ids = [doc.metadata["id"] for doc, _ in scored]
        distances = [float(score) for _, score in scored]
        return {
            "ids": [ids],
            "distances": [distances],
        }