chaaim123 committed
Commit e8c2a90 · verified · 1 Parent(s): fb4cb2b

Create retriever/vector_store_manager.py

Files changed (1)
  1. retriever/vector_store_manager.py +98 -0
retriever/vector_store_manager.py ADDED
@@ -0,0 +1,98 @@
+ import os
+ import logging
+ from config.config import ConfigConstants
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import FAISS
+
+ class VectorStoreManager:
+     def __init__(self, embedding_path="embeddings.faiss"):
+         """
+         Initialize the vector store manager.
+
+         Args:
+             embedding_path (str): Path to save/load the FAISS index.
+         """
+         self.embedding_path = embedding_path
+         self.embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)
+         self.vector_store = self._initialize_vector_store()
+
+     def _initialize_vector_store(self):
+         """Initialize or load the FAISS vector store."""
+         if os.path.exists(self.embedding_path):
+             logging.info("Loading embeddings from local file")
+             return FAISS.load_local(
+                 self.embedding_path,
+                 self.embedding_model,
+                 allow_dangerous_deserialization=True
+             )
+         else:
+             '''logging.info("Creating new vector store")
+             # Return an empty vector store; it will be populated when documents are added
+             return FAISS.from_texts(
+                 texts=[""],  # Dummy text to initialize
+                 embedding=self.embedding_model,
+                 metadatas=[{"source": "init", "doc_id": "init"}]
+             )'''
+             logging.info("Creating new vector store (unpopulated)")
+             return None
+
+     def add_documents(self, documents):
+         """
+         Add new documents to the vector store and save it.
+
+         Args:
+             documents (list): List of dictionaries with 'text', 'source', and 'doc_id'.
+         """
+         if not documents:
+             return
+
+         texts = [doc['text'] for doc in documents]
+         metadatas = [{'source': doc['source'], 'doc_id': doc['doc_id']} for doc in documents]
+
+         logging.info("Adding new documents to vector store")
+
+         if not self.vector_store:
+             self.vector_store = FAISS.from_texts(
+                 texts=texts,
+                 embedding=self.embedding_model,
+                 metadatas=metadatas
+             )
+         else:
+             self.vector_store.add_texts(texts=texts, metadatas=metadatas)
+
+         self.vector_store.save_local(self.embedding_path)
+         logging.info(f"Vector store updated and saved to {self.embedding_path}")
+
+     def search(self, query, doc_id, k=10):
+         """
+         Search the vector store for relevant chunks, filtered by doc_id.
+
+         Args:
+             query (str): The user's query.
+             doc_id (str): The document ID to filter by.
+             k (int): Number of results to return.
+
+         Returns:
+             list: List of relevant document chunks with metadata and scores.
+         """
+         if not self.vector_store:
+             return []
+
+         try:
+             query = " ".join(query.lower().split())
+             # Define a filter function to match doc_id
+             filter_fn = lambda metadata: metadata['doc_id'] == doc_id
+
+             # Perform similarity search with filter
+             results = self.vector_store.similarity_search_with_score(
+                 query=query,
+                 k=k,
+                 filter=filter_fn
+             )
+
+             # Format results
+             return [{'text': doc.page_content, 'metadata': doc.metadata, 'score': score} for doc, score in results]
+
+         except Exception as e:
+             logging.error(f"Error during vector store search: {str(e)}")
+             return []