anubhav77 committed on
Commit
775521b
·
1 Parent(s): 6a3f029
Files changed (3) hide show
  1. src/chromaIntf.py +121 -0
  2. src/chroma_intf.py +0 -175
  3. src/main.py +6 -4
src/chromaIntf.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.vectorstores import Chroma
2
+ from chromadb.api.fastapi import requests
3
+ from langchain.schema import Document
4
+ from langchain.chains import RetrievalQA
5
+ from langchain.embeddings import HuggingFaceBgeEmbeddings
6
+ from langchain.retrievers.self_query.base import SelfQueryRetriever
7
+ from langchain.chains.query_constructor.base import AttributeInfo
8
+ from llm.llmFactory import LLMFactory
9
+ from datetime import datetime
10
+ import baseInfra.dropbox_handler as dbh
11
+ from baseInfra.dbInterface import DbInterface
12
+
13
class ChromaIntf:
    """Wrapper around a persisted Chroma vector store with a self-query retriever.

    On construction the instance:

    * creates a ``DbInterface`` used to cache retrieval results,
    * builds BGE embeddings (``BAAI/bge-large-en-v1.5``) on the CPU,
    * tries to restore the ``db`` folder through the dropbox handler,
    * opens/creates the Chroma store in ``db`` and seeds it with one
      placeholder document,
    * builds a ``SelfQueryRetriever`` driven by the ``"executor2"`` LLM.
    """

    _MODEL_NAME = "BAAI/bge-large-en-v1.5"
    _PERSIST_DIRECTORY = 'db'
    _ID_FORMAT = "%Y-%m-%d %H:%M:%S::%f"

    def __init__(self):
        self.db_interface = DbInterface()

        embedding = HuggingFaceBgeEmbeddings(
            model_name=self._MODEL_NAME,
            model_kwargs={'device': 'cpu'},
            # Normalised embeddings make the similarity score a cosine similarity.
            encode_kwargs={'normalize_embeddings': True},
        )

        # NOTE(review): main.py awaits dbh.backupFolder(); if restoreFolder is
        # also a coroutine function this call does nothing without an await --
        # confirm against baseInfra.dropbox_handler.
        try:
            dbh.restoreFolder(self._PERSIST_DIRECTORY)
        except Exception:
            # Best effort: a brand-new deployment has nothing to restore yet.
            print("Probably folder doesn't exist as it is brand new setup")

        seed_docs = [
            Document(
                page_content="this is test doc",
                metadata={"timestamp": 1696743148.474055, "ID": "test", "source": "test"},
            ),
        ]

        # A fixed id makes the placeholder document replace itself on every
        # start instead of piling up one more copy in the restored store.
        self.vectorstore = Chroma.from_documents(
            documents=seed_docs,
            embedding=embedding,
            ids=[seed_docs[0].metadata["ID"]],
            persist_directory=self._PERSIST_DIRECTORY,
        )

        self.metadata_field_info = [
            AttributeInfo(
                name="timestamp",
                description="Python datetime.timestamp of the document in isoformat, can be used for getting date, year, month, time etc ",
                type="str",
            ),
            AttributeInfo(
                name="source",
                description="Type of entry",
                type="string or list[string]",
            ),
        ]
        self.document_content_description = "Information to store for retrival from LLM based chatbot"
        self.llm = LLMFactory().get_llm("executor2")

        self.retriever = SelfQueryRetriever.from_llm(
            self.llm,
            self.vectorstore,
            self.document_content_description,
            self.metadata_field_info,
            verbose=True,
        )

    @staticmethod
    def _format_doc(item) -> str:
        """Render one retrieved document as ``"Info:<text> <key>:<value> ..."``.

        The ``ID`` metadata entry is left out of the rendered string.
        """
        fields = "".join(
            f"{key}:{item.metadata[key]} " for key in item.metadata if key != "ID"
        )
        return f"Info:{item.page_content} {fields}"

    def getRelevantDocs(self, query: str, count: int = 8):
        """Return up to ``count`` documents relevant to ``query``.

        A flattened text rendering of every hit is also handed to
        ``self.db_interface.add_to_cache`` (the original note says the result
        should be posted to firebase -- presumably that is what the cache
        does; verify against DbInterface).

        Args:
            query: Natural-language query passed to the self-query retriever.
            count: Stored as the retriever's ``k`` search argument.

        Returns:
            The documents returned by the retriever, unchanged.
        """
        print("retriver state", self.retriever.search_kwargs)
        print("retriver state", self.retriever.search_type)
        self.retriever.search_kwargs["k"] = count
        retVal = self.retriever.get_relevant_documents(query)

        # Single pass using attribute access; the cache is written exactly once.
        value = [self._format_doc(item) for item in retVal]
        self.db_interface.add_to_cache(input=query, value=value)
        return retVal

    @staticmethod
    def _build_metadata(metadata) -> dict:
        """Return the metadata dict stored alongside a new document.

        ``metadata`` may be a plain dict or any object exposing ``.dict()``
        (for example a pydantic model).  The caller's object is not mutated.

        The timestamp is taken from ``metadata['timestamp']`` when present
        (ISO-8601 string, ``datetime`` or epoch seconds) and defaults to the
        current time.  It is stored as an ISO-8601 string, and ``ID``,
        ``Year``, ``Month``, ``Day``, ``Hour`` and ``Minute`` are derived
        from it.  ``source`` defaults to ``"conversation"``.

        Raises:
            ValueError: If a string timestamp is not valid ISO-8601.
        """
        md = dict(metadata) if isinstance(metadata, dict) else metadata.dict()

        raw = md.get("timestamp")
        if raw is None:
            stamp = datetime.now()
        elif isinstance(raw, datetime):
            stamp = raw
        elif isinstance(raw, (int, float)):
            # Epoch seconds, the form used by the seed document's timestamp.
            stamp = datetime.fromtimestamp(raw)
        else:
            stamp = datetime.fromisoformat(raw)

        # Chroma metadata values must be str/int/float/bool, so keep the
        # datetime object local and store its ISO string instead.
        md['timestamp'] = stamp.isoformat()
        if md.get("source") is None:
            md['source'] = "conversation"
        # The "-conversation" suffix is kept for every source so that ids stay
        # in the format already used for previously stored documents.
        md['ID'] = stamp.strftime(ChromaIntf._ID_FORMAT) + "-conversation"
        md['Year'] = stamp.year
        md['Month'] = stamp.month
        md['Day'] = stamp.day
        md['Hour'] = stamp.hour
        md['Minute'] = stamp.minute
        return md

    def addText(self, inStr: str, metadata):
        """Store ``inStr`` as one document and return the vector store's result.

        Args:
            inStr: Text content of the document.
            metadata: Dict or object with ``.dict()``; see ``_build_metadata``
                for the fields that are filled in or derived.

        Returns:
            Whatever ``self.vectorstore.add_documents`` returns.
        """
        doc_metadata = self._build_metadata(metadata)
        docs = [Document(page_content=inStr, metadata=doc_metadata)]
        return self.vectorstore.add_documents(docs, ids=[doc_metadata['ID']])

    def persist(self):
        """Ask the vector store to persist its current state."""
        self.vectorstore.persist()


# Backward-compatible alias for the original (misspelled) class name.
ChromeIntf = ChromaIntf
120
+
121
+
src/chroma_intf.py DELETED
@@ -1,175 +0,0 @@
1
- from langchain.vectorstores import Chroma
2
- from chromadb.api.fastapi import requests
3
- from langchain.schema import Document
4
- from langchain.chains import RetrievalQA
5
- from langchain.embeddings import HuggingFaceBgeEmbeddings
6
- from langchain.retrievers.self_query.base import SelfQueryRetriever
7
- from langchain.chains.query_constructor.base import AttributeInfo
8
- from llm.llmFactory import LLMFactory
9
- from datetime import datetime
10
- import baseInfra.dropbox_handler as dbh
11
- from baseInfra.dbInterface import DbInterface
12
-
13
- db_interface=DbInterface()
14
-
15
- model_name = "BAAI/bge-large-en-v1.5"
16
- encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
17
-
18
- embedding = HuggingFaceBgeEmbeddings(
19
- model_name=model_name,
20
- model_kwargs={'device': 'cpu'},
21
- encode_kwargs=encode_kwargs
22
- )
23
-
24
- persist_directory = 'db'
25
- try:
26
- dbh.restoreFolder("db")
27
- except:
28
- print("Probably folder doesn't exist as it is brand new setup")
29
- docs = [
30
- Document(
31
- page_content="Complex, layered, rich red with dark fruit flavors",
32
- metadata={"name":"Opus One", "year": 2018, "rating": 96, "grape": "Cabernet Sauvignon", "color":"red", "country":"USA"},
33
- ),
34
- Document(
35
- page_content="Luxurious, sweet wine with flavors of honey, apricot, and peach",
36
- metadata={"name":"Château d'Yquem", "year": 2015, "rating": 98, "grape": "Sémillon", "color":"white", "country":"France"},
37
- ),
38
- Document(
39
- page_content="Full-bodied red with notes of black fruit and spice",
40
- metadata={"name":"Penfolds Grange", "year": 2017, "rating": 97, "grape": "Shiraz", "color":"red", "country":"Australia"},
41
- ),
42
- Document(
43
- page_content="Elegant, balanced red with herbal and berry nuances",
44
- metadata={"name":"Sassicaia", "year": 2016, "rating": 95, "grape": "Cabernet Franc", "color":"red", "country":"Italy"},
45
- ),
46
- Document(
47
- page_content="Highly sought-after Pinot Noir with red fruit and earthy notes",
48
- metadata={"name":"Domaine de la Romanée-Conti", "year": 2018, "rating": 100, "grape": "Pinot Noir", "color":"red", "country":"France"},
49
- ),
50
- Document(
51
- page_content="Crisp white with tropical fruit and citrus flavors",
52
- metadata={"name":"Cloudy Bay", "year": 2021, "rating": 92, "grape": "Sauvignon Blanc", "color":"white", "country":"New Zealand"},
53
- ),
54
- Document(
55
- page_content="Rich, complex Champagne with notes of brioche and citrus",
56
- metadata={"name":"Krug Grande Cuvée", "year": 2010, "rating": 93, "grape": "Chardonnay blend", "color":"sparkling", "country":"New Zealand"},
57
- ),
58
- Document(
59
- page_content="Intense, dark fruit flavors with hints of chocolate",
60
- metadata={"name":"Caymus Special Selection", "year": 2018, "rating": 96, "grape": "Cabernet Sauvignon", "color":"red", "country":"USA"},
61
- ),
62
- Document(
63
- page_content="Exotic, aromatic white with stone fruit and floral notes",
64
- metadata={"name":"Jermann Vintage Tunina", "year": 2020, "rating": 91, "grape": "Sauvignon Blanc blend", "color":"white", "country":"Italy"},
65
- ),
66
- ]
67
-
68
- vectorstore = Chroma.from_documents(documents=docs,
69
- embedding=embedding,
70
- persist_directory=persist_directory)
71
-
72
- metadata_field_info = [
73
- AttributeInfo(
74
- name="grape",
75
- description="The grape used to make the wine",
76
- type="string or list[string]",
77
- ),
78
- AttributeInfo(
79
- name="name",
80
- description="The name of the wine",
81
- type="string or list[string]",
82
- ),
83
- AttributeInfo(
84
- name="color",
85
- description="The color of the wine",
86
- type="string or list[string]",
87
- ),
88
- AttributeInfo(
89
- name="year",
90
- description="The year the wine was released",
91
- type="integer",
92
- ),
93
- AttributeInfo(
94
- name="country",
95
- description="The name of the country the wine comes from",
96
- type="string",
97
- ),
98
- AttributeInfo(
99
- name="rating", description="The Robert Parker rating for the wine 0-100", type="integer" #float
100
- ),
101
- ]
102
- document_content_description = "Brief description of the wine"
103
- lf=LLMFactory()
104
- llm=lf.get_llm("executor2")
105
-
106
- retriever = SelfQueryRetriever.from_llm(
107
- llm,
108
- vectorstore,
109
- document_content_description,
110
- metadata_field_info,
111
- set_limit=True,
112
- verbose=True
113
- )
114
-
115
- meta_defaults={
116
- "timestamp":datetime.now().strftime("%Y-%m-%d %H:%M:%S::%f"),
117
- "source":"conversation",
118
- "ID":datetime.now().strftime("%Y-%m-%d %H:%M:%S::%f")+"-conversation"
119
- }
120
-
121
- def getRelevantDocs(query:str,count:int=8):
122
- """This should also post the result to firebase"""
123
- print("retriver state",retriever.search_kwargs)
124
- print("retriver state",retriever.search_type)
125
- retriever.search_kwargs["k"]=count
126
- retVal=retriever.get_relevant_documents(query)
127
- value=[]
128
- try:
129
- for item in retVal:
130
- v="Info:"+item['page_content']+" "
131
- for key in item.metadata.keys():
132
- if key != "ID":
133
- v+=key+":"+str(item.metadata[key])+" "
134
- value.append(v)
135
- db_interface.add_to_cache(input=query,value=value)
136
- except:
137
- for item in retVal:
138
- v="Info:"+item.page_content+" "
139
- for key in item.metadata.keys():
140
- if key != "ID":
141
- v+=key+":"+str(item.metadata[key])+" "
142
- value.append(v)
143
- db_interface.add_to_cache(input=query,value=value)
144
- return retVal
145
-
146
-
147
- def addText(inStr:str,metadata):
148
- md=meta_defaults
149
- metadata=metadata.dict()
150
- for key in metadata.keys():
151
- md[key]=metadata[key]
152
- if "timestamp" not in metadata.keys():
153
- md['timestamp']=datetime.now()
154
- else:
155
- md['timestamp']=datetime.fromisoformat(md['timestamp'])
156
- md['ID']=md['timestamp'].strftime("%Y-%m-%d %H:%M:%S::%f")+"-conversation"
157
- md['Year']=md['timestamp'].year
158
- md['Month']=md['timestamp'].month
159
- md['Day']=int(md['timestamp'].strftime("%d"))
160
- md['Hour']=md['timestamp'].hour
161
- md['Minute']=md['timestamp'].minute
162
- #md.pop("timestamp")
163
-
164
- docs = [
165
- Document(page_content=inStr, metadata=md)]
166
- try:
167
- return vectorstore.add_documents(docs,ids=[md.ID])
168
- except:
169
- print("inside expect of addText")
170
- return vectorstore.add_documents(docs,ids=[md['ID']])
171
-
172
- def persist():
173
- vectorstore.persist()
174
-
175
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/main.py CHANGED
@@ -13,10 +13,12 @@ from starlette.requests import Request
13
  from pydantic import BaseModel, Extra
14
  from enum import Enum
15
  from typing import List, Dict, Any, Generator, Optional, cast, Callable
16
- from chroma_intf import *
17
  import baseInfra.dropbox_handler as dbh
18
  import traceback
19
 
 
 
20
  class PathRequest(BaseModel):
21
  dir: str = "/"
22
 
@@ -64,7 +66,7 @@ async def get_matching_docs(inStr: str ) -> Any:
64
  TODO: Add parameter for type of query and number of docs to return
65
  TODO: Add parameter to return the source information as well
66
  """
67
- return getRelevantDocs(inStr)
68
 
69
  @app.post(api_base+"/addTextDocument")
70
  async def add_text_document(inDoc: DocWithMeta ) -> Any:
@@ -73,11 +75,11 @@ async def add_text_document(inDoc: DocWithMeta ) -> Any:
73
  """
74
  print("Received request for")
75
  print(inDoc)
76
- return addText(inDoc.text,inDoc.metadata)
77
 
78
  @app.get(api_base+"/persist")
79
  async def persist_db():
80
- persist()
81
  return await dbh.backupFolder("db")
82
 
83
  @app.get(api_base+"/reset")
 
13
  from pydantic import BaseModel, Extra
14
  from enum import Enum
15
  from typing import List, Dict, Any, Generator, Optional, cast, Callable
16
+ from chromaIntf import ChromaIntf
17
  import baseInfra.dropbox_handler as dbh
18
  import traceback
19
 
20
+ chromaIntf=ChromaIntf()
21
+
22
  class PathRequest(BaseModel):
23
  dir: str = "/"
24
 
 
66
  TODO: Add parameter for type of query and number of docs to return
67
  TODO: Add parameter to return the source information as well
68
  """
69
+ return chromaIntf.getRelevantDocs(inStr)
70
 
71
  @app.post(api_base+"/addTextDocument")
72
  async def add_text_document(inDoc: DocWithMeta ) -> Any:
 
75
  """
76
  print("Received request for")
77
  print(inDoc)
78
+ return chromaIntf.addText(inDoc.text,inDoc.metadata)
79
 
80
  @app.get(api_base+"/persist")
81
  async def persist_db():
82
+ chromaIntf.persist()
83
  return await dbh.backupFolder("db")
84
 
85
  @app.get(api_base+"/reset")