from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.chroma import ChromaTranslator
from llm.llmFactory import LLMFactory
from datetime import datetime
import baseInfra.dropbox_handler as dbh
from baseInfra.dbInterface import DbInterface
from uuid import UUID
from langchain.text_splitter import RecursiveCharacterTextSplitter
import logging
import asyncio

logger=logging.getLogger("root")

class myChromaTranslator(ChromaTranslator):
    """Translator limited to the logical operators and comparators Chroma supports."""
    allowed_operators = ["$and", "$or"]
    allowed_comparators = ["$eq", "$ne", "$gt", "$gte", "$lt", "$lte",
                           "$contains", "$not_contains", "$in", "$nin"]

class ChromaIntf():
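    """Wrapper around a Chroma vector store with LLM-driven self-query retrieval.

    Embeddings come from BAAI/bge-large-en-v1.5; the database and raw document
    files are restored from and backed up to Dropbox via baseInfra.dropbox_handler.
    """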
    def __init__(self):
        self.db_interface=DbInterface()

        model_name = "BAAI/bge-large-en-v1.5"
        encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

        self.embedding = HuggingFaceBgeEmbeddings(
                model_name=model_name,
                model_kwargs={'device': 'cpu'},  
                encode_kwargs=encode_kwargs
        )

        self.persist_db_directory = 'db'
        self.persist_docs_directory = "persistence-docs"
        self.logger_file = "persistence.log"
        loop = asyncio.get_event_loop()
        try:
            loop.run_until_complete(dbh.restoreFolder(self.persist_db_directory))
            loop.run_until_complete(dbh.restoreFolder(self.persist_docs_directory))
        except Exception:
            logger.warning("Restore skipped; folders probably don't exist yet on a brand-new setup")
        docs = [
            Document(
                page_content="this is test doc",
                metadata={"timestamp":1696743148.474055,"ID":"2000-01-01 15:57:11::664165-test","source":"test"},
                id="2000-01-01 15:57:11::664165-test"
                ),
            ]

        self.vectorstore = Chroma.from_documents(documents=docs,
                                  embedding=self.embedding,
                                  persist_directory=self.persist_db_directory)

        # Metadata schema:
        #   timestamp --> time when added
        #   source    --> notes/references/web/youtube/book/conversation; default "conversation"
        #   title     --> title of the document ("conversation" when source is conversation); default blank
        #   author    --> defaults to blank
        #   Year/Month/Day/Hour/Minute --> integer components derived from the timestamp,
        #                                  e.g. Year 2024, Month 1, Day 3, Hour 11, Minute 29
        self.metadata_field_info = [
            AttributeInfo(
                name="timestamp",
                description="Python datetime.timestamp of the document in isoformat, should not be used for query",
                type="str",
            ),
            AttributeInfo(
                name="Year",
                description="Year from the date when the entry was added in YYYY format",
                type="int",
            ),
            AttributeInfo(
                name="Month",
                description="Month from the date when the entry was added it is from 1-12",
                type="int",
            ),
            AttributeInfo(
                name="Day",
                description="Day of month from the date-time stamp  when the entry was added, it is from 1-31",
                type="int",
            ),
            AttributeInfo(
                name="Hour",
                description="Hour from the timestamp when the entry was added",
                type="int",
            ),
            AttributeInfo(
                name="Minute",
                description="Minute from the timestamp when the entry was added",
                type="int",
            ),
            AttributeInfo(
                name="source",
                description="Type of entry",
                type="string or list[string]",
            ),
            AttributeInfo(
                name="title",
                description="Title or Subject of the entry",
                type="string",
            ),
            AttributeInfo(
                name="author",
                description="Author of the entry",
                type="string",
            )
            ]
        self.document_content_description = "Information to store for retrieval by an LLM-based chatbot"
        lf=LLMFactory()
        #self.llm=lf.get_llm("executor2")
        self.llm=lf.get_llm("executor3")

        # Note: the stock ChromaTranslator is passed here; the extended
        # myChromaTranslator defined above is available but not wired in.
        self.retriever = SelfQueryRetriever.from_llm(
            self.llm,
            self.vectorstore,
            self.document_content_description,
            self.metadata_field_info,
            structured_query_translator=ChromaTranslator(),
            verbose=True
        )


    async def getRelevantDocs(self,query:str,kwargs:dict):
        """This should also post the result to firebase"""
        print("retriver state",self.retriever.search_kwargs)
        print("retriver state",self.retriever.search_type)
        try:
            for key in kwargs.keys():
                if "search_type" in key:
                    self.retriever.search_type=kwargs[key]
                else:
                    self.retriever.search_kwargs[key]=kwargs[key]
        except:
            print("setting search args failed")
        print("reaching step2")
        try:
            #loop=asyncio.get_event_loop()
            retVal=self.retriever.get_relevant_documents(query)
        except Exception as ex:
            logger.exception("Exception occured:",exc_info=True)
        value=[]
        excludeMeta=True
        print("reaching step3")
        print(str(len(retVal)))
        print("reaching step4")
        try:
            for item in retVal:
                if excludeMeta:
                    v=item.page_content+" \n"
                else:
                    v="Info:"+item.page_content+" "
                    for key in item.metadata.keys():
                        if key != "ID":
                            v+=key+":"+str(item.metadata[key])+" "
                value.append(v)
            print("reaching step5")
            self.db_interface.add_to_cache(input=query,value=value)
        except:
            print("reaching step6")
            for item in retVal:
                if excludeMeta:
                    v=item['page_content']+" \n"
                else:
                    v="Info:"+item['page_content']+" "
                    for key in item['metadata'].keys():
                        if key != "ID":
                            v+=key+":"+str(item['metadata'][key])+" "
                value.append(v)
            print("reaching step7")
            self.db_interface.add_to_cache(input=query,value=value)
        print("reaching step8")
        return retVal
    

    async def addText(self,inStr:str,metadata):
        """Split `inStr` into chunks and add them to the vector store.

        Expected metadata fields (all optional):
          timestamp -- time when added (isoformat string); defaults to now
          source    -- notes/references/web/youtube/book/conversation; defaults to "conversation"
          title     -- title of the document ("conversation" when source is conversation); defaults to blank
          author    -- defaults to blank
        """

        # TODO: Preprocess inStr to remove any html, markdown tags etc.
        metadata = metadata.dict()
        if "timestamp" not in metadata.keys():
            # Keep this as a datetime; the strftime/.year accesses below need it.
            metadata['timestamp'] = datetime.now()
        else:
            metadata['timestamp'] = datetime.fromisoformat(metadata['timestamp'])
        if "source" not in metadata.keys():
            metadata['source'] = "conversation"
        if "title" not in metadata.keys():
            metadata["title"] = ""
        if metadata["source"] == "conversation":
            metadata["title"] = "conversation"
        if "author" not in metadata.keys():
            metadata["author"] = ""
        
        # TODO: If a url is present in the input, or when splitting needs to be done,
        # we'll need to change how we formulate the ID and maybe the filename used to store information.
        metadata['ID']=metadata['timestamp'].strftime("%Y-%m-%d %H-%M-%S")+"-"+metadata['title']
        metadata['Year']=metadata['timestamp'].year
        metadata['Month']=metadata['timestamp'].month
        metadata['Day']=metadata['timestamp'].day
        metadata['Hour']=metadata['timestamp'].hour
        metadata['Minute']=metadata['timestamp'].minute
        metadata['timestamp']=metadata['timestamp'].isoformat()
        print("Metadata is:")
        print(metadata)
        #md.pop("timestamp")
        with open("./docs/"+metadata['ID']+".txt","w") as fd:
            fd.write(inStr)
            print("written to file", inStr)
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            chunk_overlap=50,
            length_function=len,
            is_separator_regex=False)
        #docs = [    Document(page_content=inStr, metadata=metadata)]
        docs=text_splitter.create_documents([inStr],[metadata])
        partNumber = 0
        for doc in docs:
            # First chunk keeps the original ID; later chunks get a __N suffix.
            if partNumber > 0:
                doc.metadata['ID'] += f"__{partNumber}"
            partNumber += 1
            logger.debug("Part %d follows: %s", partNumber, doc)
        try:
            ids = [doc.metadata['ID'] for doc in docs]
            logger.debug("ids are: %s", ids)
            return await self.vectorstore.aadd_documents(docs, ids=ids)
        except Exception:
            logger.exception("Exception in adding documents")
            # Fall back to adding everything under the base document ID.
            return await self.vectorstore.aadd_documents(docs, ids=[metadata['ID']])
        
    async def listDocs(self):
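        """Return the raw contents of the default Chroma collection."""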
        collection=self.vectorstore._client.get_collection(self.vectorstore._LANGCHAIN_DEFAULT_COLLECTION_NAME,embedding_function=self.embedding)
        return collection.get()
        
        
    async def persist(self):
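        """Flush the vector store to disk and back up db, docs and log to Dropbox."""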
        self.vectorstore.persist()
        await dbh.backupFile(self.logger_file)
        await dbh.backupFolder(self.persist_db_directory)
        return await dbh.backupFolder(self.persist_docs_directory)
    
    def _uuid(self,uuid_str: str) -> UUID:
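        """Parse `uuid_str` into a UUID, raising ValueError if it is malformed."""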
        try:
            return UUID(uuid_str)
        except ValueError:
            logger.error("Error parsing uuid: %s", uuid_str)
            raise ValueError(f"Could not parse {uuid_str} as a UUID")
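

if __name__ == "__main__":
    # Minimal usage sketch, not a definitive test: it assumes the repo's llm/
    # and baseInfra/ packages are importable and configured (LLM endpoint,
    # Dropbox credentials, database), and that a ./docs directory exists.
    # _DemoMetadata is a hypothetical payload; addText only needs a .dict().
    from pydantic import BaseModel

    class _DemoMetadata(BaseModel):
        source: str = "notes"
        title: str = "demo"

    intf = ChromaIntf()

    async def _demo():
        await intf.addText("This is a short demo note.", _DemoMetadata())
        docs = await intf.getRelevantDocs("demo note", {"k": 2})
        logger.info("Retrieved %d documents", len(docs))
        await intf.persist()

    asyncio.run(_demo())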