Spaces:

anubhav77
/

maya-persistence

Runtime error

App Files Files Community

anubhav77 committed on Oct 8, 2023

Commit

775521b

1 Parent(s): 6a3f029

v0.8.5

Browse files

Files changed (3) hide show

src/chromaIntf.py +121 -0
src/chroma_intf.py +0 -175
src/main.py +6 -4

src/chromaIntf.py ADDED Viewed

	@@ -0,0 +1,121 @@

+from langchain.vectorstores import Chroma
+from chromadb.api.fastapi import requests
+from langchain.schema import Document
+from langchain.chains import RetrievalQA
+from langchain.embeddings import HuggingFaceBgeEmbeddings
+from langchain.retrievers.self_query.base import SelfQueryRetriever
+from langchain.chains.query_constructor.base import AttributeInfo
+from llm.llmFactory import LLMFactory
+from datetime import datetime
+import baseInfra.dropbox_handler as dbh
+from baseInfra.dbInterface import DbInterface
+class ChromeIntf():
+    def __init__(self):
+        self.db_interface=DbInterface()
+        model_name = "BAAI/bge-large-en-v1.5"
+        encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
+        embedding = HuggingFaceBgeEmbeddings(
+                model_name=model_name,
+                model_kwargs={'device': 'cpu'},
+                encode_kwargs=encode_kwargs
+        )
+        persist_directory = 'db'
+        try:
+            dbh.restoreFolder("db")
+        except:
+            print("Probably folder doesn't exist as it is brand new setup")
+        docs = [
+            Document(
+                page_content="this is test doc",
+                metadata={"timestamp":1696743148.474055,"ID":"test","source":"test"},
+                ),
+            ]
+        self.vectorstore = Chroma.from_documents(documents=docs,
+                                  embedding=embedding,
+                                  persist_directory=persist_directory)
+        self.metadata_field_info = [
+            AttributeInfo(
+                name="timestamp",
+                description="Python datetime.timestamp of the document in isoformat, can be used for getting date, year, month, time etc ",
+                type="str",
+            ),
+            AttributeInfo(
+                name="source",
+                description="Type of entry",
+                type="string or list[string]",
+            ),
+            ]
+        self.document_content_description = "Information to store for retrival from LLM based chatbot"
+        lf=LLMFactory()
+        self.llm=lf.get_llm("executor2")
+        self.retriever = SelfQueryRetriever.from_llm(
+            self.llm,
+            self.vectorstore,
+            self.document_content_description,
+            self.metadata_field_info,
+            verbose=True
+        )
+    def getRelevantDocs(self,query:str,count:int=8):
+        """This should also post the result to firebase"""
+        print("retriver state",self.retriever.search_kwargs)
+        print("retriver state",self.retriever.search_type)
+        self.retriever.search_kwargs["k"]=count
+        retVal=self.retriever.get_relevant_documents(query)
+        value=[]
+        try:
+            for item in retVal:
+                v="Info:"+item['page_content']+" "
+                for key in item.metadata.keys():
+                    if key != "ID":
+                        v+=key+":"+str(item.metadata[key])+" "
+                value.append(v)
+            self.db_interface.add_to_cache(input=query,value=value)
+        except:
+            for item in retVal:
+                v="Info:"+item.page_content+" "
+                for key in item.metadata.keys():
+                    if key != "ID":
+                        v+=key+":"+str(item.metadata[key])+" "
+                value.append(v)
+            self.db_interface.add_to_cache(input=query,value=value)
+        return retVal
+    def addText(self,inStr:str,metadata):
+        metadata=metadata.dict()
+        if "timestamp" not in metadata.keys():
+            metadata['timestamp']=datetime.now().isoformat()
+        else:
+            metadata['timestamp']=datetime.fromisoformat(metadata['timestamp'])
+            pass
+        if "source" not in metadata.keys():
+            metadata['source']="conversation"
+        metadata['ID']=metadata['timestamp'].strftime("%Y-%m-%d %H:%M:%S::%f")+"-conversation"
+        metadata['Year']=metadata['timestamp'].year
+        metadata['Month']=metadata['timestamp'].month
+        metadata['Day']=int(metadata['timestamp'].strftime("%d"))
+        metadata['Hour']=metadata['timestamp'].hour
+        metadata['Minute']=metadata['timestamp'].minute
+        #md.pop("timestamp")
+        docs = [
+            Document(page_content=inStr, metadata=metadata)]
+        try:
+            return self.vectorstore.add_documents(docs,ids=[metadata.ID])
+        except:
+            print("inside expect of addText")
+            return self.vectorstore.add_documents(docs,ids=[metadata['ID']])
+    def persist(self):
+        self.vectorstore.persist()

src/chroma_intf.py DELETED Viewed

@@ -1,175 +0,0 @@
-from langchain.vectorstores import Chroma
-from chromadb.api.fastapi import requests
-from langchain.schema import Document
-from langchain.chains import RetrievalQA
-from langchain.embeddings import HuggingFaceBgeEmbeddings
-from langchain.retrievers.self_query.base import SelfQueryRetriever
-from langchain.chains.query_constructor.base import AttributeInfo
-from llm.llmFactory import LLMFactory
-from datetime import datetime
-import baseInfra.dropbox_handler as dbh
-from baseInfra.dbInterface import DbInterface
-db_interface=DbInterface()
-model_name = "BAAI/bge-large-en-v1.5"
-encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
-embedding = HuggingFaceBgeEmbeddings(
-    model_name=model_name,
-    model_kwargs={'device': 'cpu'},
-    encode_kwargs=encode_kwargs
-)
-persist_directory = 'db'
-try:
-    dbh.restoreFolder("db")
-except:
-    print("Probably folder doesn't exist as it is brand new setup")
-docs = [
-    Document(
-        page_content="Complex, layered, rich red with dark fruit flavors",
-        metadata={"name":"Opus One", "year": 2018, "rating": 96, "grape": "Cabernet Sauvignon", "color":"red", "country":"USA"},
-    ),
-    Document(
-        page_content="Luxurious, sweet wine with flavors of honey, apricot, and peach",
-        metadata={"name":"Château d'Yquem", "year": 2015, "rating": 98, "grape": "Sémillon", "color":"white", "country":"France"},
-    ),
-    Document(
-        page_content="Full-bodied red with notes of black fruit and spice",
-        metadata={"name":"Penfolds Grange", "year": 2017, "rating": 97, "grape": "Shiraz", "color":"red", "country":"Australia"},
-    ),
-    Document(
-        page_content="Elegant, balanced red with herbal and berry nuances",
-        metadata={"name":"Sassicaia", "year": 2016, "rating": 95, "grape": "Cabernet Franc", "color":"red", "country":"Italy"},
-    ),
-    Document(
-        page_content="Highly sought-after Pinot Noir with red fruit and earthy notes",
-        metadata={"name":"Domaine de la Romanée-Conti", "year": 2018, "rating": 100, "grape": "Pinot Noir", "color":"red", "country":"France"},
-    ),
-    Document(
-        page_content="Crisp white with tropical fruit and citrus flavors",
-        metadata={"name":"Cloudy Bay", "year": 2021, "rating": 92, "grape": "Sauvignon Blanc", "color":"white", "country":"New Zealand"},
-    ),
-    Document(
-        page_content="Rich, complex Champagne with notes of brioche and citrus",
-        metadata={"name":"Krug Grande Cuvée", "year": 2010, "rating": 93, "grape": "Chardonnay blend", "color":"sparkling", "country":"New Zealand"},
-    ),
-    Document(
-        page_content="Intense, dark fruit flavors with hints of chocolate",
-        metadata={"name":"Caymus Special Selection", "year": 2018, "rating": 96, "grape": "Cabernet Sauvignon", "color":"red", "country":"USA"},
-    ),
-    Document(
-        page_content="Exotic, aromatic white with stone fruit and floral notes",
-        metadata={"name":"Jermann Vintage Tunina", "year": 2020, "rating": 91, "grape": "Sauvignon Blanc blend", "color":"white", "country":"Italy"},
-    ),
-]
-vectorstore = Chroma.from_documents(documents=docs,
-                                  embedding=embedding,
-                                  persist_directory=persist_directory)
-metadata_field_info = [
-    AttributeInfo(
-        name="grape",
-        description="The grape used to make the wine",
-        type="string or list[string]",
-    ),
-    AttributeInfo(
-        name="name",
-        description="The name of the wine",
-        type="string or list[string]",
-    ),
-    AttributeInfo(
-        name="color",
-        description="The color of the wine",
-        type="string or list[string]",
-    ),
-    AttributeInfo(
-        name="year",
-        description="The year the wine was released",
-        type="integer",
-    ),
-    AttributeInfo(
-        name="country",
-        description="The name of the country the wine comes from",
-        type="string",
-    ),
-    AttributeInfo(
-        name="rating", description="The Robert Parker rating for the wine 0-100", type="integer" #float
-    ),
-]
-document_content_description = "Brief description of the wine"
-lf=LLMFactory()
-llm=lf.get_llm("executor2")
-retriever = SelfQueryRetriever.from_llm(
-    llm,
-    vectorstore,
-    document_content_description,
-    metadata_field_info,
-    set_limit=True,
-    verbose=True
-)
-meta_defaults={
-    "timestamp":datetime.now().strftime("%Y-%m-%d %H:%M:%S::%f"),
-    "source":"conversation",
-    "ID":datetime.now().strftime("%Y-%m-%d %H:%M:%S::%f")+"-conversation"
-}
-def getRelevantDocs(query:str,count:int=8):
-    """This should also post the result to firebase"""
-    print("retriver state",retriever.search_kwargs)
-    print("retriver state",retriever.search_type)
-    retriever.search_kwargs["k"]=count
-    retVal=retriever.get_relevant_documents(query)
-    value=[]
-    try:
-        for item in retVal:
-            v="Info:"+item['page_content']+" "
-            for key in item.metadata.keys():
-                if key != "ID":
-                    v+=key+":"+str(item.metadata[key])+" "
-            value.append(v)
-        db_interface.add_to_cache(input=query,value=value)
-    except:
-        for item in retVal:
-            v="Info:"+item.page_content+" "
-            for key in item.metadata.keys():
-                if key != "ID":
-                    v+=key+":"+str(item.metadata[key])+" "
-            value.append(v)
-        db_interface.add_to_cache(input=query,value=value)
-    return retVal
-def addText(inStr:str,metadata):
-    md=meta_defaults
-    metadata=metadata.dict()
-    for key in metadata.keys():
-        md[key]=metadata[key]
-    if "timestamp" not in metadata.keys():
-        md['timestamp']=datetime.now()
-    else:
-        md['timestamp']=datetime.fromisoformat(md['timestamp'])
-    md['ID']=md['timestamp'].strftime("%Y-%m-%d %H:%M:%S::%f")+"-conversation"
-    md['Year']=md['timestamp'].year
-    md['Month']=md['timestamp'].month
-    md['Day']=int(md['timestamp'].strftime("%d"))
-    md['Hour']=md['timestamp'].hour
-    md['Minute']=md['timestamp'].minute
-    #md.pop("timestamp")
-    docs = [
-        Document(page_content=inStr, metadata=md)]
-    try:
-        return vectorstore.add_documents(docs,ids=[md.ID])
-    except:
-        print("inside expect of addText")
-        return vectorstore.add_documents(docs,ids=[md['ID']])
-def persist():
-    vectorstore.persist()

src/main.py CHANGED Viewed

@@ -13,10 +13,12 @@ from starlette.requests import Request
 from pydantic import BaseModel, Extra
 from enum import Enum
 from typing import List, Dict, Any, Generator, Optional, cast, Callable
-from chroma_intf import *
 import baseInfra.dropbox_handler as dbh
 import traceback
 class PathRequest(BaseModel):
     dir: str = "/"
@@ -64,7 +66,7 @@ async def get_matching_docs(inStr: str ) -> Any:
     TODO: Add parameter for type of query and number of docs to return
     TODO: Add parameter to return the source information as well
     """
-    return getRelevantDocs(inStr)
 @app.post(api_base+"/addTextDocument")
 async def add_text_document(inDoc: DocWithMeta ) -> Any:
@@ -73,11 +75,11 @@ async def add_text_document(inDoc: DocWithMeta ) -> Any:
     """
     print("Received request for")
     print(inDoc)
-    return addText(inDoc.text,inDoc.metadata)
 @app.get(api_base+"/persist")
 async def persist_db():
-    persist()
     return await dbh.backupFolder("db")
 @app.get(api_base+"/reset")

 from pydantic import BaseModel, Extra
 from enum import Enum
 from typing import List, Dict, Any, Generator, Optional, cast, Callable
+from chromaIntf import ChromaIntf
 import baseInfra.dropbox_handler as dbh
 import traceback
+chromaIntf=ChromaIntf()
 class PathRequest(BaseModel):
     dir: str = "/"
     TODO: Add parameter for type of query and number of docs to return
     TODO: Add parameter to return the source information as well
     """
+    return chromaIntf.getRelevantDocs(inStr)
 @app.post(api_base+"/addTextDocument")
 async def add_text_document(inDoc: DocWithMeta ) -> Any:
     """
     print("Received request for")
     print(inDoc)
+    return chromaIntf.addText(inDoc.text,inDoc.metadata)
 @app.get(api_base+"/persist")
 async def persist_db():
+    chromaIntf.persist()
     return await dbh.backupFolder("db")
 @app.get(api_base+"/reset")