Spaces:

anubhav77
/

maya-persistence

Runtime error

App Files Files Community

anubhav77 committed on Jan 6, 2024

Commit

c824142

1 Parent(s): d462b73

v0.1.4

Browse files

Files changed (1) hide show

src/chromaIntf.py +32 -4

src/chromaIntf.py CHANGED Viewed

@@ -10,6 +10,7 @@ from datetime import datetime
 import baseInfra.dropbox_handler as dbh
 from baseInfra.dbInterface import DbInterface
 from uuid import UUID
 class ChromaIntf():
@@ -120,6 +121,13 @@ class ChromaIntf():
     async def addText(self,inStr:str,metadata):
         metadata=metadata.dict()
         if "timestamp" not in metadata.keys():
             metadata['timestamp']=datetime.now().isoformat()
@@ -128,22 +136,42 @@ class ChromaIntf():
             pass
         if "source" not in metadata.keys():
             metadata['source']="conversation"
         #TODO: If a URL is present in the input, or when splitting needs to be done, we'll need to change how we
         # formulate the ID, and maybe the filename, used to store the information
-        metadata['ID']=metadata['timestamp'].strftime("%Y-%m-%d %H-%M-%S")+"-"+metadata['source']
         metadata['Year']=metadata['timestamp'].year
         metadata['Month']=metadata['timestamp'].month
         metadata['Day']=int(metadata['timestamp'].strftime("%d"))
         metadata['Hour']=metadata['timestamp'].hour
         metadata['Minute']=metadata['timestamp'].minute
         metadata['timestamp']=metadata['timestamp'].isoformat()
         #md.pop("timestamp")
-        docs = [
-            Document(page_content=inStr, metadata=metadata)]
         with open("./docs/"+metadata['ID']+".txt","w") as fd:
             fd.write(inStr)
             print("written to file", inStr)
         try:
             return await self.vectorstore.aadd_documents(docs,ids=[metadata['ID']])
         except:

 import baseInfra.dropbox_handler as dbh
 from baseInfra.dbInterface import DbInterface
 from uuid import UUID
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 class ChromaIntf():
     async def addText(self,inStr:str,metadata):
+        # metadata is expected to contain some of the following keys:
+        # timestamp --> time when the text was added
+        # source --> notes/references/web/youtube/book/conversation; defaults to "conversation"
+        # title --> title of the document; meant to be "conversation" when source is "conversation"; defaults to blank
+        # author --> defaults to blank
+        ##TODO: Preprocess inStr to remove any HTML or Markdown tags, etc.
         metadata=metadata.dict()
         if "timestamp" not in metadata.keys():
             metadata['timestamp']=datetime.now().isoformat()
             pass
         if "source" not in metadata.keys():
             metadata['source']="conversation"
+        if "title" not in metadata.keys():
+            metadata["title"] = ""
+        if  metadata["source"] == "conversation":
+            metadata["title"] == "conversation"
+        if "author" not in metadata.keys():
+            metadata["author"] = ""
         #TODO: If a URL is present in the input, or when splitting needs to be done, we'll need to change how we
         # formulate the ID, and maybe the filename, used to store the information
+        metadata['ID']=metadata['timestamp'].strftime("%Y-%m-%d %H-%M-%S")+"-"+metadata['title']
         metadata['Year']=metadata['timestamp'].year
         metadata['Month']=metadata['timestamp'].month
         metadata['Day']=int(metadata['timestamp'].strftime("%d"))
         metadata['Hour']=metadata['timestamp'].hour
         metadata['Minute']=metadata['timestamp'].minute
         metadata['timestamp']=metadata['timestamp'].isoformat()
+        print("Metadata is:")
+        print(metadata)
         #md.pop("timestamp")
         with open("./docs/"+metadata['ID']+".txt","w") as fd:
             fd.write(inStr)
             print("written to file", inStr)
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=350,
+            chunk_overlap=30,
+            length_function=len,
+            is_separator_regex=False)
+        #docs = [    Document(page_content=inStr, metadata=metadata)]
+        docs=text_splitter.create_documents([inStr],[metadata])
+        partNumber=0
+        for doc in docs:
+            if partNumber > 0:
+                doc.metadata['ID']+=f"__{partNumber}"
+            partNumber+=1
+            print(f"{partNumber} follows:")
+            print(doc)
         try:
             return await self.vectorstore.aadd_documents(docs,ids=[metadata['ID']])
         except: