Spaces:
Runtime error
Runtime error
v0.1.4
Browse files- src/chromaIntf.py +32 -4
src/chromaIntf.py
CHANGED
@@ -10,6 +10,7 @@ from datetime import datetime
|
|
10 |
import baseInfra.dropbox_handler as dbh
|
11 |
from baseInfra.dbInterface import DbInterface
|
12 |
from uuid import UUID
|
|
|
13 |
|
14 |
|
15 |
class ChromaIntf():
|
@@ -120,6 +121,13 @@ class ChromaIntf():
|
|
120 |
|
121 |
|
122 |
async def addText(self,inStr:str,metadata):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
metadata=metadata.dict()
|
124 |
if "timestamp" not in metadata.keys():
|
125 |
metadata['timestamp']=datetime.now().isoformat()
|
@@ -128,22 +136,42 @@ class ChromaIntf():
|
|
128 |
pass
|
129 |
if "source" not in metadata.keys():
|
130 |
metadata['source']="conversation"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
#TODO: If url is present in input or when the splitting need to be done, then we'll need to change how we
|
132 |
# formulate the ID and may be filename to store information
|
133 |
-
metadata['ID']=metadata['timestamp'].strftime("%Y-%m-%d %H-%M-%S")+"-"+metadata['
|
134 |
metadata['Year']=metadata['timestamp'].year
|
135 |
metadata['Month']=metadata['timestamp'].month
|
136 |
metadata['Day']=int(metadata['timestamp'].strftime("%d"))
|
137 |
metadata['Hour']=metadata['timestamp'].hour
|
138 |
metadata['Minute']=metadata['timestamp'].minute
|
139 |
metadata['timestamp']=metadata['timestamp'].isoformat()
|
|
|
|
|
140 |
#md.pop("timestamp")
|
141 |
-
|
142 |
-
docs = [
|
143 |
-
Document(page_content=inStr, metadata=metadata)]
|
144 |
with open("./docs/"+metadata['ID']+".txt","w") as fd:
|
145 |
fd.write(inStr)
|
146 |
print("written to file", inStr)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
try:
|
148 |
return await self.vectorstore.aadd_documents(docs,ids=[metadata['ID']])
|
149 |
except:
|
|
|
10 |
import baseInfra.dropbox_handler as dbh
|
11 |
from baseInfra.dbInterface import DbInterface
|
12 |
from uuid import UUID
|
13 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
14 |
|
15 |
|
16 |
class ChromaIntf():
|
|
|
121 |
|
122 |
|
123 |
async def addText(self,inStr:str,metadata):
|
124 |
+
# metadata is expected to contain some of the following keys:
|
125 |
+
# timestamp --> time when added
|
126 |
+
# source --> notes/references/web/youtube/book/conversation, default conversation
|
127 |
+
# title --> title of the document; will be "conversation" when source is conversation, default blank
|
128 |
+
# author --> will default to blank
|
129 |
+
|
130 |
+
##TODO: Preprocess inStr to remove any HTML tags, Markdown markup, etc.
|
131 |
metadata=metadata.dict()
|
132 |
if "timestamp" not in metadata.keys():
|
133 |
metadata['timestamp']=datetime.now().isoformat()
|
|
|
136 |
pass
|
137 |
if "source" not in metadata.keys():
|
138 |
metadata['source']="conversation"
|
139 |
+
if "title" not in metadata.keys():
|
140 |
+
metadata["title"] = ""
|
141 |
+
if metadata["source"] == "conversation":
|
142 |
+
metadata["title"] == "conversation"
|
143 |
+
if "author" not in metadata.keys():
|
144 |
+
metadata["author"] = ""
|
145 |
+
|
146 |
#TODO: If url is present in input or when the splitting need to be done, then we'll need to change how we
|
147 |
# formulate the ID and may be filename to store information
|
148 |
+
metadata['ID']=metadata['timestamp'].strftime("%Y-%m-%d %H-%M-%S")+"-"+metadata['title']
|
149 |
metadata['Year']=metadata['timestamp'].year
|
150 |
metadata['Month']=metadata['timestamp'].month
|
151 |
metadata['Day']=int(metadata['timestamp'].strftime("%d"))
|
152 |
metadata['Hour']=metadata['timestamp'].hour
|
153 |
metadata['Minute']=metadata['timestamp'].minute
|
154 |
metadata['timestamp']=metadata['timestamp'].isoformat()
|
155 |
+
print("Metadata is:")
|
156 |
+
print(metadata)
|
157 |
#md.pop("timestamp")
|
|
|
|
|
|
|
158 |
with open("./docs/"+metadata['ID']+".txt","w") as fd:
|
159 |
fd.write(inStr)
|
160 |
print("written to file", inStr)
|
161 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
162 |
+
chunk_size=350,
|
163 |
+
chunk_overlap=30,
|
164 |
+
length_function=len,
|
165 |
+
is_separator_regex=False)
|
166 |
+
#docs = [ Document(page_content=inStr, metadata=metadata)]
|
167 |
+
docs=text_splitter.create_documents([inStr],[metadata])
|
168 |
+
partNumber=0
|
169 |
+
for doc in docs:
|
170 |
+
if partNumber > 0:
|
171 |
+
doc.metadata['ID']+=f"__{partNumber}"
|
172 |
+
partNumber+=1
|
173 |
+
print(f"{partNumber} follows:")
|
174 |
+
print(doc)
|
175 |
try:
|
176 |
return await self.vectorstore.aadd_documents(docs,ids=[metadata['ID']])
|
177 |
except:
|