anubhav77 committed on
Commit
c824142
·
1 Parent(s): d462b73
Files changed (1) hide show
  1. src/chromaIntf.py +32 -4
src/chromaIntf.py CHANGED
@@ -10,6 +10,7 @@ from datetime import datetime
10
  import baseInfra.dropbox_handler as dbh
11
  from baseInfra.dbInterface import DbInterface
12
  from uuid import UUID
 
13
 
14
 
15
  class ChromaIntf():
@@ -120,6 +121,13 @@ class ChromaIntf():
120
 
121
 
122
  async def addText(self,inStr:str,metadata):
 
 
 
 
 
 
 
123
  metadata=metadata.dict()
124
  if "timestamp" not in metadata.keys():
125
  metadata['timestamp']=datetime.now().isoformat()
@@ -128,22 +136,42 @@ class ChromaIntf():
128
  pass
129
  if "source" not in metadata.keys():
130
  metadata['source']="conversation"
 
 
 
 
 
 
 
131
  #TODO: If url is present in input or when the splitting need to be done, then we'll need to change how we
132
  # formulate the ID and may be filename to store information
133
- metadata['ID']=metadata['timestamp'].strftime("%Y-%m-%d %H-%M-%S")+"-"+metadata['source']
134
  metadata['Year']=metadata['timestamp'].year
135
  metadata['Month']=metadata['timestamp'].month
136
  metadata['Day']=int(metadata['timestamp'].strftime("%d"))
137
  metadata['Hour']=metadata['timestamp'].hour
138
  metadata['Minute']=metadata['timestamp'].minute
139
  metadata['timestamp']=metadata['timestamp'].isoformat()
 
 
140
  #md.pop("timestamp")
141
-
142
- docs = [
143
- Document(page_content=inStr, metadata=metadata)]
144
  with open("./docs/"+metadata['ID']+".txt","w") as fd:
145
  fd.write(inStr)
146
  print("written to file", inStr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  try:
148
  return await self.vectorstore.aadd_documents(docs,ids=[metadata['ID']])
149
  except:
 
10
  import baseInfra.dropbox_handler as dbh
11
  from baseInfra.dbInterface import DbInterface
12
  from uuid import UUID
13
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
14
 
15
 
16
  class ChromaIntf():
 
121
 
122
 
123
  async def addText(self,inStr:str,metadata):
124
+ # metadata expected is some of following
125
+ # timestamp --> time when added
126
+ # source --> notes/references/web/youtube/book/conversation, default conversation
127
+ # title --> of document , will be conversation when source is conversation, default blank
128
+ # author --> will default to blank
129
+
130
+ ##TODO: Preprocess inStr to remove any html, markdown tags etc.
131
  metadata=metadata.dict()
132
  if "timestamp" not in metadata.keys():
133
  metadata['timestamp']=datetime.now().isoformat()
 
136
  pass
137
  if "source" not in metadata.keys():
138
  metadata['source']="conversation"
139
+ if "title" not in metadata.keys():
140
+ metadata["title"] = ""
141
+ if metadata["source"] == "conversation":
142
+ metadata["title"] == "conversation"
143
+ if "author" not in metadata.keys():
144
+ metadata["author"] = ""
145
+
146
  #TODO: If url is present in input or when the splitting need to be done, then we'll need to change how we
147
  # formulate the ID and may be filename to store information
148
+ metadata['ID']=metadata['timestamp'].strftime("%Y-%m-%d %H-%M-%S")+"-"+metadata['title']
149
  metadata['Year']=metadata['timestamp'].year
150
  metadata['Month']=metadata['timestamp'].month
151
  metadata['Day']=int(metadata['timestamp'].strftime("%d"))
152
  metadata['Hour']=metadata['timestamp'].hour
153
  metadata['Minute']=metadata['timestamp'].minute
154
  metadata['timestamp']=metadata['timestamp'].isoformat()
155
+ print("Metadata is:")
156
+ print(metadata)
157
  #md.pop("timestamp")
 
 
 
158
  with open("./docs/"+metadata['ID']+".txt","w") as fd:
159
  fd.write(inStr)
160
  print("written to file", inStr)
161
+ text_splitter = RecursiveCharacterTextSplitter(
162
+ chunk_size=350,
163
+ chunk_overlap=30,
164
+ length_function=len,
165
+ is_separator_regex=False)
166
+ #docs = [ Document(page_content=inStr, metadata=metadata)]
167
+ docs=text_splitter.create_documents([inStr],[metadata])
168
+ partNumber=0
169
+ for doc in docs:
170
+ if partNumber > 0:
171
+ doc.metadata['ID']+=f"__{partNumber}"
172
+ partNumber+=1
173
+ print(f"{partNumber} follows:")
174
+ print(doc)
175
  try:
176
  return await self.vectorstore.aadd_documents(docs,ids=[metadata['ID']])
177
  except: