gavinzli committed on
Commit
ed24eda
·
1 Parent(s): 355e0f6

Enhance email ID construction: use SHA-256 hash for file data and improve error logging for document addition

Browse files
Files changed (1) hide show
  1. app/controllers/mail.py +7 -3
app/controllers/mail.py CHANGED
@@ -2,6 +2,7 @@
2
  import os
3
  import re
4
  import base64
 
5
  from datetime import datetime, timedelta
6
  from venv import logger
7
  from ics import Calendar
@@ -166,7 +167,7 @@ def list_emails(service, messages):
166
  },
167
  )
168
  )
169
- ids.append(f"{metadata['msg_id']}_{part['filename']}")
170
  if os.path.exists(path):
171
  os.remove(path)
172
  for index, document in enumerate(attach_docs or []):
@@ -181,7 +182,7 @@ def list_emails(service, messages):
181
  }
182
  document.metadata.update(metadata)
183
  documents.append(document)
184
- ids.append(f"{metadata['msg_id']}_{part['filename']}_{index}")
185
  elif msg["payload"]["mimeType"] == "text/plain" and "data" in msg["payload"]["body"]:
186
  body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
187
  body = re.sub(r"<[^>]+>", "", body)
@@ -197,7 +198,10 @@ def list_emails(service, messages):
197
  if "multipart/alternative" in mime_types and len(mime_types) == 1:
198
  print("Only multipart/alternative found in the email.")
199
  else:
200
- vectorstore.add_documents(documents=documents, ids=ids)
 
 
 
201
 
202
 
203
  def collect(service, query=(datetime.today() - timedelta(days=10)).strftime("after:%Y/%m/%d")):
 
2
  import os
3
  import re
4
  import base64
5
+ import hashlib
6
  from datetime import datetime, timedelta
7
  from venv import logger
8
  from ics import Calendar
 
167
  },
168
  )
169
  )
170
+ ids.append(f"{metadata['msg_id']}-{part['filename']}-{hashlib.sha256(file_data).hexdigest()}")
171
  if os.path.exists(path):
172
  os.remove(path)
173
  for index, document in enumerate(attach_docs or []):
 
182
  }
183
  document.metadata.update(metadata)
184
  documents.append(document)
185
+ ids.append(f"{metadata['msg_id']}-{hashlib.sha256(file_data).hexdigest()}-{index}")
186
  elif msg["payload"]["mimeType"] == "text/plain" and "data" in msg["payload"]["body"]:
187
  body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
188
  body = re.sub(r"<[^>]+>", "", body)
 
198
  if "multipart/alternative" in mime_types and len(mime_types) == 1:
199
  print("Only multipart/alternative found in the email.")
200
  else:
201
+ try:
202
+ vectorstore.add_documents(documents=documents, ids=ids)
203
+ except Exception as e:
204
+ logger.error("Error adding documents to vectorstore: %s", e)
205
 
206
 
207
  def collect(service, query=(datetime.today() - timedelta(days=10)).strftime("after:%Y/%m/%d")):