gavinzli committed on
Commit
4016c6f
·
1 Parent(s): 5edd2db

Refactor email processing: update message ID handling and improve metadata usage

Browse files
Files changed (1) hide show
  1. app/controllers/mail.py +7 -9
app/controllers/mail.py CHANGED
@@ -76,11 +76,10 @@ def list_emails(service, messages):
76
  for message in messages:
77
  msg = service.users().messages().get(userId="me", id=message["id"], format="full").execute()
78
  metadata = {}
79
- logger.info("vectorstore.index_to_docstore_id: %s", list(vectorstore.index_to_docstore_id.values()))
80
- logger.info("type: %s", type(vectorstore.index_to_docstore_id.values()))
81
  for docstore_id in list(vectorstore.index_to_docstore_id.values()):
82
- if docstore_id.startswith(message["id"]):
83
- logger.info("Already indexed: %s", message["id"])
84
  continue
85
  for header in msg["payload"]["headers"]:
86
  if header["name"] == "From":
@@ -96,7 +95,6 @@ def list_emails(service, messages):
96
  "%d/%m/%Y %H:%M:%S"
97
  )
98
  metadata["user_id"] = service.users().getProfile(userId="me").execute().get("emailAddress")
99
- metadata["msg_id"] = msg["id"]
100
  # print(metadata, msg["payload"]["mimeType"])
101
  ids = []
102
  documents = []
@@ -168,7 +166,7 @@ def list_emails(service, messages):
168
  },
169
  )
170
  )
171
- ids.append(f"{msg['id']}_{attachment_id}")
172
  if os.path.exists(path):
173
  os.remove(path)
174
  for index, document in enumerate(attach_docs or []):
@@ -183,19 +181,19 @@ def list_emails(service, messages):
183
  }
184
  document.metadata.update(metadata)
185
  documents.append(document)
186
- ids.append(f"{msg['id']}_{attachment_id}_{index}")
187
  elif msg["payload"]["mimeType"] == "text/plain" and "data" in msg["payload"]["body"]:
188
  body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
189
  body = re.sub(r"<[^>]+>", "", body)
190
  metadata["mimeType"] = msg["payload"]["mimeType"]
191
  documents.append(Document(page_content=body, metadata=metadata))
192
- ids.append(msg["id"])
193
  elif msg["payload"]["mimeType"] == "text/html" and "data" in msg["payload"]["body"]:
194
  body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
195
  body = re.sub(r"<[^>]+>", "", body)
196
  metadata["mimeType"] = msg["payload"]["mimeType"]
197
  documents.append(Document(page_content=body, metadata=metadata))
198
- ids.append(msg["id"])
199
  if "multipart/alternative" in mime_types and len(mime_types) == 1:
200
  print("Only multipart/alternative found in the email.")
201
  else:
 
76
  for message in messages:
77
  msg = service.users().messages().get(userId="me", id=message["id"], format="full").execute()
78
  metadata = {}
79
+ metadata["msg_id"] = f"{msg['threadId']}-{msg['id']}"
 
80
  for docstore_id in list(vectorstore.index_to_docstore_id.values()):
81
+ if docstore_id.startswith(metadata["msg_id"]):
82
+ logger.info("Already indexed: %s", metadata["msg_id"])
83
  continue
84
  for header in msg["payload"]["headers"]:
85
  if header["name"] == "From":
 
95
  "%d/%m/%Y %H:%M:%S"
96
  )
97
  metadata["user_id"] = service.users().getProfile(userId="me").execute().get("emailAddress")
 
98
  # print(metadata, msg["payload"]["mimeType"])
99
  ids = []
100
  documents = []
 
166
  },
167
  )
168
  )
169
+ ids.append(f"{metadata["msg_id"]}_{part["filename"]}")
170
  if os.path.exists(path):
171
  os.remove(path)
172
  for index, document in enumerate(attach_docs or []):
 
181
  }
182
  document.metadata.update(metadata)
183
  documents.append(document)
184
+ ids.append(f"{metadata["msg_id"]}_{part["filename"]}_{index}")
185
  elif msg["payload"]["mimeType"] == "text/plain" and "data" in msg["payload"]["body"]:
186
  body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
187
  body = re.sub(r"<[^>]+>", "", body)
188
  metadata["mimeType"] = msg["payload"]["mimeType"]
189
  documents.append(Document(page_content=body, metadata=metadata))
190
+ ids.append(metadata["msg_id"])
191
  elif msg["payload"]["mimeType"] == "text/html" and "data" in msg["payload"]["body"]:
192
  body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
193
  body = re.sub(r"<[^>]+>", "", body)
194
  metadata["mimeType"] = msg["payload"]["mimeType"]
195
  documents.append(Document(page_content=body, metadata=metadata))
196
+ ids.append(metadata["msg_id"])
197
  if "multipart/alternative" in mime_types and len(mime_types) == 1:
198
  print("Only multipart/alternative found in the email.")
199
  else: