Refactor email processing: update message ID handling and improve metadata usage
Browse files
- app/controllers/mail.py +7 -9
app/controllers/mail.py
CHANGED
@@ -76,11 +76,10 @@ def list_emails(service, messages):
|
|
76 |
for message in messages:
|
77 |
msg = service.users().messages().get(userId="me", id=message["id"], format="full").execute()
|
78 |
metadata = {}
|
79 |
-
|
80 |
-
logger.info("type: %s", type(vectorstore.index_to_docstore_id.values()))
|
81 |
for docstore_id in list(vectorstore.index_to_docstore_id.values()):
|
82 |
-
if docstore_id.startswith(
|
83 |
-
logger.info("Already indexed: %s",
|
84 |
continue
|
85 |
for header in msg["payload"]["headers"]:
|
86 |
if header["name"] == "From":
|
@@ -96,7 +95,6 @@ def list_emails(service, messages):
|
|
96 |
"%d/%m/%Y %H:%M:%S"
|
97 |
)
|
98 |
metadata["user_id"] = service.users().getProfile(userId="me").execute().get("emailAddress")
|
99 |
-
metadata["msg_id"] = msg["id"]
|
100 |
# print(metadata, msg["payload"]["mimeType"])
|
101 |
ids = []
|
102 |
documents = []
|
@@ -168,7 +166,7 @@ def list_emails(service, messages):
|
|
168 |
},
|
169 |
)
|
170 |
)
|
171 |
-
ids.append(f"{
|
172 |
if os.path.exists(path):
|
173 |
os.remove(path)
|
174 |
for index, document in enumerate(attach_docs or []):
|
@@ -183,19 +181,19 @@ def list_emails(service, messages):
|
|
183 |
}
|
184 |
document.metadata.update(metadata)
|
185 |
documents.append(document)
|
186 |
-
ids.append(f"{
|
187 |
elif msg["payload"]["mimeType"] == "text/plain" and "data" in msg["payload"]["body"]:
|
188 |
body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
|
189 |
body = re.sub(r"<[^>]+>", "", body)
|
190 |
metadata["mimeType"] = msg["payload"]["mimeType"]
|
191 |
documents.append(Document(page_content=body, metadata=metadata))
|
192 |
-
ids.append(
|
193 |
elif msg["payload"]["mimeType"] == "text/html" and "data" in msg["payload"]["body"]:
|
194 |
body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
|
195 |
body = re.sub(r"<[^>]+>", "", body)
|
196 |
metadata["mimeType"] = msg["payload"]["mimeType"]
|
197 |
documents.append(Document(page_content=body, metadata=metadata))
|
198 |
-
ids.append(
|
199 |
if "multipart/alternative" in mime_types and len(mime_types) == 1:
|
200 |
print("Only multipart/alternative found in the email.")
|
201 |
else:
|
|
|
76 |
for message in messages:
|
77 |
msg = service.users().messages().get(userId="me", id=message["id"], format="full").execute()
|
78 |
metadata = {}
|
79 |
+
metadata["msg_id"] = f"{msg['threadId']}-{msg['id']}"
|
|
|
80 |
for docstore_id in list(vectorstore.index_to_docstore_id.values()):
|
81 |
+
if docstore_id.startswith(metadata["msg_id"]):
|
82 |
+
logger.info("Already indexed: %s", metadata["msg_id"])
|
83 |
continue
|
84 |
for header in msg["payload"]["headers"]:
|
85 |
if header["name"] == "From":
|
|
|
95 |
"%d/%m/%Y %H:%M:%S"
|
96 |
)
|
97 |
metadata["user_id"] = service.users().getProfile(userId="me").execute().get("emailAddress")
|
|
|
98 |
# print(metadata, msg["payload"]["mimeType"])
|
99 |
ids = []
|
100 |
documents = []
|
|
|
166 |
},
|
167 |
)
|
168 |
)
|
169 |
+
ids.append(f"{metadata["msg_id"]}_{part["filename"]}")
|
170 |
if os.path.exists(path):
|
171 |
os.remove(path)
|
172 |
for index, document in enumerate(attach_docs or []):
|
|
|
181 |
}
|
182 |
document.metadata.update(metadata)
|
183 |
documents.append(document)
|
184 |
+
ids.append(f"{metadata["msg_id"]}_{part["filename"]}_{index}")
|
185 |
elif msg["payload"]["mimeType"] == "text/plain" and "data" in msg["payload"]["body"]:
|
186 |
body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
|
187 |
body = re.sub(r"<[^>]+>", "", body)
|
188 |
metadata["mimeType"] = msg["payload"]["mimeType"]
|
189 |
documents.append(Document(page_content=body, metadata=metadata))
|
190 |
+
ids.append(metadata["msg_id"])
|
191 |
elif msg["payload"]["mimeType"] == "text/html" and "data" in msg["payload"]["body"]:
|
192 |
body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
|
193 |
body = re.sub(r"<[^>]+>", "", body)
|
194 |
metadata["mimeType"] = msg["payload"]["mimeType"]
|
195 |
documents.append(Document(page_content=body, metadata=metadata))
|
196 |
+
ids.append(metadata["msg_id"])
|
197 |
if "multipart/alternative" in mime_types and len(mime_types) == 1:
|
198 |
print("Only multipart/alternative found in the email.")
|
199 |
else:
|