Enhance email processing in mail.py to support additional file types, including Excel and ICS; update document handling and metadata extraction; modify collect function to search for specific email subjects.
Browse files
- controllers/mail.py +102 -31
- test.py +1 -1
- token.pickle +0 -0
controllers/mail.py
CHANGED
@@ -4,11 +4,13 @@ import re
|
|
4 |
import base64
|
5 |
from datetime import datetime, timedelta
|
6 |
from venv import logger
|
|
|
7 |
|
8 |
import pandas as pd
|
9 |
from langchain_core.documents import Document
|
10 |
from langchain_community.document_loaders import PyPDFLoader
|
11 |
from langchain_community.document_loaders.image import UnstructuredImageLoader
|
|
|
12 |
from langchain_community.document_loaders.csv_loader import CSVLoader
|
13 |
|
14 |
from models.chroma import vectorstore
|
@@ -54,12 +56,36 @@ def list_emails(messages):
|
|
54 |
metadata['cc'] = header['value']
|
55 |
metadata['date'] = datetime.fromtimestamp(
|
56 |
int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
|
57 |
-
|
58 |
-
print(
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
62 |
for part in msg['payload']['parts']:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
if part['filename']:
|
64 |
attachment_id = part['body']['attachmentId']
|
65 |
logger.info("Downloading attachment: %s", part['filename'])
|
@@ -69,37 +95,78 @@ def list_emails(messages):
|
|
69 |
path = os.path.join(".", ATTACHMENTS_DIR, part['filename'])
|
70 |
with open(path, 'wb') as f:
|
71 |
f.write(file_data)
|
72 |
-
if part['
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
|
95 |
-
body = re.sub(r'<[^>]+>', '', body)
|
|
|
96 |
documents.append(Document(
|
97 |
page_content=body,
|
98 |
metadata=metadata
|
99 |
))
|
100 |
ids.append(msg['id'])
|
101 |
-
|
102 |
-
print("
|
|
|
|
|
103 |
vectorstore.add_documents(documents=documents, ids=ids)
|
104 |
|
105 |
def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
|
@@ -113,10 +180,14 @@ def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%
|
|
113 |
Returns:
|
114 |
None
|
115 |
"""
|
116 |
-
|
|
|
|
|
117 |
if emails:
|
118 |
print("Found %d emails:\n", len(emails))
|
119 |
logger.info("Found %d emails after two_weeks_ago:\n", len(emails))
|
|
|
|
|
120 |
return f"{len(emails)} emails added to the collection."
|
121 |
else:
|
122 |
logger.info("No emails found after two weeks ago.")
|
|
|
4 |
import base64
|
5 |
from datetime import datetime, timedelta
|
6 |
from venv import logger
|
7 |
+
from ics import Calendar
|
8 |
|
9 |
import pandas as pd
|
10 |
from langchain_core.documents import Document
|
11 |
from langchain_community.document_loaders import PyPDFLoader
|
12 |
from langchain_community.document_loaders.image import UnstructuredImageLoader
|
13 |
+
from langchain_community.document_loaders import UnstructuredExcelLoader
|
14 |
from langchain_community.document_loaders.csv_loader import CSVLoader
|
15 |
|
16 |
from models.chroma import vectorstore
|
|
|
56 |
metadata['cc'] = header['value']
|
57 |
metadata['date'] = datetime.fromtimestamp(
|
58 |
int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
|
59 |
+
metadata['msg_id'] = msg['id']
|
60 |
+
print(metadata)
|
61 |
+
print(msg['payload']['mimeType'])
|
62 |
+
# body = ""
|
63 |
+
ids = []
|
64 |
+
documents = []
|
65 |
+
if msg['payload']['mimeType'] in ['multipart/alternative', 'multipart/related', 'multipart/mixed']:
|
66 |
+
minetype = []
|
67 |
+
# attach_docs = []
|
68 |
for part in msg['payload']['parts']:
|
69 |
+
print("minetype: ", part['mimeType'])
|
70 |
+
minetype.append(part['mimeType'])
|
71 |
+
if part['mimeType'] == 'text/plain' and 'text/html' not in minetype:
|
72 |
+
body = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
|
73 |
+
body = re.sub(r'<[^>]+>', '', body) # Remove HTML tags
|
74 |
+
metadata['minetype'] = part['mimeType']
|
75 |
+
documents.append(Document(
|
76 |
+
page_content=body,
|
77 |
+
metadata=metadata
|
78 |
+
))
|
79 |
+
ids.append(msg['id'])
|
80 |
+
elif part['mimeType'] == 'text/html' and 'text/plain' not in minetype:
|
81 |
+
body = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
|
82 |
+
body = re.sub(r'<[^>]+>', '', body)
|
83 |
+
metadata['minetype'] = part['mimeType']
|
84 |
+
documents.append(Document(
|
85 |
+
page_content=body,
|
86 |
+
metadata=metadata
|
87 |
+
))
|
88 |
+
ids.append(msg['id'])
|
89 |
if part['filename']:
|
90 |
attachment_id = part['body']['attachmentId']
|
91 |
logger.info("Downloading attachment: %s", part['filename'])
|
|
|
95 |
path = os.path.join(".", ATTACHMENTS_DIR, part['filename'])
|
96 |
with open(path, 'wb') as f:
|
97 |
f.write(file_data)
|
98 |
+
if part['mimeType'] == 'application/pdf':
|
99 |
+
# attach_docs = attach_docs + PyPDFLoader(path).load()
|
100 |
+
attach_docs = PyPDFLoader(path).load()
|
101 |
+
elif part['mimeType'] == 'image/png' or part['mimeType'] == 'image/jpeg':
|
102 |
+
# attach_docs = attach_docs + UnstructuredImageLoader(path).load()
|
103 |
+
attach_docs = UnstructuredImageLoader(path).load()
|
104 |
+
elif part['filename'].endswith('.csv'):
|
105 |
+
# attach_docs = attach_docs + CSVLoader(path).load()
|
106 |
+
attach_docs = CSVLoader(path).load()
|
107 |
+
elif part['mimeType'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
|
108 |
+
# attach_docs = attach_docs + UnstructuredExcelLoader(path).load()
|
109 |
+
attach_docs = UnstructuredExcelLoader(path).load()
|
110 |
+
elif part['mimeType'] == 'application/ics':
|
111 |
+
with open(path, 'r', encoding='utf-8') as f:
|
112 |
+
calendar = Calendar(f.read())
|
113 |
+
for event in calendar.events:
|
114 |
+
attach_docs.append(Document(
|
115 |
+
page_content = f"Event: {event.name}\nDescription: {event.description}\nStart: {event.begin}\nEnd: {event.end}",
|
116 |
+
metadata = {
|
117 |
+
"location": event.location,
|
118 |
+
"created": event.created.strftime("%d/%m/%Y %H:%M:%S"),
|
119 |
+
"last_modified": event.last_modified.strftime("%d/%m/%Y %H:%M:%S"),
|
120 |
+
"start": event.begin.strftime("%d/%m/%Y %H:%M:%S"),
|
121 |
+
"end": event.end.strftime("%d/%m/%Y %H:%M:%S")
|
122 |
+
}
|
123 |
+
))
|
124 |
+
if os.path.exists(path):
|
125 |
+
os.remove(path)
|
126 |
+
for index, document in enumerate(attach_docs):
|
127 |
+
_id = f"{msg['id']}_{attachment_id}_{index}"
|
128 |
+
print(document.metadata)
|
129 |
+
document.metadata['minetype'] = part['mimeType']
|
130 |
+
if 'page_label' in document.metadata:
|
131 |
+
document.metadata['page'] = document.metadata['page_label']
|
132 |
+
document.metadata['attachment'] = part['filename']
|
133 |
+
for key in ['creationdate', 'total_pages', 'creator', 'producer', 'moddate', 'page_label', 'source']:
|
134 |
+
document.metadata.pop(key, None)
|
135 |
+
document.metadata.update(metadata)
|
136 |
+
print(document.metadata)
|
137 |
+
print("-"*100)
|
138 |
+
documents.append(document)
|
139 |
+
ids.append(_id)
|
140 |
+
# for index, document in enumerate(attach_docs):
|
141 |
+
# _id = f"{msg['id']}_{index}"
|
142 |
+
# # if 'source' in document.metadata:
|
143 |
+
# # document.metadata['source'] = document.metadata['source'].replace(f"./{ATTACHMENTS_DIR}/", "")
|
144 |
+
# # document.metadata['minetype'] = part['mimeType']
|
145 |
+
# document.metadata.update(metadata)
|
146 |
+
# documents.append(document)
|
147 |
+
# ids.append(_id)
|
148 |
+
elif msg['payload']['mimeType'] == 'text/plain' and 'data' in msg['payload']['body']:
|
149 |
+
body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
|
150 |
+
body = re.sub(r'<[^>]+>', '', body)
|
151 |
+
metadata['minetype'] = msg['payload']['mimeType']
|
152 |
+
documents.append(Document(
|
153 |
+
page_content=body,
|
154 |
+
metadata=metadata
|
155 |
+
))
|
156 |
+
ids.append(msg['id'])
|
157 |
+
elif msg['payload']['mimeType'] == 'text/html' and 'data' in msg['payload']['body']:
|
158 |
body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
|
159 |
+
body = re.sub(r'<[^>]+>', '', body)
|
160 |
+
metadata['minetype'] = msg['payload']['mimeType']
|
161 |
documents.append(Document(
|
162 |
page_content=body,
|
163 |
metadata=metadata
|
164 |
))
|
165 |
ids.append(msg['id'])
|
166 |
+
if 'multipart/alternative' in minetype and len(minetype) == 1:
|
167 |
+
print("Only multipart/alternative found in the email.")
|
168 |
+
else:
|
169 |
+
print(documents)
|
170 |
vectorstore.add_documents(documents=documents, ids=ids)
|
171 |
|
172 |
def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
|
|
|
180 |
Returns:
|
181 |
None
|
182 |
"""
|
183 |
+
subject_query = "subject:Re: Smartcareers algorithm debug and improvement'"
|
184 |
+
emails = search_emails(subject_query)
|
185 |
+
# emails = search_emails(query)
|
186 |
if emails:
|
187 |
print("Found %d emails:\n", len(emails))
|
188 |
logger.info("Found %d emails after two_weeks_ago:\n", len(emails))
|
189 |
+
list_emails(emails)
|
190 |
+
logger.info("Listing emails...")
|
191 |
return f"{len(emails)} emails added to the collection."
|
192 |
else:
|
193 |
logger.info("No emails found after two weeks ago.")
|
test.py
CHANGED
@@ -2,4 +2,4 @@ from controllers import mail
|
|
2 |
|
3 |
if __name__ == "__main__":
|
4 |
mail.collect()
|
5 |
-
mail.get_documents()
|
|
|
2 |
|
3 |
if __name__ == "__main__":
|
4 |
mail.collect()
|
5 |
+
mail.get_documents()
|
token.pickle
CHANGED
Binary files a/token.pickle and b/token.pickle differ
|
|