gavinzli committed on
Commit
09dea95
·
1 Parent(s): ad04a72

Enhance email processing in mail.py to support additional file types, including Excel and ICS; update document handling and metadata extraction; modify collect function to search for specific email subjects.

Browse files
Files changed (3) hide show
  1. controllers/mail.py +102 -31
  2. test.py +1 -1
  3. token.pickle +0 -0
controllers/mail.py CHANGED
@@ -4,11 +4,13 @@ import re
4
  import base64
5
  from datetime import datetime, timedelta
6
  from venv import logger
 
7
 
8
  import pandas as pd
9
  from langchain_core.documents import Document
10
  from langchain_community.document_loaders import PyPDFLoader
11
  from langchain_community.document_loaders.image import UnstructuredImageLoader
 
12
  from langchain_community.document_loaders.csv_loader import CSVLoader
13
 
14
  from models.chroma import vectorstore
@@ -54,12 +56,36 @@ def list_emails(messages):
54
  metadata['cc'] = header['value']
55
  metadata['date'] = datetime.fromtimestamp(
56
  int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
57
- print.info(metadata, msg['id'])
58
- print("-"*100)
59
- body = ""
60
- if 'parts' in msg['payload']:
61
- attachment_documents = []
 
 
 
 
62
  for part in msg['payload']['parts']:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  if part['filename']:
64
  attachment_id = part['body']['attachmentId']
65
  logger.info("Downloading attachment: %s", part['filename'])
@@ -69,37 +95,78 @@ def list_emails(messages):
69
  path = os.path.join(".", ATTACHMENTS_DIR, part['filename'])
70
  with open(path, 'wb') as f:
71
  f.write(file_data)
72
- if part['filename'].endswith('.pdf'):
73
- attachment_documents = attachment_documents + PyPDFLoader(path).load()
74
- if part['filename'].endswith('.png'):
75
- attachment_documents = attachment_documents + UnstructuredImageLoader(path).load()
76
- if part['filename'].endswith('.csv'):
77
- attachment_documents = attachment_documents + CSVLoader(path).load()
78
- ids = []
79
- documents = []
80
- for index, document in enumerate(attachment_documents):
81
- _id = f"{msg['id']}_{index}"
82
- if 'source' in document.metadata:
83
- document.metadata['source'] = document.metadata['source'].replace(f"./{ATTACHMENTS_DIR}/", "")
84
- print(document.metadata)
85
- document.metadata.update(metadata)
86
- print(document.metadata)
87
- ids.append(_id)
88
- print(_id)
89
- print("*"*100)
90
- vectorstore.add_documents(documents=documents, ids=ids)
91
- else:
92
- ids = []
93
- documents = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
95
- body = re.sub(r'<[^>]+>', '', body) # Remove HTML tags
 
96
  documents.append(Document(
97
  page_content=body,
98
  metadata=metadata
99
  ))
100
  ids.append(msg['id'])
101
- print(msg['id'])
102
- print("!"*100)
 
 
103
  vectorstore.add_documents(documents=documents, ids=ids)
104
 
105
  def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
@@ -113,10 +180,14 @@ def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%
113
  Returns:
114
  None
115
  """
116
- emails = search_emails(query)
 
 
117
  if emails:
118
  print("Found %d emails:\n", len(emails))
119
  logger.info("Found %d emails after two_weeks_ago:\n", len(emails))
 
 
120
  return f"{len(emails)} emails added to the collection."
121
  else:
122
  logger.info("No emails found after two weeks ago.")
 
4
  import base64
5
  from datetime import datetime, timedelta
6
  from venv import logger
7
+ from ics import Calendar
8
 
9
  import pandas as pd
10
  from langchain_core.documents import Document
11
  from langchain_community.document_loaders import PyPDFLoader
12
  from langchain_community.document_loaders.image import UnstructuredImageLoader
13
+ from langchain_community.document_loaders import UnstructuredExcelLoader
14
  from langchain_community.document_loaders.csv_loader import CSVLoader
15
 
16
  from models.chroma import vectorstore
 
56
  metadata['cc'] = header['value']
57
  metadata['date'] = datetime.fromtimestamp(
58
  int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
59
+ metadata['msg_id'] = msg['id']
60
+ print(metadata)
61
+ print(msg['payload']['mimeType'])
62
+ # body = ""
63
+ ids = []
64
+ documents = []
65
+ if msg['payload']['mimeType'] in ['multipart/alternative', 'multipart/related', 'multipart/mixed']:
66
+ minetype = []
67
+ # attach_docs = []
68
  for part in msg['payload']['parts']:
69
+ print("minetype: ", part['mimeType'])
70
+ minetype.append(part['mimeType'])
71
+ if part['mimeType'] == 'text/plain' and 'text/html' not in minetype:
72
+ body = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
73
+ body = re.sub(r'<[^>]+>', '', body) # Remove HTML tags
74
+ metadata['minetype'] = part['mimeType']
75
+ documents.append(Document(
76
+ page_content=body,
77
+ metadata=metadata
78
+ ))
79
+ ids.append(msg['id'])
80
+ elif part['mimeType'] == 'text/html' and 'text/plain' not in minetype:
81
+ body = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
82
+ body = re.sub(r'<[^>]+>', '', body)
83
+ metadata['minetype'] = part['mimeType']
84
+ documents.append(Document(
85
+ page_content=body,
86
+ metadata=metadata
87
+ ))
88
+ ids.append(msg['id'])
89
  if part['filename']:
90
  attachment_id = part['body']['attachmentId']
91
  logger.info("Downloading attachment: %s", part['filename'])
 
95
  path = os.path.join(".", ATTACHMENTS_DIR, part['filename'])
96
  with open(path, 'wb') as f:
97
  f.write(file_data)
98
+ if part['mimeType'] == 'application/pdf':
99
+ # attach_docs = attach_docs + PyPDFLoader(path).load()
100
+ attach_docs = PyPDFLoader(path).load()
101
+ elif part['mimeType'] == 'image/png' or part['mimeType'] == 'image/jpeg':
102
+ # attach_docs = attach_docs + UnstructuredImageLoader(path).load()
103
+ attach_docs = UnstructuredImageLoader(path).load()
104
+ elif part['filename'].endswith('.csv'):
105
+ # attach_docs = attach_docs + CSVLoader(path).load()
106
+ attach_docs = CSVLoader(path).load()
107
+ elif part['mimeType'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
108
+ # attach_docs = attach_docs + UnstructuredExcelLoader(path).load()
109
+ attach_docs = UnstructuredExcelLoader(path).load()
110
+ elif part['mimeType'] == 'application/ics':
111
+ with open(path, 'r', encoding='utf-8') as f:
112
+ calendar = Calendar(f.read())
113
+ for event in calendar.events:
114
+ attach_docs.append(Document(
115
+ page_content = f"Event: {event.name}\nDescription: {event.description}\nStart: {event.begin}\nEnd: {event.end}",
116
+ metadata = {
117
+ "location": event.location,
118
+ "created": event.created.strftime("%d/%m/%Y %H:%M:%S"),
119
+ "last_modified": event.last_modified.strftime("%d/%m/%Y %H:%M:%S"),
120
+ "start": event.begin.strftime("%d/%m/%Y %H:%M:%S"),
121
+ "end": event.end.strftime("%d/%m/%Y %H:%M:%S")
122
+ }
123
+ ))
124
+ if os.path.exists(path):
125
+ os.remove(path)
126
+ for index, document in enumerate(attach_docs):
127
+ _id = f"{msg['id']}_{attachment_id}_{index}"
128
+ print(document.metadata)
129
+ document.metadata['minetype'] = part['mimeType']
130
+ if 'page_label' in document.metadata:
131
+ document.metadata['page'] = document.metadata['page_label']
132
+ document.metadata['attachment'] = part['filename']
133
+ for key in ['creationdate', 'total_pages', 'creator', 'producer', 'moddate', 'page_label', 'source']:
134
+ document.metadata.pop(key, None)
135
+ document.metadata.update(metadata)
136
+ print(document.metadata)
137
+ print("-"*100)
138
+ documents.append(document)
139
+ ids.append(_id)
140
+ # for index, document in enumerate(attach_docs):
141
+ # _id = f"{msg['id']}_{index}"
142
+ # # if 'source' in document.metadata:
143
+ # # document.metadata['source'] = document.metadata['source'].replace(f"./{ATTACHMENTS_DIR}/", "")
144
+ # # document.metadata['minetype'] = part['mimeType']
145
+ # document.metadata.update(metadata)
146
+ # documents.append(document)
147
+ # ids.append(_id)
148
+ elif msg['payload']['mimeType'] == 'text/plain' and 'data' in msg['payload']['body']:
149
+ body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
150
+ body = re.sub(r'<[^>]+>', '', body)
151
+ metadata['minetype'] = msg['payload']['mimeType']
152
+ documents.append(Document(
153
+ page_content=body,
154
+ metadata=metadata
155
+ ))
156
+ ids.append(msg['id'])
157
+ elif msg['payload']['mimeType'] == 'text/html' and 'data' in msg['payload']['body']:
158
  body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
159
+ body = re.sub(r'<[^>]+>', '', body)
160
+ metadata['minetype'] = msg['payload']['mimeType']
161
  documents.append(Document(
162
  page_content=body,
163
  metadata=metadata
164
  ))
165
  ids.append(msg['id'])
166
+ if 'multipart/alternative' in minetype and len(minetype) == 1:
167
+ print("Only multipart/alternative found in the email.")
168
+ else:
169
+ print(documents)
170
  vectorstore.add_documents(documents=documents, ids=ids)
171
 
172
  def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
 
180
  Returns:
181
  None
182
  """
183
+ subject_query = "subject:Re: Smartcareers algorithm debug and improvement'"
184
+ emails = search_emails(subject_query)
185
+ # emails = search_emails(query)
186
  if emails:
187
  print("Found %d emails:\n", len(emails))
188
  logger.info("Found %d emails after two_weeks_ago:\n", len(emails))
189
+ list_emails(emails)
190
+ logger.info("Listing emails...")
191
  return f"{len(emails)} emails added to the collection."
192
  else:
193
  logger.info("No emails found after two weeks ago.")
test.py CHANGED
@@ -2,4 +2,4 @@ from controllers import mail
2
 
3
  if __name__ == "__main__":
4
  mail.collect()
5
- mail.get_documents()
 
2
 
3
  if __name__ == "__main__":
4
  mail.collect()
5
+ mail.get_documents()
token.pickle CHANGED
Binary files a/token.pickle and b/token.pickle differ