gavinzli committed
Commit af61c79 · 1 Parent(s): a111c12

Refactor email handling: remove mail module, update service routes, and enhance EmailQuery model with additional parameters

.gitignore CHANGED
@@ -183,3 +183,4 @@ models/chroma/data/*.bin
 models/chroma/_data/chroma.sqlite3
 models/chroma/data/chroma.sqlite3
 cache
+_cache
app/controllers/__init__.py DELETED
File without changes
app/controllers/mail.py DELETED
@@ -1,318 +0,0 @@
-"""Module to search and list emails from Gmail."""
-import base64
-import hashlib
-import os
-import re
-from datetime import datetime, timedelta
-from venv import logger
-
-from ics import Calendar
-from langchain_community.document_loaders import (
-    CSVLoader,
-    PyPDFLoader,
-    UnstructuredExcelLoader,
-    UnstructuredImageLoader,
-)
-from langchain_core.documents import Document
-from models.db import vectorstore
-
-SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
-EMAIL_PATTERN = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
-
-ATTACHMENTS_DIR = "cache"
-os.makedirs(ATTACHMENTS_DIR, exist_ok=True)
-
-def build_query(params):
-    """
-    Constructs a query string based on the provided parameters.
-
-    Args:
-        params (dict): A dictionary containing optional query parameters.
-            Supported keys include:
-            - 'subject' (str): The subject of the email.
-            - 'from' (str): The sender's email address.
-            - 'to' (str): The recipient's email address.
-            - 'cc' (str): The CC recipient's email address.
-            - 'after' (str): A date string to filter emails sent after this date.
-            - 'before' (str): A date string to filter emails sent before this date.
-
-    Returns:
-        str: A query string constructed from the provided parameters. Each parameter
-        is formatted as a key-value pair and joined by spaces. If a parameter is not
-        provided or is empty, it is excluded from the query string.
-    """
-    query_parts = []
-    if 'subject' in params and params['subject']:
-        query_parts.append(f'subject:"{params["subject"]}"')
-    if 'from' in params and params['from']:
-        query_parts.append(f'from:{params["from"]}')
-    if 'to' in params and params['to']:
-        query_parts.append(f'to:{params["to"]}')
-    if 'cc' in params and params['cc']:
-        query_parts.append(f'cc:{params["cc"]}')
-    if 'after' in params and params['after']:
-        query_parts.append(f'after:{params["after"]}')
-    if 'before' in params and params['before']:
-        query_parts.append(f'before:{params["before"]}')
-    return ' '.join(query_parts)
-
-def search_emails(service, query):
-    """Search emails based on a query."""
-    result = service.users().messages().list(userId="me", q=query).execute()
-    messages = []
-    if "messages" in result:
-        messages.extend(result["messages"])
-    while "nextPageToken" in result:
-        page_token = result["nextPageToken"]
-        result = (
-            service.users().messages().list(userId="me", q=query, pageToken=page_token).execute()
-        )
-        if "messages" in result:
-            messages.extend(result["messages"])
-    return messages
-
-
-def list_emails(service, messages):
-    """
-    Processes a list of email messages, extracts metadata, decodes content, and handles attachments.
-
-    Args:
-        messages (list): A list of email message dictionaries, where each dictionary contains
-            at least an 'id' key representing the email's unique identifier.
-
-    Returns:
-        None: The function processes the emails and adds the extracted documents to a vector store.
-
-    Functionality:
-        - Retrieves email details using the Gmail API.
-        - Extracts metadata such as sender, recipient, subject, CC, and date.
-        - Decodes email content in plain text or HTML format.
-        - Handles multipart emails, including attachments.
-        - Processes attachments based on their MIME type:
-            - PDF files are loaded using PyPDFLoader.
-            - Images (PNG, JPEG) are loaded using UnstructuredImageLoader.
-            - CSV files are loaded using CSVLoader.
-            - Excel files are loaded using UnstructuredExcelLoader.
-            - Calendar files (ICS) are parsed to extract event details.
-        - Removes HTML tags from email content.
-        - Stores processed documents and metadata in a vector store.
-        - Deletes temporary files created during attachment processing.
-
-    Notes:
-        - The function assumes the existence of a global `service` object for Gmail API.
-        - The `vectorstore.add_documents` method is used to store the processed documents.
-        - Attachments are temporarily saved in `ATTACHMENTS_DIR` and deleted after processing.
-        - The function logs information about attachments being downloaded.
-    """
-    ids = []
-    documents = []
-    for message in messages:
-        msg = service.users().messages().get(userId="me", id=message["id"], format="full").execute()
-        metadata = {}
-        metadata["threadId"] = msg["threadId"]
-        metadata["msgId"] = msg["id"]
-        msgId = f"{msg['threadId']}-{msg['id']}"
-        for header in msg["payload"]["headers"]:
-            if header["name"] == "From":
-                metadata["from"] = header["value"]
-            elif header["name"] == "To":
-                metadata["to"] = header["value"]
-            elif header["name"] == "Subject":
-                metadata["subject"] = header["value"]
-                logger.info("subject: %s", metadata["subject"])
-            elif header["name"] == "Cc":
-                metadata["cc"] = header["value"]
-        metadata["date"] = datetime.fromtimestamp(int(msg["internalDate"]) / 1000).strftime(
-            "%d/%m/%Y %H:%M:%S"
-        )
-        metadata["userId"] = service.users().getProfile(userId="me").execute().get("emailAddress")
-        ids = []
-        documents = []
-        mime_types = []
-        if msg["payload"]["mimeType"] in [
-            "multipart/alternative",
-            "multipart/related",
-            "multipart/mixed",
-        ]:
-            mime_types = []
-            attach_docs = []
-            for part in msg["payload"]["parts"]:
-                print("mimeType: ", part["mimeType"])
-                mime_types.append(part["mimeType"])
-                if part["mimeType"] == "text/plain" and "text/html" not in mime_types:
-                    body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
-                    body = re.sub(r"<[^>]+>", "", body)  # Remove HTML tags
-                    metadata["mimeType"] = part["mimeType"]
-                    documents.append(Document(page_content=body, metadata=metadata))
-                    ids.append(msg["id"])
-                elif part["mimeType"] == "text/html" and "text/plain" not in mime_types:
-                    body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
-                    body = re.sub(r"<[^>]+>", "", body)
-                    metadata["mimeType"] = part["mimeType"]
-                    documents.append(Document(page_content=body, metadata=metadata))
-                    ids.append(msg["id"])
-                if part["filename"]:
-                    attachment_id = part["body"]["attachmentId"]
-                    logger.info("Downloading attachment: %s", part["filename"])
-                    attachment = (
-                        service.users()
-                        .messages()
-                        .attachments()
-                        .get(userId="me", messageId=message["id"], id=attachment_id)
-                        .execute()
-                    )
-                    file_data = base64.urlsafe_b64decode(attachment["data"].encode("UTF-8"))
-                    path = os.path.join(".", ATTACHMENTS_DIR, part["filename"])
-                    with open(path, "wb") as f:
-                        f.write(file_data)
-                    if part["mimeType"] == "application/pdf":
-                        attach_docs = PyPDFLoader(path).load()
-                    elif part["mimeType"] == "image/png" or part["mimeType"] == "image/jpeg":
-                        try:
-                            attach_docs = UnstructuredImageLoader(path).load()
-                        except Exception as e:
-                            logger.error("Error loading image: %s", e)
-                    elif part["filename"].endswith(".csv"):
-                        attach_docs = CSVLoader(path).load()
-                    elif (
-                        part["mimeType"]
-                        == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-                    ):
-                        attach_docs = UnstructuredExcelLoader(path).load()
-                    elif part["mimeType"] == "application/ics":
-                        with open(path, "r", encoding="utf-8") as f:
-                            calendar = Calendar(f.read())
-                        for event in calendar.events:
-                            documents.append(
-                                Document(
-                                    page_content=f"Event: {event.name}\n\Description: {event.description}\nStart: {event.begin}\nEnd: {event.end}",
-                                    metadata={
-                                        "attachment": part["filename"],
-                                        "mimeType": part["mimeType"],
-                                        "location": event.location,
-                                        "created": event.created.strftime("%d/%m/%Y %H:%M:%S"),
-                                        "last_modified": event.last_modified.strftime(
-                                            "%d/%m/%Y %H:%M:%S"
-                                        ),
-                                        "start": event.begin.strftime("%d/%m/%Y %H:%M:%S"),
-                                        "end": event.end.strftime("%d/%m/%Y %H:%M:%S"),
-                                    },
-                                )
-                            )
-                            ids.append(f"{msgId}-{part['filename']}-{hashlib.sha256(file_data).hexdigest()}")
-                    if os.path.exists(path):
-                        os.remove(path)
-                    for index, document in enumerate(attach_docs or []):
-                        document.metadata["mimeType"] = part["mimeType"]
-                        if "page_label" in document.metadata:
-                            document.metadata["page"] = document.metadata["page_label"]
-                        document.metadata["attachment"] = part["filename"]
-                        document.metadata = {
-                            key: value
-                            for key, value in document.metadata.items()
-                            if key in ["attachment", "page"]
-                        }
-                        document.metadata.update(metadata)
-                        documents.append(document)
-                        ids.append(f"{msgId}-{hashlib.sha256(file_data).hexdigest()}-{index}")
-        elif msg["payload"]["mimeType"] == "text/plain" and "data" in msg["payload"]["body"]:
-            body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
-            body = re.sub(r"<[^>]+>", "", body)
-            metadata["mimeType"] = msg["payload"]["mimeType"]
-            documents.append(Document(page_content=body, metadata=metadata))
-            ids.append(msgId)
-        elif msg["payload"]["mimeType"] == "text/html" and "data" in msg["payload"]["body"]:
-            body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
-            body = re.sub(r"<[^>]+>", "", body)
-            metadata["mimeType"] = msg["payload"]["mimeType"]
-            documents.append(Document(page_content=body, metadata=metadata))
-            ids.append(msgId)
-        if "multipart/alternative" in mime_types and len(mime_types) == 1:
-            print("Only multipart/alternative found in the email.")
-        else:
-            try:
-                vectorstore.add_documents(documents=documents, ids=ids)
-            except Exception as e:
-                logger.error("Error adding documents to vectorstore: %s", e)
-
-
-def collect(service, query=(datetime.today() - timedelta(days=10)).strftime("after:%Y/%m/%d")):
-    """
-    Main function to search and list emails from Gmail.
-
-    This function builds a Gmail service, constructs a query to search for emails
-    received in the last 14 days, and lists the found emails. If no emails are found,
-    it prints a message indicating so.
-
-    Returns:
-        None
-    """
-    # query = "subject:Re: Smartcareers algorithm debug and improvement'"
-    emails = search_emails(service, query)
-    if emails:
-        logger.info("Found %d emails:\n", len(emails))
-        logger.info("Found %d emails after two_weeks_ago:\n", len(emails))
-        list_emails(service, emails)
-        logger.info("Listing emails...")
-        return f"{len(emails)} emails added to the collection."
-    else:
-        logger.info("No emails found after two weeks ago.")
-
-
-def get_emails(service, query, max_results=10):
-    """
-    Retrieve a list of emails with subject, to, from, cc, and content.
-
-    Args:
-        mailservice: Authenticated Gmail API service instance
-        max_results: Maximum number of emails to retrieve
-
-    Returns:
-        List of dictionaries containing email details
-    """
-    try:
-        # List messages
-        query = build_query(query.dict())
-        response = service.users().messages().list(
-            userId='me', q=query, maxResults=max_results).execute()
-        messages = response.get('messages', [])
-        email_list = []
-        if not messages:
-            return email_list
-        for message in messages:
-            msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
-            headers = msg['payload']['headers']
-            email_data = {
-                'subject': '',
-                'from': '',
-                'to': '',
-                'cc': '',
-                'content': '',
-                'snippet': msg['snippet'] if 'snippet' in msg else '',
-            }
-            for header in headers:
-                name = header['name'].lower()
-                if name == 'subject':
-                    email_data['subject'] = header['value']
-                elif name == 'from':
-                    email_data['from'] = header['value']
-                elif name == 'to':
-                    email_data['to'] = header['value']
-                elif name == 'cc':
-                    email_data['cc'] = header['value']
-            if 'parts' in msg['payload']:
-                for part in msg['payload']['parts']:
-                    if part['mimeType'] == 'text/plain':
-                        email_data['content'] = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
-                        break
-                    elif part['mimeType'] == 'text/html':
-                        email_data['content'] = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
-                        break
-            elif 'data' in msg['payload']['body']:
-                email_data['content'] = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
-            email_list.append(email_data)
-        return email_list
-
-    except Exception as e:
-        print(f"An error occurred: {e}")
-        return []
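
For reference, a minimal sketch (not part of the commit) of the Gmail search string the removed build_query produced; the parameter values are invented, and the same role is taken over by GmailService.parse_query later in this commit.

# Illustrative only: what controllers.mail.build_query (defined above, removed here)
# returned for a sample parameter dict. The values are made up.
params = {"subject": "Quarterly report", "from": "alice@example.com", "after": "2025/01/01"}
print(build_query(params))
# -> subject:"Quarterly report" from:alice@example.com after:2025/01/01
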
app/main.py CHANGED
@@ -4,7 +4,7 @@ import logging
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from jose import jwt
-from router import auth, content, mail
+from router import auth, content, service
 from starlette.middleware.base import BaseHTTPMiddleware

 SECRET_KEY = "your-secret-key"
@@ -65,7 +65,7 @@ logging.getLogger().setLevel(logging.INFO)
 app = FastAPI(docs_url="/")

 app.include_router(content.router)
-app.include_router(mail.router)
+app.include_router(service.router)
 app.include_router(auth.router)

 origins = [
app/models/db/__init__.py CHANGED
@@ -1,12 +1,14 @@
 """This module is responsible for initializing the database connection and creating the necessary tables."""
 from pinecone import Pinecone, ServerlessSpec
 from langchain_pinecone import PineconeVectorStore
-from models.llm import EmbeddingsModel
+# from torch import embedding
+from models.llm import GPTEmbeddings

-embeddings = EmbeddingsModel("all-MiniLM-L6-v2")
+# embeddings = EmbeddingsModel("all-MiniLM-L6-v2")
+embeddings = GPTEmbeddings()

 pc = Pinecone()
-INDEX_NAME = "mails"
+INDEX_NAME = "gmails"
 if not pc.has_index(INDEX_NAME):
     pc.create_index(
         name=INDEX_NAME,
@@ -15,7 +17,7 @@ if not pc.has_index(INDEX_NAME):
         spec=ServerlessSpec(
             cloud="aws",
             region="us-east-1"
-        )
+        )
     )
 index = pc.Index(INDEX_NAME)
-vectorstore = PineconeVectorStore(index=index, embedding=embeddings)
+vectorstore = PineconeVectorStore(index=index, embedding=embeddings)
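
A hedged usage sketch, not in the commit: how callers would use the exported vectorstore now that it targets the "gmails" Pinecone index with GPTEmbeddings. It relies on the standard LangChain VectorStore interface; the sample document, metadata, and ids are invented.

from langchain_core.documents import Document
from models.db import vectorstore

# Invented sample data for illustration.
doc = Document(page_content="Meeting moved to Friday", metadata={"from": "bob@example.com"})
vectorstore.add_documents(documents=[doc], ids=["thread-1-msg-1"])

# Query the index; k limits the number of hits returned.
for hit in vectorstore.similarity_search("when is the meeting?", k=3):
    print(hit.metadata.get("from"), hit.page_content[:60])
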
app/models/llm/__init__.py CHANGED
@@ -1,13 +1,8 @@
 """Module for OpenAI model and embeddings."""
-# import os
 from typing import List
-# import onnxruntime as ort
 from langchain.embeddings.base import Embeddings
 from sentence_transformers import SentenceTransformer
 from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
-# from langchain_huggingface import HuggingFacePipeline
-# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-# from huggingface_hub import hf_hub_download

 class GPTModel(AzureChatOpenAI):
     """
@@ -40,67 +35,6 @@ class GPTEmbeddings(AzureOpenAIEmbeddings):
     Inherits all methods from AzureOpenAIEmbeddings.
     """

-# class Phi4MiniONNXLLM:
-#     """
-#     A class for interfacing with a pre-trained ONNX model for inference.
-
-#     Attributes:
-#         session (onnxruntime.InferenceSession): The ONNX runtime inference session for the model.
-#         input_name (str): The name of the input node in the ONNX model.
-#         output_name (str): The name of the output node in the ONNX model.
-
-#     Methods:
-#         __init__(model_path):
-#             Initializes the Phi4MiniONNXLLM instance by loading the ONNX model from specified path.
-
-#         __call__(input_ids):
-#             Performs inference on the given input data and returns the model's output.
-#     """
-#     def __init__(self, repo_id, subfolder, onnx_file="model.onnx", weights_file="model.onnx.data"):
-#         self.repo_id = repo_id
-#         model_path = hf_hub_download(repo_id=repo_id, filename=f"{subfolder}/{onnx_file}")
-#         weights_path = hf_hub_download(repo_id=repo_id, filename=f"{subfolder}/{weights_file}")
-#         self.session = ort.InferenceSession(model_path)
-#         # Verify both files exist
-#         print(f"Model path: {model_path}, Exists: {os.path.exists(model_path)}")
-#         print(f"Weights path: {weights_path}, Exists: {os.path.exists(weights_path)}")
-#         self.input_name = self.session.get_inputs()[0].name
-#         self.output_name = self.session.get_outputs()[0].name
-
-#     def __call__(self, input_text):
-#         # Assuming input_ids is a tensor or numpy array
-#         tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-mini-instruct-onnx")
-#         inputs = tokenizer(input_text, return_tensors="pt")
-#         input_feed = {
-#             self.input_name: inputs["input_ids"].numpy(),
-#             "attention_mask": inputs["attention_mask"].numpy(),
-#             # Add past_key_values if applicable
-#         }
-#         outputs = self.session.run([self.output_name], input_feed)
-#         return outputs
-
-# class HuggingfaceModel(HuggingFacePipeline):
-#     """
-#     HuggingfaceModel is a wrapper class for the Hugging Face text-generation pipeline.
-
-#     Attributes:
-#         name (str): The name or path of the pre-trained model to load from Hugging Face.
-#         max_tokens (int): The maximum number of new tokens to generate in the text output.
-#             Defaults to 200.
-
-#     Methods:
-#         __init__(name, max_tokens=200):
-#             Initializes the HuggingfaceModel with the specified model name and maximum token limit.
-#     """
-#     def __init__(self, name, max_tokens=500):
-#         super().__init__(pipeline=pipeline(
-#             "text-generation",
-#             model=AutoModelForCausalLM.from_pretrained(name),
-#             tokenizer=AutoTokenizer.from_pretrained(name),
-#             max_new_tokens=max_tokens
-#             )
-#         )
-
 class EmbeddingsModel(Embeddings):
     """
     A model for generating embeddings using SentenceTransformer.
@@ -113,7 +47,7 @@ class EmbeddingsModel(Embeddings):
         Initializes the Chroma model with the specified model name.

         Args:
-            model_name (str): The name of the model to be used for sentence transformation.
+            model_name (str): The name of the model to be used for embedding.
         """
         self.model = SentenceTransformer(model_name)

@@ -140,18 +74,3 @@ class EmbeddingsModel(Embeddings):
             List[float]: The embedded representation of the query as a list of floats.
         """
         return self.model.encode([query]).tolist()[0]
-
-# model_name = "microsoft/phi-1_5"
-# tokenizer = AutoTokenizer.from_pretrained(model_name)
-# model = AutoModelForCausalLM.from_pretrained(model_name)
-# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
-
-# phi4_llm = HuggingFacePipeline(pipeline=pipe)
-
-# tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", pad_token_id=50256)
-# model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
-# pipe = pipeline(
-#     "text-generation", model=model, tokenizer=tokenizer,
-#     max_new_tokens=10, truncation=True,  # Truncate input sequences
-# )
-# phi4_llm = HuggingFacePipeline(pipeline=pipe)
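
A minimal sketch, not from the commit, of the embeddings interface EmbeddingsModel keeps exposing while the app itself now defaults to GPTEmbeddings. It assumes the class implements the usual embed_documents/embed_query pair of the LangChain Embeddings base class (only embed_query is visible in this hunk) and reuses the "all-MiniLM-L6-v2" model name seen in the old models/db wiring.

from models.llm import EmbeddingsModel

embedder = EmbeddingsModel("all-MiniLM-L6-v2")  # model name taken from the old models/db setup
vectors = embedder.embed_documents(["hello world", "gmail refactor"])  # assumed counterpart to embed_query
query_vector = embedder.embed_query("hello")
print(len(vectors), len(query_vector))  # 2 document vectors, one query vector
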
app/router/mail.py DELETED
@@ -1,75 +0,0 @@
-"""Module for defining the main routes of the API."""
-import os
-import pickle
-import threading
-from venv import logger
-from fastapi import APIRouter, Request
-from fastapi.responses import JSONResponse
-
-from controllers import mail
-from google.oauth2.credentials import Credentials
-from googleapiclient.discovery import build
-
-from schema import MailReqData
-
-router = APIRouter(prefix="/mail", tags=["mail"])
-
-@router.post("")
-def collect(query: MailReqData, request: Request):
-    """
-    Handles the chat POST request.
-
-    Args:
-        query (ReqData): The request data containing the query parameters.
-
-    Returns:
-        str: The generated response from the chat function.
-    """
-    try:
-        if os.path.exists(f"cache/{query.email}.pickle"):
-            with open(f"cache/{query.email}.pickle", "rb") as token:
-                credentials = pickle.load(token)
-        else:
-            cred_dict = request.state.session.get("credential")
-            credentials = Credentials(
-                token=cred_dict["token"],
-                refresh_token=cred_dict["refresh_token"],
-                token_uri=cred_dict["token_uri"],
-                client_id=cred_dict["client_id"],
-                client_secret=cred_dict["client_secret"],
-                scopes=cred_dict["scopes"],
-            )
-        mailservice = build("gmail", "v1", credentials=credentials)
-        threading.Thread(target=mail.collect, args=(mailservice, query.query)).start()
-        return JSONResponse(content={"message": "Mail collection in progress."})
-    except Exception as e:
-        logger.error("Error collecting mail: %s", e)
-        return JSONResponse(content={"error": str(e)}, status_code=500)
-
-@router.get("")
-def get(query: MailReqData, request: Request):
-    """
-    Handles the chat POST request.
-
-    Args:
-        query (ReqData): The request data containing the query parameters.
-
-    Returns:
-        str: The generated response from the chat function.
-    """
-    if os.path.exists(f"cache/{query.email}.pickle"):
-        with open(f"cache/{query.email}.pickle", "rb") as token:
-            credentials = pickle.load(token)
-    else:
-        cred_dict = request.state.session.get("credential")
-        credentials = Credentials(
-            token=cred_dict["token"],
-            refresh_token=cred_dict["refresh_token"],
-            token_uri=cred_dict["token_uri"],
-            client_id=cred_dict["client_id"],
-            client_secret=cred_dict["client_secret"],
-            scopes=cred_dict["scopes"],
-        )
-    mailservice = build("gmail", "v1", credentials=credentials)
-    result = mail.get_emails(mailservice, query.query, query.query.max_results)
-    return JSONResponse(content= result)
app/router/service.py ADDED
@@ -0,0 +1,40 @@
+"""Module for defining the main routes of the API."""
+import threading
+from fastapi import APIRouter, Request
+from fastapi.responses import JSONResponse
+
+from services import GmailService
+
+from schema import EmailQuery
+
+router = APIRouter(prefix="/service", tags=["mail"])
+
+@router.post("/gmail")
+def collect(query: EmailQuery, request: Request) -> JSONResponse:
+    """
+    Handles the chat POST request.
+
+    Args:
+        query (ReqData): The request data containing the query parameters.
+
+    Returns:
+        str: The generated response from the chat function.
+    """
+    service = GmailService(request.headers.get("Google-Token"))
+    threading.Thread(target=service.collect, args=[query]).start()
+    return JSONResponse(content={"message": "Mail collection in progress."})
+
+@router.get("/gmail")
+def get(query: EmailQuery, request: Request) -> JSONResponse:
+    """
+    Handles the chat POST request.
+
+    Args:
+        query (ReqData): The request data containing the query parameters.
+
+    Returns:
+        str: The generated response from the chat function.
+    """
+    service = GmailService(request.headers.get("Google-Token"))
+    result = service.get(query, query.max_results)
+    return JSONResponse(content = result)
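
A hedged client sketch (not part of the commit) for the new routes: the Google-Token header name and the /service/gmail paths come from the router above, while the host, port, and token are placeholders. As the routes are written, the GET also expects the EmailQuery model in the request body.

import requests

headers = {"Google-Token": "ya29.placeholder-access-token"}  # placeholder OAuth access token
body = {"from": "alice@example.com", "after": "2025/01/01", "max_results": 5}

# POST kicks off background collection into the vector store.
print(requests.post("http://localhost:8000/service/gmail", json=body, headers=headers).json())

# GET returns the matching emails as a list of dicts.
print(requests.get("http://localhost:8000/service/gmail", json=body, headers=headers).json())
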
app/schema/__init__.py CHANGED
@@ -1,7 +1,10 @@
 """Module containing the data models for the application."""
-from typing import Optional, List
+from datetime import datetime, timedelta
+from typing import List, Optional
+
 from pydantic import BaseModel, Field

+
 class EmailQuery(BaseModel):
     """
     EmailQuery model representing the structure of an email query.
@@ -11,14 +14,52 @@ class EmailQuery(BaseModel):
         from_email (Optional[str]): The sender's email address.
         to_email (Optional[str]): The recipient's email address.
         cc_email (Optional[str]): The CC email address.
+        has_words (Optional[str]): Words that the email must contain.
+        not_has_words (Optional[str]): Words that the email must not contain.
+        size (Optional[int]): The size of the email in bytes.
+        date_within (Optional[str]): The date within which to search for emails.
         after (Optional[str]): The date after which to search for emails.
         max_results (Optional[int]): The maximum number of results to return.
     """
-    subject: Optional[str]
+    subject: Optional[str] = None
     from_email: Optional[str] = Field(None, alias="from")
     to_email: Optional[str] = Field(None, alias="to")
     cc_email: Optional[str] = Field(None, alias="cc")
-    after: Optional[str]
+    has_words: Optional[str] = None
+    not_has_words: Optional[str] = None
+    size: Optional[int] = None
+    before: Optional[str] = None
+    after: Optional[str] = None
+
+    @classmethod
+    def validate_before_after(
+        cls, before: Optional[str], after: Optional[str]) -> tuple[Optional[str], Optional[str]]:
+        """
+        Validates and adjusts the 'before' and 'after' date parameters.
+
+        This method ensures that the 'before' date is greater than the 'after' date.
+        If 'before' is not provided, it defaults to six months prior to the current date.
+
+        Args:
+            before (Optional[str]): The 'before' date in the format "YYYY/MM/DD". Defaults to None.
+            after (Optional[str]): The 'after' date in the format "YYYY/MM/DD". Defaults to None.
+
+        Returns:
+            tuple[Optional[str], Optional[str]]:
+                A tuple containing the validated 'before' and 'after' dates.
+
+        Raises:
+            ValueError: If the 'before' date is not greater than the 'after' date.
+        """
+        if after is None:
+            after = (datetime.now() - timedelta(days=6 * 30)).strftime("%Y/%m/%d")
+        if before and before >= after:
+            raise ValueError("The 'before' date must be greater than the 'after' date.")
+        return before, after
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        self.before, self.after = self.validate_before_after(self.before, self.after)
     max_results: Optional[int] = 10

 class ReqData(BaseModel):
@@ -43,10 +84,8 @@ class MailReqData(BaseModel):
     MailReqData is a data model representing the structure of a mail request.

     Attributes:
-        email (str): The email address of the sender.
         query (str): The query or message content sent by the user.
     """
-    email: str
     query: EmailQuery

 class ReqFollowUp(BaseModel):
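
A hedged sketch (not in the commit) of constructing the enhanced EmailQuery: the field values are invented, "from"/"to"/"cc" are the declared aliases, and the printed behaviour follows the validator as written (an unset 'after' defaults to roughly six months back).

from schema import EmailQuery

q = EmailQuery(**{"subject": "invoice", "from": "billing@example.com", "has_words": "paid"})
print(q.from_email)   # billing@example.com (populated via the "from" alias)
print(q.after)        # auto-filled to ~6 months ago, formatted YYYY/MM/DD
print(q.max_results)  # 10 by default
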
app/services/__init__.py ADDED
@@ -0,0 +1,6 @@
+"""Module for database operations."""
+from services.gmail import GmailService
+
+__all__ = [
+    'GmailService'
+]
app/services/gmail.py ADDED
@@ -0,0 +1,322 @@
+"""
+This module provides a utility class, `GmailService`, for interacting with the Gmail API.
+"""
+import base64
+import hashlib
+import os
+import re
+from datetime import datetime
+from venv import logger
+
+from google.oauth2.credentials import Credentials
+from googleapiclient.discovery import build
+from ics import Calendar
+from langchain_community.document_loaders import (
+    CSVLoader,
+    PyPDFLoader,
+    UnstructuredExcelLoader,
+    UnstructuredImageLoader,
+)
+from langchain_core.documents import Document
+from models.db import vectorstore
+
+SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
+EMAIL_PATTERN = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
+
+ATTACHMENTS_DIR = "cache"
+os.makedirs(ATTACHMENTS_DIR, exist_ok=True)
+
+class GmailService():
+    """
+    GmailService is a utility class for interacting with the Gmail API. It provides methods to
+    construct query strings, search for emails, and retrieve detailed email information.
+    Methods:
+        __init__(token):
+            Initializes the GmailService instance with an authenticated Gmail API service.
+        build_query(params):
+            Constructs a query string based on the provided parameters for filtering emails.
+        search(query, max_results=10, check_next_page=False):
+            Searches for emails based on a query string and returns a list of message metadata.
+        get_emails(query, max_results=10):
+            Retrieves a list of emails with detailed information such as subject, sender,
+            recipients, and content.
+    Attributes:
+        service:
+            An authenticated Gmail API service instance used to interact with the Gmail API.
+    """
+    def __init__(self, token):
+        """
+        Initializes the Gmail controller with the provided token.
+
+        Args:
+            token (str): The auth token used to create credentials for accessing the Gmail API.
+        """
+        self.service = build("gmail", "v1", credentials=Credentials(token=token))
+
+    def parse_query(self, params) -> str:
+        """
+        Constructs a query string based on the provided parameters.
+
+        Args:
+            params (dict): A dictionary containing optional query parameters.
+                Supported keys include:
+                - 'subject' (str): The subject of the email.
+                - 'from' (str): The sender's email address.
+                - 'to' (str): The recipient's email address.
+                - 'cc' (str): The CC recipient's email address.
+                - 'after' (str): A date string to filter emails sent after this date.
+                - 'before' (str): A date string to filter emails sent before this date.
+
+        Returns:
+            str: A query string constructed from the provided parameters. Each parameter
+            is formatted as a key-value pair and joined by spaces. If a parameter is not
+            provided or is empty, it is excluded from the query string.
+        """
+        query_parts = []
+        if 'subject' in params and params['subject']:
+            query_parts.append(f'subject:({params["subject"]})')
+        if 'from_email' in params and params['from_email']:
+            query_parts.append(f'from:({params["from_email"]})')
+        if 'to_email' in params and params['to_email']:
+            query_parts.append(f'to:({params["to_email"]})')
+        if 'cc_email' in params and params['cc_email']:
+            query_parts.append(f'cc:({params["cc_email"]})')
+        if 'after' in params and params['after']:
+            query_parts.append(f'after:{params["after"]}')
+        if 'before' in params and params['before']:
+            query_parts.append(f'before:{params["before"]}')
+        return ' '.join(query_parts)
+
+    def collect(self, query):
+        """
+        Main function to search and list emails from Gmail.
+
+        This function builds a Gmail service, constructs a query to search for emails
+        received in the last 14 days, and lists the found emails. If no emails are found,
+        it prints a message indicating so.
+
+        Returns:
+            None
+        """
+        ids = []
+        documents = []
+        for message in self.search(query):
+            msg = self.service.users().messages().get(
+                userId="me", id=message["id"], format="full").execute()
+            metadata = {}
+            metadata["threadId"] = msg["threadId"]
+            metadata["msgId"] = msg["id"]
+            msg_id = f"{msg['threadId']}-{msg['id']}"
+            for header in msg["payload"]["headers"]:
+                if header["name"] == "From":
+                    metadata["from"] = header["value"]
+                elif header["name"] == "To":
+                    metadata["to"] = header["value"]
+                elif header["name"] == "Subject":
+                    metadata["subject"] = header["value"]
+                    logger.info("subject: %s", metadata["subject"])
+                elif header["name"] == "Cc":
+                    metadata["cc"] = header["value"]
+            metadata["date"] = datetime.fromtimestamp(int(msg["internalDate"]) / 1000).strftime(
+                "%d/%m/%Y %H:%M:%S"
+            )
+            metadata["userId"] = self.service.users().getProfile(
+                userId="me").execute().get("emailAddress")
+            ids = []
+            documents = []
+            mime_types = []
+            if msg["payload"]["mimeType"] in [
+                "multipart/alternative",
+                "multipart/related",
+                "multipart/mixed",
+            ]:
+                mime_types = []
+                attach_docs = []
+                for part in msg["payload"]["parts"]:
+                    mime_types.append(part["mimeType"])
+                    if part["mimeType"] == "text/plain" and "text/html" not in mime_types:
+                        body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
+                        body = re.sub(r"<[^>]+>", "", body)  # Remove HTML tags
+                        metadata["mimeType"] = part["mimeType"]
+                        documents.append(Document(page_content=body, metadata=metadata))
+                        ids.append(msg["id"])
+                    elif part["mimeType"] == "text/html" and "text/plain" not in mime_types:
+                        body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
+                        body = re.sub(r"<[^>]+>", "", body)
+                        metadata["mimeType"] = part["mimeType"]
+                        documents.append(Document(page_content=body, metadata=metadata))
+                        ids.append(msg["id"])
+                    if part["filename"]:
+                        attachment_id = part["body"]["attachmentId"]
+                        logger.info("Downloading attachment: %s", part["filename"])
+                        attachment = (
+                            self.service.users()
+                            .messages()
+                            .attachments()
+                            .get(userId="me", messageId=message["id"], id=attachment_id)
+                            .execute()
+                        )
+                        file_data = base64.urlsafe_b64decode(attachment["data"].encode("UTF-8"))
+                        path = os.path.join(".", ATTACHMENTS_DIR, part["filename"])
+                        with open(path, "wb") as f:
+                            f.write(file_data)
+                        if part["mimeType"] == "application/pdf":
+                            attach_docs = PyPDFLoader(path).load()
+                        elif part["mimeType"] == "image/png" or part["mimeType"] == "image/jpeg":
+                            try:
+                                attach_docs = UnstructuredImageLoader(path).load()
+                            except ValueError as e:  # Replace with the specific exception type
+                                logger.error("Error loading image: %s", e)
+                        elif part["filename"].endswith(".csv"):
+                            attach_docs = CSVLoader(path).load()
+                        elif (
+                            part["mimeType"]
+                            == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+                        ):
+                            attach_docs = UnstructuredExcelLoader(path).load()
+                        elif part["mimeType"] == "application/ics":
+                            with open(path, "r", encoding="utf-8") as f:
+                                calendar = Calendar(f.read())
+                            for event in calendar.events:
+                                documents.append(
+                                    Document(
+                                        page_content=f"Event: {event.name}\n\Description: {event.description}\nStart: {event.begin}\nEnd: {event.end}",
+                                        metadata={
+                                            "attachment": part["filename"],
+                                            "mimeType": part["mimeType"],
+                                            "location": event.location,
+                                            "created": event.created.strftime("%d/%m/%Y %H:%M:%S"),
+                                            "last_modified": event.last_modified.strftime(
+                                                "%d/%m/%Y %H:%M:%S"
+                                            ),
+                                            "start": event.begin.strftime("%d/%m/%Y %H:%M:%S"),
+                                            "end": event.end.strftime("%d/%m/%Y %H:%M:%S"),
+                                        },
+                                    )
+                                )
+                                ids.append(
+                                    f"{msg_id}-{part['filename']}-{hashlib.sha256(file_data).hexdigest()}")
+                        if os.path.exists(path):
+                            os.remove(path)
+                        for index, document in enumerate(attach_docs or []):
+                            document.metadata["mimeType"] = part["mimeType"]
+                            if "page_label" in document.metadata:
+                                document.metadata["page"] = document.metadata["page_label"]
+                            document.metadata["attachment"] = part["filename"]
+                            document.metadata = {
+                                key: value
+                                for key, value in document.metadata.items()
+                                if key in ["attachment", "page"]
+                            }
+                            document.metadata.update(metadata)
+                            documents.append(document)
+                            ids.append(f"{msg_id}-{hashlib.sha256(file_data).hexdigest()}-{index}")
+            elif msg["payload"]["mimeType"] == "text/plain" and "data" in msg["payload"]["body"]:
+                body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
+                body = re.sub(r"<[^>]+>", "", body)
+                metadata["mimeType"] = msg["payload"]["mimeType"]
+                documents.append(Document(page_content=body, metadata=metadata))
+                ids.append(msg_id)
+            elif msg["payload"]["mimeType"] == "text/html" and "data" in msg["payload"]["body"]:
+                body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
+                body = re.sub(r"<[^>]+>", "", body)
+                metadata["mimeType"] = msg["payload"]["mimeType"]
+                documents.append(Document(page_content=body, metadata=metadata))
+                ids.append(msg_id)
+            if "multipart/alternative" in mime_types and len(mime_types) == 1:
+                logger.info("Only multipart/alternative found in the email.")
+            else:
+                try:
+                    vectorstore.add_documents(documents=documents, ids=ids)
+                except ValueError as e:
+                    logger.error("Error adding documents to vectorstore: %s", e)
+
+    def search(self, query, max_results=10, check_next_page=False) -> list:
+        """
+        Searches for Gmail messages based on a query string.
+
+        Args:
+            query (str): The search query string to filter messages.
+            max_results (int, optional): The maximum number of results to retrieve per page.
+            check_next_page (bool, optional): if to fetch additional pages of results if available.
+
+        Returns:
+            list: A list of message metadata dict. Each dictionary contains info about a message.
+
+        Notes:
+            - The `query` parameter supports Gmail's advanced search operators.
+            - If `check_next_page` is True, will continue fetching messages until all are retrieved.
+        """
+        query = self.parse_query(query.dict())
+        result = self.service.users().messages().list(
+            userId='me', q=query, maxResults=max_results).execute()
+        messages = []
+        if "messages" in result:
+            messages.extend(result["messages"])
+        while "nextPageToken" in result and check_next_page:
+            page_token = result["nextPageToken"]
+            result = (
+                self.service.users().messages().list(
+                    userId="me", q=query, maxResults=max_results, pageToken=page_token).execute()
+            )
+            if "messages" in result:
+                messages.extend(result["messages"])
+        return messages
+
+    def get(self, query, max_results=10) -> list:
+        """
+        Retrieve a list of emails with subject, to, from, cc, and content.
+
+        Args:
+            mailservice: Authenticated Gmail API service instance
+            max_results: Maximum number of emails to retrieve
+
+        Returns:
+            List of dictionaries containing email details
+        """
+        try:
+            messages = self.search(query, max_results)
+            email_list = []
+            if not messages:
+                return email_list
+            for message in messages:
+                msg = self.service.users().messages().get(
+                    userId='me', id=message['id'], format='full').execute()
+                headers = msg['payload']['headers']
+                email_data = {
+                    'subject': '',
+                    'from': '',
+                    'to': '',
+                    'cc': '',
+                    'content': '',
+                    'snippet': msg['snippet'] if 'snippet' in msg else '',
+                }
+                for header in headers:
+                    name = header['name'].lower()
+                    if name == 'subject':
+                        email_data['subject'] = header['value']
+                    elif name == 'from':
+                        email_data['from'] = header['value']
+                    elif name == 'to':
+                        email_data['to'] = header['value']
+                    elif name == 'cc':
+                        email_data['cc'] = header['value']
+                if 'parts' in msg['payload']:
+                    for part in msg['payload']['parts']:
+                        if part['mimeType'] == 'text/plain':
+                            email_data['content'] = base64.urlsafe_b64decode(
+                                part['body']['data']).decode('utf-8')
+                            break
+                        elif part['mimeType'] == 'text/html':
+                            email_data['content'] = base64.urlsafe_b64decode(
+                                part['body']['data']).decode('utf-8')
+                            break
+                elif 'data' in msg['payload']['body']:
+                    email_data['content'] = base64.urlsafe_b64decode(
+                        msg['payload']['body']['data']).decode('utf-8')
+                email_list.append(email_data)
+            return email_list
+
+        except (KeyError, ValueError, TypeError) as e:
+            logger.info("An error occurred: %s", e)
+            return []
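
A hedged end-to-end sketch (not part of the commit) of GmailService as the new router drives it; the OAuth token is a placeholder and the query values are invented.

from schema import EmailQuery
from services import GmailService

service = GmailService("ya29.placeholder-access-token")  # placeholder token
query = EmailQuery(**{"from": "alice@example.com", "subject": "invoice"})

# parse_query turns the model's dict into Gmail search operators,
# e.g. 'subject:(invoice) from:(alice@example.com) after:...'.
print(service.parse_query(query.dict()))

emails = service.get(query, max_results=5)  # list of {subject, from, to, cc, content, snippet}
service.collect(query)                      # index the matching mail into the vector store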