Refactor email handling: remove mail module, update service routes, and enhance EmailQuery model with additional parameters
Browse files- .gitignore +1 -0
- app/controllers/__init__.py +0 -0
- app/controllers/mail.py +0 -318
- app/main.py +2 -2
- app/models/db/__init__.py +7 -5
- app/models/llm/__init__.py +1 -82
- app/router/mail.py +0 -75
- app/router/service.py +40 -0
- app/schema/__init__.py +44 -5
- app/services/__init__.py +6 -0
- app/services/gmail.py +322 -0
.gitignore
CHANGED
@@ -183,3 +183,4 @@ models/chroma/data/*.bin
|
|
183 |
models/chroma/_data/chroma.sqlite3
|
184 |
models/chroma/data/chroma.sqlite3
|
185 |
cache
|
|
|
|
183 |
models/chroma/_data/chroma.sqlite3
|
184 |
models/chroma/data/chroma.sqlite3
|
185 |
cache
|
186 |
+
_cache
|
app/controllers/__init__.py
DELETED
File without changes
|
app/controllers/mail.py
DELETED
@@ -1,318 +0,0 @@
|
|
1 |
-
"""Module to search and list emails from Gmail."""
|
2 |
-
import base64
|
3 |
-
import hashlib
|
4 |
-
import os
|
5 |
-
import re
|
6 |
-
from datetime import datetime, timedelta
|
7 |
-
from venv import logger
|
8 |
-
|
9 |
-
from ics import Calendar
|
10 |
-
from langchain_community.document_loaders import (
|
11 |
-
CSVLoader,
|
12 |
-
PyPDFLoader,
|
13 |
-
UnstructuredExcelLoader,
|
14 |
-
UnstructuredImageLoader,
|
15 |
-
)
|
16 |
-
from langchain_core.documents import Document
|
17 |
-
from models.db import vectorstore
|
18 |
-
|
19 |
-
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
|
20 |
-
EMAIL_PATTERN = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
|
21 |
-
|
22 |
-
ATTACHMENTS_DIR = "cache"
|
23 |
-
os.makedirs(ATTACHMENTS_DIR, exist_ok=True)
|
24 |
-
|
25 |
-
def build_query(params):
|
26 |
-
"""
|
27 |
-
Constructs a query string based on the provided parameters.
|
28 |
-
|
29 |
-
Args:
|
30 |
-
params (dict): A dictionary containing optional query parameters.
|
31 |
-
Supported keys include:
|
32 |
-
- 'subject' (str): The subject of the email.
|
33 |
-
- 'from' (str): The sender's email address.
|
34 |
-
- 'to' (str): The recipient's email address.
|
35 |
-
- 'cc' (str): The CC recipient's email address.
|
36 |
-
- 'after' (str): A date string to filter emails sent after this date.
|
37 |
-
- 'before' (str): A date string to filter emails sent before this date.
|
38 |
-
|
39 |
-
Returns:
|
40 |
-
str: A query string constructed from the provided parameters. Each parameter
|
41 |
-
is formatted as a key-value pair and joined by spaces. If a parameter is not
|
42 |
-
provided or is empty, it is excluded from the query string.
|
43 |
-
"""
|
44 |
-
query_parts = []
|
45 |
-
if 'subject' in params and params['subject']:
|
46 |
-
query_parts.append(f'subject:"{params["subject"]}"')
|
47 |
-
if 'from' in params and params['from']:
|
48 |
-
query_parts.append(f'from:{params["from"]}')
|
49 |
-
if 'to' in params and params['to']:
|
50 |
-
query_parts.append(f'to:{params["to"]}')
|
51 |
-
if 'cc' in params and params['cc']:
|
52 |
-
query_parts.append(f'cc:{params["cc"]}')
|
53 |
-
if 'after' in params and params['after']:
|
54 |
-
query_parts.append(f'after:{params["after"]}')
|
55 |
-
if 'before' in params and params['before']:
|
56 |
-
query_parts.append(f'before:{params["before"]}')
|
57 |
-
return ' '.join(query_parts)
|
58 |
-
|
59 |
-
def search_emails(service, query):
|
60 |
-
"""Search emails based on a query."""
|
61 |
-
result = service.users().messages().list(userId="me", q=query).execute()
|
62 |
-
messages = []
|
63 |
-
if "messages" in result:
|
64 |
-
messages.extend(result["messages"])
|
65 |
-
while "nextPageToken" in result:
|
66 |
-
page_token = result["nextPageToken"]
|
67 |
-
result = (
|
68 |
-
service.users().messages().list(userId="me", q=query, pageToken=page_token).execute()
|
69 |
-
)
|
70 |
-
if "messages" in result:
|
71 |
-
messages.extend(result["messages"])
|
72 |
-
return messages
|
73 |
-
|
74 |
-
|
75 |
-
def list_emails(service, messages):
|
76 |
-
"""
|
77 |
-
Processes a list of email messages, extracts metadata, decodes content, and handles attachments.
|
78 |
-
|
79 |
-
Args:
|
80 |
-
messages (list): A list of email message dictionaries, where each dictionary contains
|
81 |
-
at least an 'id' key representing the email's unique identifier.
|
82 |
-
|
83 |
-
Returns:
|
84 |
-
None: The function processes the emails and adds the extracted documents to a vector store.
|
85 |
-
|
86 |
-
Functionality:
|
87 |
-
- Retrieves email details using the Gmail API.
|
88 |
-
- Extracts metadata such as sender, recipient, subject, CC, and date.
|
89 |
-
- Decodes email content in plain text or HTML format.
|
90 |
-
- Handles multipart emails, including attachments.
|
91 |
-
- Processes attachments based on their MIME type:
|
92 |
-
- PDF files are loaded using PyPDFLoader.
|
93 |
-
- Images (PNG, JPEG) are loaded using UnstructuredImageLoader.
|
94 |
-
- CSV files are loaded using CSVLoader.
|
95 |
-
- Excel files are loaded using UnstructuredExcelLoader.
|
96 |
-
- Calendar files (ICS) are parsed to extract event details.
|
97 |
-
- Removes HTML tags from email content.
|
98 |
-
- Stores processed documents and metadata in a vector store.
|
99 |
-
- Deletes temporary files created during attachment processing.
|
100 |
-
|
101 |
-
Notes:
|
102 |
-
- The function assumes the existence of a global `service` object for Gmail API.
|
103 |
-
- The `vectorstore.add_documents` method is used to store the processed documents.
|
104 |
-
- Attachments are temporarily saved in `ATTACHMENTS_DIR` and deleted after processing.
|
105 |
-
- The function logs information about attachments being downloaded.
|
106 |
-
"""
|
107 |
-
ids = []
|
108 |
-
documents = []
|
109 |
-
for message in messages:
|
110 |
-
msg = service.users().messages().get(userId="me", id=message["id"], format="full").execute()
|
111 |
-
metadata = {}
|
112 |
-
metadata["threadId"] = msg["threadId"]
|
113 |
-
metadata["msgId"] = msg["id"]
|
114 |
-
msgId = f"{msg['threadId']}-{msg['id']}"
|
115 |
-
for header in msg["payload"]["headers"]:
|
116 |
-
if header["name"] == "From":
|
117 |
-
metadata["from"] = header["value"]
|
118 |
-
elif header["name"] == "To":
|
119 |
-
metadata["to"] = header["value"]
|
120 |
-
elif header["name"] == "Subject":
|
121 |
-
metadata["subject"] = header["value"]
|
122 |
-
logger.info("subject: %s", metadata["subject"])
|
123 |
-
elif header["name"] == "Cc":
|
124 |
-
metadata["cc"] = header["value"]
|
125 |
-
metadata["date"] = datetime.fromtimestamp(int(msg["internalDate"]) / 1000).strftime(
|
126 |
-
"%d/%m/%Y %H:%M:%S"
|
127 |
-
)
|
128 |
-
metadata["userId"] = service.users().getProfile(userId="me").execute().get("emailAddress")
|
129 |
-
ids = []
|
130 |
-
documents = []
|
131 |
-
mime_types = []
|
132 |
-
if msg["payload"]["mimeType"] in [
|
133 |
-
"multipart/alternative",
|
134 |
-
"multipart/related",
|
135 |
-
"multipart/mixed",
|
136 |
-
]:
|
137 |
-
mime_types = []
|
138 |
-
attach_docs = []
|
139 |
-
for part in msg["payload"]["parts"]:
|
140 |
-
print("mimeType: ", part["mimeType"])
|
141 |
-
mime_types.append(part["mimeType"])
|
142 |
-
if part["mimeType"] == "text/plain" and "text/html" not in mime_types:
|
143 |
-
body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
|
144 |
-
body = re.sub(r"<[^>]+>", "", body) # Remove HTML tags
|
145 |
-
metadata["mimeType"] = part["mimeType"]
|
146 |
-
documents.append(Document(page_content=body, metadata=metadata))
|
147 |
-
ids.append(msg["id"])
|
148 |
-
elif part["mimeType"] == "text/html" and "text/plain" not in mime_types:
|
149 |
-
body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
|
150 |
-
body = re.sub(r"<[^>]+>", "", body)
|
151 |
-
metadata["mimeType"] = part["mimeType"]
|
152 |
-
documents.append(Document(page_content=body, metadata=metadata))
|
153 |
-
ids.append(msg["id"])
|
154 |
-
if part["filename"]:
|
155 |
-
attachment_id = part["body"]["attachmentId"]
|
156 |
-
logger.info("Downloading attachment: %s", part["filename"])
|
157 |
-
attachment = (
|
158 |
-
service.users()
|
159 |
-
.messages()
|
160 |
-
.attachments()
|
161 |
-
.get(userId="me", messageId=message["id"], id=attachment_id)
|
162 |
-
.execute()
|
163 |
-
)
|
164 |
-
file_data = base64.urlsafe_b64decode(attachment["data"].encode("UTF-8"))
|
165 |
-
path = os.path.join(".", ATTACHMENTS_DIR, part["filename"])
|
166 |
-
with open(path, "wb") as f:
|
167 |
-
f.write(file_data)
|
168 |
-
if part["mimeType"] == "application/pdf":
|
169 |
-
attach_docs = PyPDFLoader(path).load()
|
170 |
-
elif part["mimeType"] == "image/png" or part["mimeType"] == "image/jpeg":
|
171 |
-
try:
|
172 |
-
attach_docs = UnstructuredImageLoader(path).load()
|
173 |
-
except Exception as e:
|
174 |
-
logger.error("Error loading image: %s", e)
|
175 |
-
elif part["filename"].endswith(".csv"):
|
176 |
-
attach_docs = CSVLoader(path).load()
|
177 |
-
elif (
|
178 |
-
part["mimeType"]
|
179 |
-
== "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
180 |
-
):
|
181 |
-
attach_docs = UnstructuredExcelLoader(path).load()
|
182 |
-
elif part["mimeType"] == "application/ics":
|
183 |
-
with open(path, "r", encoding="utf-8") as f:
|
184 |
-
calendar = Calendar(f.read())
|
185 |
-
for event in calendar.events:
|
186 |
-
documents.append(
|
187 |
-
Document(
|
188 |
-
page_content=f"Event: {event.name}\n\Description: {event.description}\nStart: {event.begin}\nEnd: {event.end}",
|
189 |
-
metadata={
|
190 |
-
"attachment": part["filename"],
|
191 |
-
"mimeType": part["mimeType"],
|
192 |
-
"location": event.location,
|
193 |
-
"created": event.created.strftime("%d/%m/%Y %H:%M:%S"),
|
194 |
-
"last_modified": event.last_modified.strftime(
|
195 |
-
"%d/%m/%Y %H:%M:%S"
|
196 |
-
),
|
197 |
-
"start": event.begin.strftime("%d/%m/%Y %H:%M:%S"),
|
198 |
-
"end": event.end.strftime("%d/%m/%Y %H:%M:%S"),
|
199 |
-
},
|
200 |
-
)
|
201 |
-
)
|
202 |
-
ids.append(f"{msgId}-{part['filename']}-{hashlib.sha256(file_data).hexdigest()}")
|
203 |
-
if os.path.exists(path):
|
204 |
-
os.remove(path)
|
205 |
-
for index, document in enumerate(attach_docs or []):
|
206 |
-
document.metadata["mimeType"] = part["mimeType"]
|
207 |
-
if "page_label" in document.metadata:
|
208 |
-
document.metadata["page"] = document.metadata["page_label"]
|
209 |
-
document.metadata["attachment"] = part["filename"]
|
210 |
-
document.metadata = {
|
211 |
-
key: value
|
212 |
-
for key, value in document.metadata.items()
|
213 |
-
if key in ["attachment", "page"]
|
214 |
-
}
|
215 |
-
document.metadata.update(metadata)
|
216 |
-
documents.append(document)
|
217 |
-
ids.append(f"{msgId}-{hashlib.sha256(file_data).hexdigest()}-{index}")
|
218 |
-
elif msg["payload"]["mimeType"] == "text/plain" and "data" in msg["payload"]["body"]:
|
219 |
-
body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
|
220 |
-
body = re.sub(r"<[^>]+>", "", body)
|
221 |
-
metadata["mimeType"] = msg["payload"]["mimeType"]
|
222 |
-
documents.append(Document(page_content=body, metadata=metadata))
|
223 |
-
ids.append(msgId)
|
224 |
-
elif msg["payload"]["mimeType"] == "text/html" and "data" in msg["payload"]["body"]:
|
225 |
-
body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
|
226 |
-
body = re.sub(r"<[^>]+>", "", body)
|
227 |
-
metadata["mimeType"] = msg["payload"]["mimeType"]
|
228 |
-
documents.append(Document(page_content=body, metadata=metadata))
|
229 |
-
ids.append(msgId)
|
230 |
-
if "multipart/alternative" in mime_types and len(mime_types) == 1:
|
231 |
-
print("Only multipart/alternative found in the email.")
|
232 |
-
else:
|
233 |
-
try:
|
234 |
-
vectorstore.add_documents(documents=documents, ids=ids)
|
235 |
-
except Exception as e:
|
236 |
-
logger.error("Error adding documents to vectorstore: %s", e)
|
237 |
-
|
238 |
-
|
239 |
-
def collect(service, query=(datetime.today() - timedelta(days=10)).strftime("after:%Y/%m/%d")):
|
240 |
-
"""
|
241 |
-
Main function to search and list emails from Gmail.
|
242 |
-
|
243 |
-
This function builds a Gmail service, constructs a query to search for emails
|
244 |
-
received in the last 14 days, and lists the found emails. If no emails are found,
|
245 |
-
it prints a message indicating so.
|
246 |
-
|
247 |
-
Returns:
|
248 |
-
None
|
249 |
-
"""
|
250 |
-
# query = "subject:Re: Smartcareers algorithm debug and improvement'"
|
251 |
-
emails = search_emails(service, query)
|
252 |
-
if emails:
|
253 |
-
logger.info("Found %d emails:\n", len(emails))
|
254 |
-
logger.info("Found %d emails after two_weeks_ago:\n", len(emails))
|
255 |
-
list_emails(service, emails)
|
256 |
-
logger.info("Listing emails...")
|
257 |
-
return f"{len(emails)} emails added to the collection."
|
258 |
-
else:
|
259 |
-
logger.info("No emails found after two weeks ago.")
|
260 |
-
|
261 |
-
|
262 |
-
def get_emails(service, query, max_results=10):
|
263 |
-
"""
|
264 |
-
Retrieve a list of emails with subject, to, from, cc, and content.
|
265 |
-
|
266 |
-
Args:
|
267 |
-
mailservice: Authenticated Gmail API service instance
|
268 |
-
max_results: Maximum number of emails to retrieve
|
269 |
-
|
270 |
-
Returns:
|
271 |
-
List of dictionaries containing email details
|
272 |
-
"""
|
273 |
-
try:
|
274 |
-
# List messages
|
275 |
-
query = build_query(query.dict())
|
276 |
-
response = service.users().messages().list(
|
277 |
-
userId='me', q=query, maxResults=max_results).execute()
|
278 |
-
messages = response.get('messages', [])
|
279 |
-
email_list = []
|
280 |
-
if not messages:
|
281 |
-
return email_list
|
282 |
-
for message in messages:
|
283 |
-
msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
|
284 |
-
headers = msg['payload']['headers']
|
285 |
-
email_data = {
|
286 |
-
'subject': '',
|
287 |
-
'from': '',
|
288 |
-
'to': '',
|
289 |
-
'cc': '',
|
290 |
-
'content': '',
|
291 |
-
'snippet': msg['snippet'] if 'snippet' in msg else '',
|
292 |
-
}
|
293 |
-
for header in headers:
|
294 |
-
name = header['name'].lower()
|
295 |
-
if name == 'subject':
|
296 |
-
email_data['subject'] = header['value']
|
297 |
-
elif name == 'from':
|
298 |
-
email_data['from'] = header['value']
|
299 |
-
elif name == 'to':
|
300 |
-
email_data['to'] = header['value']
|
301 |
-
elif name == 'cc':
|
302 |
-
email_data['cc'] = header['value']
|
303 |
-
if 'parts' in msg['payload']:
|
304 |
-
for part in msg['payload']['parts']:
|
305 |
-
if part['mimeType'] == 'text/plain':
|
306 |
-
email_data['content'] = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
|
307 |
-
break
|
308 |
-
elif part['mimeType'] == 'text/html':
|
309 |
-
email_data['content'] = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
|
310 |
-
break
|
311 |
-
elif 'data' in msg['payload']['body']:
|
312 |
-
email_data['content'] = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
|
313 |
-
email_list.append(email_data)
|
314 |
-
return email_list
|
315 |
-
|
316 |
-
except Exception as e:
|
317 |
-
print(f"An error occurred: {e}")
|
318 |
-
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/main.py
CHANGED
@@ -4,7 +4,7 @@ import logging
|
|
4 |
from fastapi import FastAPI, Request
|
5 |
from fastapi.middleware.cors import CORSMiddleware
|
6 |
from jose import jwt
|
7 |
-
from router import auth, content,
|
8 |
from starlette.middleware.base import BaseHTTPMiddleware
|
9 |
|
10 |
SECRET_KEY = "your-secret-key"
|
@@ -65,7 +65,7 @@ logging.getLogger().setLevel(logging.INFO)
|
|
65 |
app = FastAPI(docs_url="/")
|
66 |
|
67 |
app.include_router(content.router)
|
68 |
-
app.include_router(
|
69 |
app.include_router(auth.router)
|
70 |
|
71 |
origins = [
|
|
|
4 |
from fastapi import FastAPI, Request
|
5 |
from fastapi.middleware.cors import CORSMiddleware
|
6 |
from jose import jwt
|
7 |
+
from router import auth, content, service
|
8 |
from starlette.middleware.base import BaseHTTPMiddleware
|
9 |
|
10 |
SECRET_KEY = "your-secret-key"
|
|
|
65 |
app = FastAPI(docs_url="/")
|
66 |
|
67 |
app.include_router(content.router)
|
68 |
+
app.include_router(service.router)
|
69 |
app.include_router(auth.router)
|
70 |
|
71 |
origins = [
|
app/models/db/__init__.py
CHANGED
@@ -1,12 +1,14 @@
|
|
1 |
"""This module is responsible for initializing the database connection and creating the necessary tables."""
|
2 |
from pinecone import Pinecone, ServerlessSpec
|
3 |
from langchain_pinecone import PineconeVectorStore
|
4 |
-
from
|
|
|
5 |
|
6 |
-
embeddings = EmbeddingsModel("all-MiniLM-L6-v2")
|
|
|
7 |
|
8 |
pc = Pinecone()
|
9 |
-
INDEX_NAME = "
|
10 |
if not pc.has_index(INDEX_NAME):
|
11 |
pc.create_index(
|
12 |
name=INDEX_NAME,
|
@@ -15,7 +17,7 @@ if not pc.has_index(INDEX_NAME):
|
|
15 |
spec=ServerlessSpec(
|
16 |
cloud="aws",
|
17 |
region="us-east-1"
|
18 |
-
)
|
19 |
)
|
20 |
index = pc.Index(INDEX_NAME)
|
21 |
-
vectorstore = PineconeVectorStore(index=index, embedding=embeddings)
|
|
|
1 |
"""This module is responsible for initializing the database connection and creating the necessary tables."""
|
2 |
from pinecone import Pinecone, ServerlessSpec
|
3 |
from langchain_pinecone import PineconeVectorStore
|
4 |
+
# from torch import embedding
|
5 |
+
from models.llm import GPTEmbeddings
|
6 |
|
7 |
+
# embeddings = EmbeddingsModel("all-MiniLM-L6-v2")
|
8 |
+
embeddings = GPTEmbeddings()
|
9 |
|
10 |
pc = Pinecone()
|
11 |
+
INDEX_NAME = "gmails"
|
12 |
if not pc.has_index(INDEX_NAME):
|
13 |
pc.create_index(
|
14 |
name=INDEX_NAME,
|
|
|
17 |
spec=ServerlessSpec(
|
18 |
cloud="aws",
|
19 |
region="us-east-1"
|
20 |
+
)
|
21 |
)
|
22 |
index = pc.Index(INDEX_NAME)
|
23 |
+
vectorstore = PineconeVectorStore(index=index, embedding=embeddings)
|
app/models/llm/__init__.py
CHANGED
@@ -1,13 +1,8 @@
|
|
1 |
"""Module for OpenAI model and embeddings."""
|
2 |
-
# import os
|
3 |
from typing import List
|
4 |
-
# import onnxruntime as ort
|
5 |
from langchain.embeddings.base import Embeddings
|
6 |
from sentence_transformers import SentenceTransformer
|
7 |
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
|
8 |
-
# from langchain_huggingface import HuggingFacePipeline
|
9 |
-
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
10 |
-
# from huggingface_hub import hf_hub_download
|
11 |
|
12 |
class GPTModel(AzureChatOpenAI):
|
13 |
"""
|
@@ -40,67 +35,6 @@ class GPTEmbeddings(AzureOpenAIEmbeddings):
|
|
40 |
Inherits all methods from AzureOpenAIEmbeddings.
|
41 |
"""
|
42 |
|
43 |
-
# class Phi4MiniONNXLLM:
|
44 |
-
# """
|
45 |
-
# A class for interfacing with a pre-trained ONNX model for inference.
|
46 |
-
|
47 |
-
# Attributes:
|
48 |
-
# session (onnxruntime.InferenceSession): The ONNX runtime inference session for the model.
|
49 |
-
# input_name (str): The name of the input node in the ONNX model.
|
50 |
-
# output_name (str): The name of the output node in the ONNX model.
|
51 |
-
|
52 |
-
# Methods:
|
53 |
-
# __init__(model_path):
|
54 |
-
# Initializes the Phi4MiniONNXLLM instance by loading the ONNX model from specified path.
|
55 |
-
|
56 |
-
# __call__(input_ids):
|
57 |
-
# Performs inference on the given input data and returns the model's output.
|
58 |
-
# """
|
59 |
-
# def __init__(self, repo_id, subfolder, onnx_file="model.onnx", weights_file="model.onnx.data"):
|
60 |
-
# self.repo_id = repo_id
|
61 |
-
# model_path = hf_hub_download(repo_id=repo_id, filename=f"{subfolder}/{onnx_file}")
|
62 |
-
# weights_path = hf_hub_download(repo_id=repo_id, filename=f"{subfolder}/{weights_file}")
|
63 |
-
# self.session = ort.InferenceSession(model_path)
|
64 |
-
# # Verify both files exist
|
65 |
-
# print(f"Model path: {model_path}, Exists: {os.path.exists(model_path)}")
|
66 |
-
# print(f"Weights path: {weights_path}, Exists: {os.path.exists(weights_path)}")
|
67 |
-
# self.input_name = self.session.get_inputs()[0].name
|
68 |
-
# self.output_name = self.session.get_outputs()[0].name
|
69 |
-
|
70 |
-
# def __call__(self, input_text):
|
71 |
-
# # Assuming input_ids is a tensor or numpy array
|
72 |
-
# tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-mini-instruct-onnx")
|
73 |
-
# inputs = tokenizer(input_text, return_tensors="pt")
|
74 |
-
# input_feed = {
|
75 |
-
# self.input_name: inputs["input_ids"].numpy(),
|
76 |
-
# "attention_mask": inputs["attention_mask"].numpy(),
|
77 |
-
# # Add past_key_values if applicable
|
78 |
-
# }
|
79 |
-
# outputs = self.session.run([self.output_name], input_feed)
|
80 |
-
# return outputs
|
81 |
-
|
82 |
-
# class HuggingfaceModel(HuggingFacePipeline):
|
83 |
-
# """
|
84 |
-
# HuggingfaceModel is a wrapper class for the Hugging Face text-generation pipeline.
|
85 |
-
|
86 |
-
# Attributes:
|
87 |
-
# name (str): The name or path of the pre-trained model to load from Hugging Face.
|
88 |
-
# max_tokens (int): The maximum number of new tokens to generate in the text output.
|
89 |
-
# Defaults to 200.
|
90 |
-
|
91 |
-
# Methods:
|
92 |
-
# __init__(name, max_tokens=200):
|
93 |
-
# Initializes the HuggingfaceModel with the specified model name and maximum token limit.
|
94 |
-
# """
|
95 |
-
# def __init__(self, name, max_tokens=500):
|
96 |
-
# super().__init__(pipeline=pipeline(
|
97 |
-
# "text-generation",
|
98 |
-
# model=AutoModelForCausalLM.from_pretrained(name),
|
99 |
-
# tokenizer=AutoTokenizer.from_pretrained(name),
|
100 |
-
# max_new_tokens=max_tokens
|
101 |
-
# )
|
102 |
-
# )
|
103 |
-
|
104 |
class EmbeddingsModel(Embeddings):
|
105 |
"""
|
106 |
A model for generating embeddings using SentenceTransformer.
|
@@ -113,7 +47,7 @@ class EmbeddingsModel(Embeddings):
|
|
113 |
Initializes the Chroma model with the specified model name.
|
114 |
|
115 |
Args:
|
116 |
-
model_name (str): The name of the model to be used for
|
117 |
"""
|
118 |
self.model = SentenceTransformer(model_name)
|
119 |
|
@@ -140,18 +74,3 @@ class EmbeddingsModel(Embeddings):
|
|
140 |
List[float]: The embedded representation of the query as a list of floats.
|
141 |
"""
|
142 |
return self.model.encode([query]).tolist()[0]
|
143 |
-
|
144 |
-
# model_name = "microsoft/phi-1_5"
|
145 |
-
# tokenizer = AutoTokenizer.from_pretrained(model_name)
|
146 |
-
# model = AutoModelForCausalLM.from_pretrained(model_name)
|
147 |
-
# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
|
148 |
-
|
149 |
-
# phi4_llm = HuggingFacePipeline(pipeline=pipe)
|
150 |
-
|
151 |
-
# tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", pad_token_id=50256)
|
152 |
-
# model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
|
153 |
-
# pipe = pipeline(
|
154 |
-
# "text-generation", model=model, tokenizer=tokenizer,
|
155 |
-
# max_new_tokens=10, truncation=True, # Truncate input sequences
|
156 |
-
# )
|
157 |
-
# phi4_llm = HuggingFacePipeline(pipeline=pipe)
|
|
|
1 |
"""Module for OpenAI model and embeddings."""
|
|
|
2 |
from typing import List
|
|
|
3 |
from langchain.embeddings.base import Embeddings
|
4 |
from sentence_transformers import SentenceTransformer
|
5 |
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
|
|
|
|
|
|
|
6 |
|
7 |
class GPTModel(AzureChatOpenAI):
|
8 |
"""
|
|
|
35 |
Inherits all methods from AzureOpenAIEmbeddings.
|
36 |
"""
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
class EmbeddingsModel(Embeddings):
|
39 |
"""
|
40 |
A model for generating embeddings using SentenceTransformer.
|
|
|
47 |
Initializes the Chroma model with the specified model name.
|
48 |
|
49 |
Args:
|
50 |
+
model_name (str): The name of the model to be used for embedding.
|
51 |
"""
|
52 |
self.model = SentenceTransformer(model_name)
|
53 |
|
|
|
74 |
List[float]: The embedded representation of the query as a list of floats.
|
75 |
"""
|
76 |
return self.model.encode([query]).tolist()[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/router/mail.py
DELETED
@@ -1,75 +0,0 @@
|
|
1 |
-
"""Module for defining the main routes of the API."""
|
2 |
-
import os
|
3 |
-
import pickle
|
4 |
-
import threading
|
5 |
-
from venv import logger
|
6 |
-
from fastapi import APIRouter, Request
|
7 |
-
from fastapi.responses import JSONResponse
|
8 |
-
|
9 |
-
from controllers import mail
|
10 |
-
from google.oauth2.credentials import Credentials
|
11 |
-
from googleapiclient.discovery import build
|
12 |
-
|
13 |
-
from schema import MailReqData
|
14 |
-
|
15 |
-
router = APIRouter(prefix="/mail", tags=["mail"])
|
16 |
-
|
17 |
-
@router.post("")
|
18 |
-
def collect(query: MailReqData, request: Request):
|
19 |
-
"""
|
20 |
-
Handles the chat POST request.
|
21 |
-
|
22 |
-
Args:
|
23 |
-
query (ReqData): The request data containing the query parameters.
|
24 |
-
|
25 |
-
Returns:
|
26 |
-
str: The generated response from the chat function.
|
27 |
-
"""
|
28 |
-
try:
|
29 |
-
if os.path.exists(f"cache/{query.email}.pickle"):
|
30 |
-
with open(f"cache/{query.email}.pickle", "rb") as token:
|
31 |
-
credentials = pickle.load(token)
|
32 |
-
else:
|
33 |
-
cred_dict = request.state.session.get("credential")
|
34 |
-
credentials = Credentials(
|
35 |
-
token=cred_dict["token"],
|
36 |
-
refresh_token=cred_dict["refresh_token"],
|
37 |
-
token_uri=cred_dict["token_uri"],
|
38 |
-
client_id=cred_dict["client_id"],
|
39 |
-
client_secret=cred_dict["client_secret"],
|
40 |
-
scopes=cred_dict["scopes"],
|
41 |
-
)
|
42 |
-
mailservice = build("gmail", "v1", credentials=credentials)
|
43 |
-
threading.Thread(target=mail.collect, args=(mailservice, query.query)).start()
|
44 |
-
return JSONResponse(content={"message": "Mail collection in progress."})
|
45 |
-
except Exception as e:
|
46 |
-
logger.error("Error collecting mail: %s", e)
|
47 |
-
return JSONResponse(content={"error": str(e)}, status_code=500)
|
48 |
-
|
49 |
-
@router.get("")
|
50 |
-
def get(query: MailReqData, request: Request):
|
51 |
-
"""
|
52 |
-
Handles the chat POST request.
|
53 |
-
|
54 |
-
Args:
|
55 |
-
query (ReqData): The request data containing the query parameters.
|
56 |
-
|
57 |
-
Returns:
|
58 |
-
str: The generated response from the chat function.
|
59 |
-
"""
|
60 |
-
if os.path.exists(f"cache/{query.email}.pickle"):
|
61 |
-
with open(f"cache/{query.email}.pickle", "rb") as token:
|
62 |
-
credentials = pickle.load(token)
|
63 |
-
else:
|
64 |
-
cred_dict = request.state.session.get("credential")
|
65 |
-
credentials = Credentials(
|
66 |
-
token=cred_dict["token"],
|
67 |
-
refresh_token=cred_dict["refresh_token"],
|
68 |
-
token_uri=cred_dict["token_uri"],
|
69 |
-
client_id=cred_dict["client_id"],
|
70 |
-
client_secret=cred_dict["client_secret"],
|
71 |
-
scopes=cred_dict["scopes"],
|
72 |
-
)
|
73 |
-
mailservice = build("gmail", "v1", credentials=credentials)
|
74 |
-
result = mail.get_emails(mailservice, query.query, query.query.max_results)
|
75 |
-
return JSONResponse(content= result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/router/service.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Module for defining the main routes of the API."""
|
2 |
+
import threading
|
3 |
+
from fastapi import APIRouter, Request
|
4 |
+
from fastapi.responses import JSONResponse
|
5 |
+
|
6 |
+
from services import GmailService
|
7 |
+
|
8 |
+
from schema import EmailQuery
|
9 |
+
|
10 |
+
router = APIRouter(prefix="/service", tags=["mail"])
|
11 |
+
|
12 |
+
@router.post("/gmail")
|
13 |
+
def collect(query: EmailQuery, request: Request) -> JSONResponse:
|
14 |
+
"""
|
15 |
+
Handles the chat POST request.
|
16 |
+
|
17 |
+
Args:
|
18 |
+
query (ReqData): The request data containing the query parameters.
|
19 |
+
|
20 |
+
Returns:
|
21 |
+
str: The generated response from the chat function.
|
22 |
+
"""
|
23 |
+
service = GmailService(request.headers.get("Google-Token"))
|
24 |
+
threading.Thread(target=service.collect, args=[query]).start()
|
25 |
+
return JSONResponse(content={"message": "Mail collection in progress."})
|
26 |
+
|
27 |
+
@router.get("/gmail")
|
28 |
+
def get(query: EmailQuery, request: Request) -> JSONResponse:
|
29 |
+
"""
|
30 |
+
Handles the chat POST request.
|
31 |
+
|
32 |
+
Args:
|
33 |
+
query (ReqData): The request data containing the query parameters.
|
34 |
+
|
35 |
+
Returns:
|
36 |
+
str: The generated response from the chat function.
|
37 |
+
"""
|
38 |
+
service = GmailService(request.headers.get("Google-Token"))
|
39 |
+
result = service.get(query, query.max_results)
|
40 |
+
return JSONResponse(content = result)
|
app/schema/__init__.py
CHANGED
@@ -1,7 +1,10 @@
|
|
1 |
"""Module containing the data models for the application."""
|
2 |
-
from
|
|
|
|
|
3 |
from pydantic import BaseModel, Field
|
4 |
|
|
|
5 |
class EmailQuery(BaseModel):
|
6 |
"""
|
7 |
EmailQuery model representing the structure of an email query.
|
@@ -11,14 +14,52 @@ class EmailQuery(BaseModel):
|
|
11 |
from_email (Optional[str]): The sender's email address.
|
12 |
to_email (Optional[str]): The recipient's email address.
|
13 |
cc_email (Optional[str]): The CC email address.
|
|
|
|
|
|
|
|
|
14 |
after (Optional[str]): The date after which to search for emails.
|
15 |
max_results (Optional[int]): The maximum number of results to return.
|
16 |
"""
|
17 |
-
subject: Optional[str]
|
18 |
from_email: Optional[str] = Field(None, alias="from")
|
19 |
to_email: Optional[str] = Field(None, alias="to")
|
20 |
cc_email: Optional[str] = Field(None, alias="cc")
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
max_results: Optional[int] = 10
|
23 |
|
24 |
class ReqData(BaseModel):
|
@@ -43,10 +84,8 @@ class MailReqData(BaseModel):
|
|
43 |
MailReqData is a data model representing the structure of a mail request.
|
44 |
|
45 |
Attributes:
|
46 |
-
email (str): The email address of the sender.
|
47 |
query (str): The query or message content sent by the user.
|
48 |
"""
|
49 |
-
email: str
|
50 |
query: EmailQuery
|
51 |
|
52 |
class ReqFollowUp(BaseModel):
|
|
|
1 |
"""Module containing the data models for the application."""
|
2 |
+
from datetime import datetime, timedelta
|
3 |
+
from typing import List, Optional
|
4 |
+
|
5 |
from pydantic import BaseModel, Field
|
6 |
|
7 |
+
|
8 |
class EmailQuery(BaseModel):
|
9 |
"""
|
10 |
EmailQuery model representing the structure of an email query.
|
|
|
14 |
from_email (Optional[str]): The sender's email address.
|
15 |
to_email (Optional[str]): The recipient's email address.
|
16 |
cc_email (Optional[str]): The CC email address.
|
17 |
+
has_words (Optional[str]): Words that the email must contain.
|
18 |
+
not_has_words (Optional[str]): Words that the email must not contain.
|
19 |
+
size (Optional[int]): The size of the email in bytes.
|
20 |
+
date_within (Optional[str]): The date within which to search for emails.
|
21 |
after (Optional[str]): The date after which to search for emails.
|
22 |
max_results (Optional[int]): The maximum number of results to return.
|
23 |
"""
|
24 |
+
subject: Optional[str] = None
|
25 |
from_email: Optional[str] = Field(None, alias="from")
|
26 |
to_email: Optional[str] = Field(None, alias="to")
|
27 |
cc_email: Optional[str] = Field(None, alias="cc")
|
28 |
+
has_words: Optional[str] = None
|
29 |
+
not_has_words: Optional[str] = None
|
30 |
+
size: Optional[int] = None
|
31 |
+
before: Optional[str] = None
|
32 |
+
after: Optional[str] = None
|
33 |
+
|
34 |
+
@classmethod
|
35 |
+
def validate_before_after(
|
36 |
+
cls, before: Optional[str], after: Optional[str]) -> tuple[Optional[str], Optional[str]]:
|
37 |
+
"""
|
38 |
+
Validates and adjusts the 'before' and 'after' date parameters.
|
39 |
+
|
40 |
+
This method ensures that the 'before' date is greater than the 'after' date.
|
41 |
+
If 'before' is not provided, it defaults to six months prior to the current date.
|
42 |
+
|
43 |
+
Args:
|
44 |
+
before (Optional[str]): The 'before' date in the format "YYYY/MM/DD". Defaults to None.
|
45 |
+
after (Optional[str]): The 'after' date in the format "YYYY/MM/DD". Defaults to None.
|
46 |
+
|
47 |
+
Returns:
|
48 |
+
tuple[Optional[str], Optional[str]]:
|
49 |
+
A tuple containing the validated 'before' and 'after' dates.
|
50 |
+
|
51 |
+
Raises:
|
52 |
+
ValueError: If the 'before' date is not greater than the 'after' date.
|
53 |
+
"""
|
54 |
+
if after is None:
|
55 |
+
after = (datetime.now() - timedelta(days=6 * 30)).strftime("%Y/%m/%d")
|
56 |
+
if before and before >= after:
|
57 |
+
raise ValueError("The 'before' date must be greater than the 'after' date.")
|
58 |
+
return before, after
|
59 |
+
|
60 |
+
def __init__(self, **data):
|
61 |
+
super().__init__(**data)
|
62 |
+
self.before, self.after = self.validate_before_after(self.before, self.after)
|
63 |
max_results: Optional[int] = 10
|
64 |
|
65 |
class ReqData(BaseModel):
|
|
|
84 |
MailReqData is a data model representing the structure of a mail request.
|
85 |
|
86 |
Attributes:
|
|
|
87 |
query (str): The query or message content sent by the user.
|
88 |
"""
|
|
|
89 |
query: EmailQuery
|
90 |
|
91 |
class ReqFollowUp(BaseModel):
|
app/services/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Module for database operations."""
|
2 |
+
from services.gmail import GmailService
|
3 |
+
|
4 |
+
__all__ = [
|
5 |
+
'GmailService'
|
6 |
+
]
|
app/services/gmail.py
ADDED
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This module provides a utility class, `GmailService`, for interacting with the Gmail API.
|
3 |
+
"""
|
4 |
+
import base64
|
5 |
+
import hashlib
|
6 |
+
import os
|
7 |
+
import re
|
8 |
+
from datetime import datetime
|
9 |
+
from venv import logger
|
10 |
+
|
11 |
+
from google.oauth2.credentials import Credentials
|
12 |
+
from googleapiclient.discovery import build
|
13 |
+
from ics import Calendar
|
14 |
+
from langchain_community.document_loaders import (
|
15 |
+
CSVLoader,
|
16 |
+
PyPDFLoader,
|
17 |
+
UnstructuredExcelLoader,
|
18 |
+
UnstructuredImageLoader,
|
19 |
+
)
|
20 |
+
from langchain_core.documents import Document
|
21 |
+
from models.db import vectorstore
|
22 |
+
|
23 |
+
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
|
24 |
+
EMAIL_PATTERN = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
|
25 |
+
|
26 |
+
ATTACHMENTS_DIR = "cache"
|
27 |
+
os.makedirs(ATTACHMENTS_DIR, exist_ok=True)
|
28 |
+
|
29 |
+
class GmailService():
|
30 |
+
"""
|
31 |
+
GmailService is a utility class for interacting with the Gmail API. It provides methods to
|
32 |
+
construct query strings, search for emails, and retrieve detailed email information.
|
33 |
+
Methods:
|
34 |
+
__init__(token):
|
35 |
+
Initializes the GmailService instance with an authenticated Gmail API service.
|
36 |
+
build_query(params):
|
37 |
+
Constructs a query string based on the provided parameters for filtering emails.
|
38 |
+
search(query, max_results=10, check_next_page=False):
|
39 |
+
Searches for emails based on a query string and returns a list of message metadata.
|
40 |
+
get_emails(query, max_results=10):
|
41 |
+
Retrieves a list of emails with detailed information such as subject, sender,
|
42 |
+
recipients, and content.
|
43 |
+
Attributes:
|
44 |
+
service:
|
45 |
+
An authenticated Gmail API service instance used to interact with the Gmail API.
|
46 |
+
"""
|
47 |
+
def __init__(self, token):
|
48 |
+
"""
|
49 |
+
Initializes the Gmail controller with the provided token.
|
50 |
+
|
51 |
+
Args:
|
52 |
+
token (str): The auth token used to create credentials for accessing the Gmail API.
|
53 |
+
"""
|
54 |
+
self.service = build("gmail", "v1", credentials=Credentials(token=token))
|
55 |
+
|
56 |
+
def parse_query(self, params) -> str:
|
57 |
+
"""
|
58 |
+
Constructs a query string based on the provided parameters.
|
59 |
+
|
60 |
+
Args:
|
61 |
+
params (dict): A dictionary containing optional query parameters.
|
62 |
+
Supported keys include:
|
63 |
+
- 'subject' (str): The subject of the email.
|
64 |
+
- 'from' (str): The sender's email address.
|
65 |
+
- 'to' (str): The recipient's email address.
|
66 |
+
- 'cc' (str): The CC recipient's email address.
|
67 |
+
- 'after' (str): A date string to filter emails sent after this date.
|
68 |
+
- 'before' (str): A date string to filter emails sent before this date.
|
69 |
+
|
70 |
+
Returns:
|
71 |
+
str: A query string constructed from the provided parameters. Each parameter
|
72 |
+
is formatted as a key-value pair and joined by spaces. If a parameter is not
|
73 |
+
provided or is empty, it is excluded from the query string.
|
74 |
+
"""
|
75 |
+
query_parts = []
|
76 |
+
if 'subject' in params and params['subject']:
|
77 |
+
query_parts.append(f'subject:({params["subject"]})')
|
78 |
+
if 'from_email' in params and params['from_email']:
|
79 |
+
query_parts.append(f'from:({params["from_email"]})')
|
80 |
+
if 'to_email' in params and params['to_email']:
|
81 |
+
query_parts.append(f'to:({params["to_email"]})')
|
82 |
+
if 'cc_email' in params and params['cc_email']:
|
83 |
+
query_parts.append(f'cc:({params["cc_email"]})')
|
84 |
+
if 'after' in params and params['after']:
|
85 |
+
query_parts.append(f'after:{params["after"]}')
|
86 |
+
if 'before' in params and params['before']:
|
87 |
+
query_parts.append(f'before:{params["before"]}')
|
88 |
+
return ' '.join(query_parts)
|
89 |
+
|
90 |
+
def collect(self, query):
|
91 |
+
"""
|
92 |
+
Main function to search and list emails from Gmail.
|
93 |
+
|
94 |
+
This function builds a Gmail service, constructs a query to search for emails
|
95 |
+
received in the last 14 days, and lists the found emails. If no emails are found,
|
96 |
+
it prints a message indicating so.
|
97 |
+
|
98 |
+
Returns:
|
99 |
+
None
|
100 |
+
"""
|
101 |
+
ids = []
|
102 |
+
documents = []
|
103 |
+
for message in self.search(query):
|
104 |
+
msg = self.service.users().messages().get(
|
105 |
+
userId="me", id=message["id"], format="full").execute()
|
106 |
+
metadata = {}
|
107 |
+
metadata["threadId"] = msg["threadId"]
|
108 |
+
metadata["msgId"] = msg["id"]
|
109 |
+
msg_id = f"{msg['threadId']}-{msg['id']}"
|
110 |
+
for header in msg["payload"]["headers"]:
|
111 |
+
if header["name"] == "From":
|
112 |
+
metadata["from"] = header["value"]
|
113 |
+
elif header["name"] == "To":
|
114 |
+
metadata["to"] = header["value"]
|
115 |
+
elif header["name"] == "Subject":
|
116 |
+
metadata["subject"] = header["value"]
|
117 |
+
logger.info("subject: %s", metadata["subject"])
|
118 |
+
elif header["name"] == "Cc":
|
119 |
+
metadata["cc"] = header["value"]
|
120 |
+
metadata["date"] = datetime.fromtimestamp(int(msg["internalDate"]) / 1000).strftime(
|
121 |
+
"%d/%m/%Y %H:%M:%S"
|
122 |
+
)
|
123 |
+
metadata["userId"] = self.service.users().getProfile(
|
124 |
+
userId="me").execute().get("emailAddress")
|
125 |
+
ids = []
|
126 |
+
documents = []
|
127 |
+
mime_types = []
|
128 |
+
if msg["payload"]["mimeType"] in [
|
129 |
+
"multipart/alternative",
|
130 |
+
"multipart/related",
|
131 |
+
"multipart/mixed",
|
132 |
+
]:
|
133 |
+
mime_types = []
|
134 |
+
attach_docs = []
|
135 |
+
for part in msg["payload"]["parts"]:
|
136 |
+
mime_types.append(part["mimeType"])
|
137 |
+
if part["mimeType"] == "text/plain" and "text/html" not in mime_types:
|
138 |
+
body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
|
139 |
+
body = re.sub(r"<[^>]+>", "", body) # Remove HTML tags
|
140 |
+
metadata["mimeType"] = part["mimeType"]
|
141 |
+
documents.append(Document(page_content=body, metadata=metadata))
|
142 |
+
ids.append(msg["id"])
|
143 |
+
elif part["mimeType"] == "text/html" and "text/plain" not in mime_types:
|
144 |
+
body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
|
145 |
+
body = re.sub(r"<[^>]+>", "", body)
|
146 |
+
metadata["mimeType"] = part["mimeType"]
|
147 |
+
documents.append(Document(page_content=body, metadata=metadata))
|
148 |
+
ids.append(msg["id"])
|
149 |
+
if part["filename"]:
|
150 |
+
attachment_id = part["body"]["attachmentId"]
|
151 |
+
logger.info("Downloading attachment: %s", part["filename"])
|
152 |
+
attachment = (
|
153 |
+
self.service.users()
|
154 |
+
.messages()
|
155 |
+
.attachments()
|
156 |
+
.get(userId="me", messageId=message["id"], id=attachment_id)
|
157 |
+
.execute()
|
158 |
+
)
|
159 |
+
file_data = base64.urlsafe_b64decode(attachment["data"].encode("UTF-8"))
|
160 |
+
path = os.path.join(".", ATTACHMENTS_DIR, part["filename"])
|
161 |
+
with open(path, "wb") as f:
|
162 |
+
f.write(file_data)
|
163 |
+
if part["mimeType"] == "application/pdf":
|
164 |
+
attach_docs = PyPDFLoader(path).load()
|
165 |
+
elif part["mimeType"] == "image/png" or part["mimeType"] == "image/jpeg":
|
166 |
+
try:
|
167 |
+
attach_docs = UnstructuredImageLoader(path).load()
|
168 |
+
except ValueError as e: # Replace with the specific exception type
|
169 |
+
logger.error("Error loading image: %s", e)
|
170 |
+
elif part["filename"].endswith(".csv"):
|
171 |
+
attach_docs = CSVLoader(path).load()
|
172 |
+
elif (
|
173 |
+
part["mimeType"]
|
174 |
+
== "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
175 |
+
):
|
176 |
+
attach_docs = UnstructuredExcelLoader(path).load()
|
177 |
+
elif part["mimeType"] == "application/ics":
|
178 |
+
with open(path, "r", encoding="utf-8") as f:
|
179 |
+
calendar = Calendar(f.read())
|
180 |
+
for event in calendar.events:
|
181 |
+
documents.append(
|
182 |
+
Document(
|
183 |
+
page_content=f"Event: {event.name}\n\Description: {event.description}\nStart: {event.begin}\nEnd: {event.end}",
|
184 |
+
metadata={
|
185 |
+
"attachment": part["filename"],
|
186 |
+
"mimeType": part["mimeType"],
|
187 |
+
"location": event.location,
|
188 |
+
"created": event.created.strftime("%d/%m/%Y %H:%M:%S"),
|
189 |
+
"last_modified": event.last_modified.strftime(
|
190 |
+
"%d/%m/%Y %H:%M:%S"
|
191 |
+
),
|
192 |
+
"start": event.begin.strftime("%d/%m/%Y %H:%M:%S"),
|
193 |
+
"end": event.end.strftime("%d/%m/%Y %H:%M:%S"),
|
194 |
+
},
|
195 |
+
)
|
196 |
+
)
|
197 |
+
ids.append(
|
198 |
+
f"{msg_id}-{part['filename']}-{hashlib.sha256(file_data).hexdigest()}")
|
199 |
+
if os.path.exists(path):
|
200 |
+
os.remove(path)
|
201 |
+
for index, document in enumerate(attach_docs or []):
|
202 |
+
document.metadata["mimeType"] = part["mimeType"]
|
203 |
+
if "page_label" in document.metadata:
|
204 |
+
document.metadata["page"] = document.metadata["page_label"]
|
205 |
+
document.metadata["attachment"] = part["filename"]
|
206 |
+
document.metadata = {
|
207 |
+
key: value
|
208 |
+
for key, value in document.metadata.items()
|
209 |
+
if key in ["attachment", "page"]
|
210 |
+
}
|
211 |
+
document.metadata.update(metadata)
|
212 |
+
documents.append(document)
|
213 |
+
ids.append(f"{msg_id}-{hashlib.sha256(file_data).hexdigest()}-{index}")
|
214 |
+
elif msg["payload"]["mimeType"] == "text/plain" and "data" in msg["payload"]["body"]:
|
215 |
+
body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
|
216 |
+
body = re.sub(r"<[^>]+>", "", body)
|
217 |
+
metadata["mimeType"] = msg["payload"]["mimeType"]
|
218 |
+
documents.append(Document(page_content=body, metadata=metadata))
|
219 |
+
ids.append(msg_id)
|
220 |
+
elif msg["payload"]["mimeType"] == "text/html" and "data" in msg["payload"]["body"]:
|
221 |
+
body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
|
222 |
+
body = re.sub(r"<[^>]+>", "", body)
|
223 |
+
metadata["mimeType"] = msg["payload"]["mimeType"]
|
224 |
+
documents.append(Document(page_content=body, metadata=metadata))
|
225 |
+
ids.append(msg_id)
|
226 |
+
if "multipart/alternative" in mime_types and len(mime_types) == 1:
|
227 |
+
logger.info("Only multipart/alternative found in the email.")
|
228 |
+
else:
|
229 |
+
try:
|
230 |
+
vectorstore.add_documents(documents=documents, ids=ids)
|
231 |
+
except ValueError as e:
|
232 |
+
logger.error("Error adding documents to vectorstore: %s", e)
|
233 |
+
|
234 |
+
def search(self, query, max_results=10, check_next_page=False) -> list:
|
235 |
+
"""
|
236 |
+
Searches for Gmail messages based on a query string.
|
237 |
+
|
238 |
+
Args:
|
239 |
+
query (str): The search query string to filter messages.
|
240 |
+
max_results (int, optional): The maximum number of results to retrieve per page.
|
241 |
+
check_next_page (bool, optional): if to fetch additional pages of results if available.
|
242 |
+
|
243 |
+
Returns:
|
244 |
+
list: A list of message metadata dict. Each dictionary contains info about a message.
|
245 |
+
|
246 |
+
Notes:
|
247 |
+
- The `query` parameter supports Gmail's advanced search operators.
|
248 |
+
- If `check_next_page` is True, will continue fetching messages until all are retrieved.
|
249 |
+
"""
|
250 |
+
query = self.parse_query(query.dict())
|
251 |
+
result = self.service.users().messages().list(
|
252 |
+
userId='me', q=query, maxResults=max_results).execute()
|
253 |
+
messages = []
|
254 |
+
if "messages" in result:
|
255 |
+
messages.extend(result["messages"])
|
256 |
+
while "nextPageToken" in result and check_next_page:
|
257 |
+
page_token = result["nextPageToken"]
|
258 |
+
result = (
|
259 |
+
self.service.users().messages().list(
|
260 |
+
userId="me", q=query, maxResults=max_results, pageToken=page_token).execute()
|
261 |
+
)
|
262 |
+
if "messages" in result:
|
263 |
+
messages.extend(result["messages"])
|
264 |
+
return messages
|
265 |
+
|
266 |
+
def get(self, query, max_results=10) -> list:
|
267 |
+
"""
|
268 |
+
Retrieve a list of emails with subject, to, from, cc, and content.
|
269 |
+
|
270 |
+
Args:
|
271 |
+
mailservice: Authenticated Gmail API service instance
|
272 |
+
max_results: Maximum number of emails to retrieve
|
273 |
+
|
274 |
+
Returns:
|
275 |
+
List of dictionaries containing email details
|
276 |
+
"""
|
277 |
+
try:
|
278 |
+
messages = self.search(query, max_results)
|
279 |
+
email_list = []
|
280 |
+
if not messages:
|
281 |
+
return email_list
|
282 |
+
for message in messages:
|
283 |
+
msg = self.service.users().messages().get(
|
284 |
+
userId='me', id=message['id'], format='full').execute()
|
285 |
+
headers = msg['payload']['headers']
|
286 |
+
email_data = {
|
287 |
+
'subject': '',
|
288 |
+
'from': '',
|
289 |
+
'to': '',
|
290 |
+
'cc': '',
|
291 |
+
'content': '',
|
292 |
+
'snippet': msg['snippet'] if 'snippet' in msg else '',
|
293 |
+
}
|
294 |
+
for header in headers:
|
295 |
+
name = header['name'].lower()
|
296 |
+
if name == 'subject':
|
297 |
+
email_data['subject'] = header['value']
|
298 |
+
elif name == 'from':
|
299 |
+
email_data['from'] = header['value']
|
300 |
+
elif name == 'to':
|
301 |
+
email_data['to'] = header['value']
|
302 |
+
elif name == 'cc':
|
303 |
+
email_data['cc'] = header['value']
|
304 |
+
if 'parts' in msg['payload']:
|
305 |
+
for part in msg['payload']['parts']:
|
306 |
+
if part['mimeType'] == 'text/plain':
|
307 |
+
email_data['content'] = base64.urlsafe_b64decode(
|
308 |
+
part['body']['data']).decode('utf-8')
|
309 |
+
break
|
310 |
+
elif part['mimeType'] == 'text/html':
|
311 |
+
email_data['content'] = base64.urlsafe_b64decode(
|
312 |
+
part['body']['data']).decode('utf-8')
|
313 |
+
break
|
314 |
+
elif 'data' in msg['payload']['body']:
|
315 |
+
email_data['content'] = base64.urlsafe_b64decode(
|
316 |
+
msg['payload']['body']['data']).decode('utf-8')
|
317 |
+
email_list.append(email_data)
|
318 |
+
return email_list
|
319 |
+
|
320 |
+
except (KeyError, ValueError, TypeError) as e:
|
321 |
+
logger.info("An error occurred: %s", e)
|
322 |
+
return []
|