Refactor API routing to integrate new mail and content modules; remove obsolete main router
Browse files- controllers/mail.py +69 -38
- main.py +3 -2
- retriever/__init__.py +1 -2
- router/{main.py → content.py} +13 -0
- router/mail.py +34 -0
- token.pickle +0 -0
controllers/mail.py
CHANGED
@@ -39,7 +39,37 @@ def search_emails(query):
|
|
39 |
return messages
|
40 |
|
41 |
def list_emails(messages):
|
42 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
ids = []
|
44 |
documents = []
|
45 |
for message in messages:
|
@@ -57,30 +87,29 @@ def list_emails(messages):
|
|
57 |
metadata['date'] = datetime.fromtimestamp(
|
58 |
int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
|
59 |
metadata['msg_id'] = msg['id']
|
60 |
-
print(metadata)
|
61 |
-
print(msg['payload']['mimeType'])
|
62 |
-
# body = ""
|
63 |
ids = []
|
64 |
documents = []
|
|
|
65 |
if msg['payload']['mimeType'] in ['multipart/alternative', 'multipart/related', 'multipart/mixed']:
|
66 |
-
|
67 |
-
|
68 |
for part in msg['payload']['parts']:
|
69 |
-
print("
|
70 |
-
|
71 |
-
if part['mimeType'] == 'text/plain' and 'text/html' not in
|
72 |
body = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
|
73 |
body = re.sub(r'<[^>]+>', '', body) # Remove HTML tags
|
74 |
-
metadata['
|
75 |
documents.append(Document(
|
76 |
page_content=body,
|
77 |
metadata=metadata
|
78 |
))
|
79 |
ids.append(msg['id'])
|
80 |
-
elif part['mimeType'] == 'text/html' and 'text/plain' not in
|
81 |
body = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
|
82 |
body = re.sub(r'<[^>]+>', '', body)
|
83 |
-
metadata['
|
84 |
documents.append(Document(
|
85 |
page_content=body,
|
86 |
metadata=metadata
|
@@ -96,24 +125,22 @@ def list_emails(messages):
|
|
96 |
with open(path, 'wb') as f:
|
97 |
f.write(file_data)
|
98 |
if part['mimeType'] == 'application/pdf':
|
99 |
-
# attach_docs = attach_docs + PyPDFLoader(path).load()
|
100 |
attach_docs = PyPDFLoader(path).load()
|
101 |
elif part['mimeType'] == 'image/png' or part['mimeType'] == 'image/jpeg':
|
102 |
-
# attach_docs = attach_docs + UnstructuredImageLoader(path).load()
|
103 |
attach_docs = UnstructuredImageLoader(path).load()
|
104 |
elif part['filename'].endswith('.csv'):
|
105 |
-
# attach_docs = attach_docs + CSVLoader(path).load()
|
106 |
attach_docs = CSVLoader(path).load()
|
107 |
elif part['mimeType'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
|
108 |
-
# attach_docs = attach_docs + UnstructuredExcelLoader(path).load()
|
109 |
attach_docs = UnstructuredExcelLoader(path).load()
|
110 |
elif part['mimeType'] == 'application/ics':
|
111 |
with open(path, 'r', encoding='utf-8') as f:
|
112 |
calendar = Calendar(f.read())
|
113 |
for event in calendar.events:
|
114 |
-
|
115 |
page_content = f"Event: {event.name}\nDescription: {event.description}\nStart: {event.begin}\nEnd: {event.end}",
|
116 |
metadata = {
|
|
|
|
|
117 |
"location": event.location,
|
118 |
"created": event.created.strftime("%d/%m/%Y %H:%M:%S"),
|
119 |
"last_modified": event.last_modified.strftime("%d/%m/%Y %H:%M:%S"),
|
@@ -121,34 +148,22 @@ def list_emails(messages):
|
|
121 |
"end": event.end.strftime("%d/%m/%Y %H:%M:%S")
|
122 |
}
|
123 |
))
|
|
|
124 |
if os.path.exists(path):
|
125 |
os.remove(path)
|
126 |
-
for index, document in enumerate(attach_docs):
|
127 |
-
|
128 |
-
print(document.metadata)
|
129 |
-
document.metadata['minetype'] = part['mimeType']
|
130 |
if 'page_label' in document.metadata:
|
131 |
document.metadata['page'] = document.metadata['page_label']
|
132 |
document.metadata['attachment'] = part['filename']
|
133 |
-
for key in
|
134 |
-
document.metadata.pop(key, None)
|
135 |
document.metadata.update(metadata)
|
136 |
-
print(document.metadata)
|
137 |
-
print("-"*100)
|
138 |
documents.append(document)
|
139 |
-
ids.append(
|
140 |
-
# for index, document in enumerate(attach_docs):
|
141 |
-
# _id = f"{msg['id']}_{index}"
|
142 |
-
# # if 'source' in document.metadata:
|
143 |
-
# # document.metadata['source'] = document.metadata['source'].replace(f"./{ATTACHMENTS_DIR}/", "")
|
144 |
-
# # document.metadata['minetype'] = part['mimeType']
|
145 |
-
# document.metadata.update(metadata)
|
146 |
-
# documents.append(document)
|
147 |
-
# ids.append(_id)
|
148 |
elif msg['payload']['mimeType'] == 'text/plain' and 'data' in msg['payload']['body']:
|
149 |
body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
|
150 |
body = re.sub(r'<[^>]+>', '', body)
|
151 |
-
metadata['
|
152 |
documents.append(Document(
|
153 |
page_content=body,
|
154 |
metadata=metadata
|
@@ -157,16 +172,15 @@ def list_emails(messages):
|
|
157 |
elif msg['payload']['mimeType'] == 'text/html' and 'data' in msg['payload']['body']:
|
158 |
body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
|
159 |
body = re.sub(r'<[^>]+>', '', body)
|
160 |
-
metadata['
|
161 |
documents.append(Document(
|
162 |
page_content=body,
|
163 |
metadata=metadata
|
164 |
))
|
165 |
ids.append(msg['id'])
|
166 |
-
if 'multipart/alternative' in
|
167 |
print("Only multipart/alternative found in the email.")
|
168 |
else:
|
169 |
-
print(documents)
|
170 |
vectorstore.add_documents(documents=documents, ids=ids)
|
171 |
|
172 |
def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
|
@@ -210,3 +224,20 @@ def get_documents():
|
|
210 |
df = pd.concat(
|
211 |
[df.drop('metadatas', axis=1), df['metadatas'].apply(pd.Series)],
|
212 |
axis=1).to_excel('collection_data_expand.xlsx', index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
return messages
|
40 |
|
41 |
def list_emails(messages):
|
42 |
+
"""
|
43 |
+
Processes a list of email messages, extracts metadata, decodes content, and handles attachments.
|
44 |
+
|
45 |
+
Args:
|
46 |
+
messages (list): A list of email message dictionaries, where each dictionary contains
|
47 |
+
at least an 'id' key representing the email's unique identifier.
|
48 |
+
|
49 |
+
Returns:
|
50 |
+
None: The function processes the emails and adds the extracted documents to a vector store.
|
51 |
+
|
52 |
+
Functionality:
|
53 |
+
- Retrieves email details using the Gmail API.
|
54 |
+
- Extracts metadata such as sender, recipient, subject, CC, and date.
|
55 |
+
- Decodes email content in plain text or HTML format.
|
56 |
+
- Handles multipart emails, including attachments.
|
57 |
+
- Processes attachments based on their MIME type:
|
58 |
+
- PDF files are loaded using PyPDFLoader.
|
59 |
+
- Images (PNG, JPEG) are loaded using UnstructuredImageLoader.
|
60 |
+
- CSV files are loaded using CSVLoader.
|
61 |
+
- Excel files are loaded using UnstructuredExcelLoader.
|
62 |
+
- Calendar files (ICS) are parsed to extract event details.
|
63 |
+
- Removes HTML tags from email content.
|
64 |
+
- Stores processed documents and metadata in a vector store.
|
65 |
+
- Deletes temporary files created during attachment processing.
|
66 |
+
|
67 |
+
Notes:
|
68 |
+
- The function assumes the existence of a global `service` object for Gmail API interactions.
|
69 |
+
- The `vectorstore.add_documents` method is used to store the processed documents.
|
70 |
+
- Attachments are temporarily saved in a directory specified by `ATTACHMENTS_DIR` and deleted after processing.
|
71 |
+
- The function logs information about attachments being downloaded.
|
72 |
+
"""
|
73 |
ids = []
|
74 |
documents = []
|
75 |
for message in messages:
|
|
|
87 |
metadata['date'] = datetime.fromtimestamp(
|
88 |
int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
|
89 |
metadata['msg_id'] = msg['id']
|
90 |
+
print(metadata, msg['payload']['mimeType'])
|
|
|
|
|
91 |
ids = []
|
92 |
documents = []
|
93 |
+
mimeType = []
|
94 |
if msg['payload']['mimeType'] in ['multipart/alternative', 'multipart/related', 'multipart/mixed']:
|
95 |
+
mimeType = []
|
96 |
+
attach_docs = []
|
97 |
for part in msg['payload']['parts']:
|
98 |
+
print("mimeType: ", part['mimeType'])
|
99 |
+
mimeType.append(part['mimeType'])
|
100 |
+
if part['mimeType'] == 'text/plain' and 'text/html' not in mimeType:
|
101 |
body = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
|
102 |
body = re.sub(r'<[^>]+>', '', body) # Remove HTML tags
|
103 |
+
metadata['mimeType'] = part['mimeType']
|
104 |
documents.append(Document(
|
105 |
page_content=body,
|
106 |
metadata=metadata
|
107 |
))
|
108 |
ids.append(msg['id'])
|
109 |
+
elif part['mimeType'] == 'text/html' and 'text/plain' not in mimeType:
|
110 |
body = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
|
111 |
body = re.sub(r'<[^>]+>', '', body)
|
112 |
+
metadata['mimeType'] = part['mimeType']
|
113 |
documents.append(Document(
|
114 |
page_content=body,
|
115 |
metadata=metadata
|
|
|
125 |
with open(path, 'wb') as f:
|
126 |
f.write(file_data)
|
127 |
if part['mimeType'] == 'application/pdf':
|
|
|
128 |
attach_docs = PyPDFLoader(path).load()
|
129 |
elif part['mimeType'] == 'image/png' or part['mimeType'] == 'image/jpeg':
|
|
|
130 |
attach_docs = UnstructuredImageLoader(path).load()
|
131 |
elif part['filename'].endswith('.csv'):
|
|
|
132 |
attach_docs = CSVLoader(path).load()
|
133 |
elif part['mimeType'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
|
|
|
134 |
attach_docs = UnstructuredExcelLoader(path).load()
|
135 |
elif part['mimeType'] == 'application/ics':
|
136 |
with open(path, 'r', encoding='utf-8') as f:
|
137 |
calendar = Calendar(f.read())
|
138 |
for event in calendar.events:
|
139 |
+
documents.append(Document(
|
140 |
page_content = f"Event: {event.name}\nDescription: {event.description}\nStart: {event.begin}\nEnd: {event.end}",
|
141 |
metadata = {
|
142 |
+
"attachment": part['filename'],
|
143 |
+
"mimeType": part['mimeType'],
|
144 |
"location": event.location,
|
145 |
"created": event.created.strftime("%d/%m/%Y %H:%M:%S"),
|
146 |
"last_modified": event.last_modified.strftime("%d/%m/%Y %H:%M:%S"),
|
|
|
148 |
"end": event.end.strftime("%d/%m/%Y %H:%M:%S")
|
149 |
}
|
150 |
))
|
151 |
+
ids.append(f"{msg['id']}_{attachment_id}")
|
152 |
if os.path.exists(path):
|
153 |
os.remove(path)
|
154 |
+
for index, document in enumerate(attach_docs or []):
|
155 |
+
document.metadata['mimeType'] = part['mimeType']
|
|
|
|
|
156 |
if 'page_label' in document.metadata:
|
157 |
document.metadata['page'] = document.metadata['page_label']
|
158 |
document.metadata['attachment'] = part['filename']
|
159 |
+
document.metadata = {key: value for key, value in document.metadata.items() if key in ['attachment', 'page']}
|
|
|
160 |
document.metadata.update(metadata)
|
|
|
|
|
161 |
documents.append(document)
|
162 |
+
ids.append(f"{msg['id']}_{attachment_id}_{index}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
elif msg['payload']['mimeType'] == 'text/plain' and 'data' in msg['payload']['body']:
|
164 |
body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
|
165 |
body = re.sub(r'<[^>]+>', '', body)
|
166 |
+
metadata['mimeType'] = msg['payload']['mimeType']
|
167 |
documents.append(Document(
|
168 |
page_content=body,
|
169 |
metadata=metadata
|
|
|
172 |
elif msg['payload']['mimeType'] == 'text/html' and 'data' in msg['payload']['body']:
|
173 |
body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
|
174 |
body = re.sub(r'<[^>]+>', '', body)
|
175 |
+
metadata['mimeType'] = msg['payload']['mimeType']
|
176 |
documents.append(Document(
|
177 |
page_content=body,
|
178 |
metadata=metadata
|
179 |
))
|
180 |
ids.append(msg['id'])
|
181 |
+
if 'multipart/alternative' in mimeType and len(mimeType) == 1:
|
182 |
print("Only multipart/alternative found in the email.")
|
183 |
else:
|
|
|
184 |
vectorstore.add_documents(documents=documents, ids=ids)
|
185 |
|
186 |
def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
|
|
|
224 |
df = pd.concat(
|
225 |
[df.drop('metadatas', axis=1), df['metadatas'].apply(pd.Series)],
|
226 |
axis=1).to_excel('collection_data_expand.xlsx', index=False)
|
227 |
+
|
228 |
+
def get():
|
229 |
+
"""
|
230 |
+
Main function to list emails from the database.
|
231 |
+
|
232 |
+
This function lists all emails stored in the database.
|
233 |
+
|
234 |
+
Returns:
|
235 |
+
None
|
236 |
+
"""
|
237 |
+
data = vectorstore.get()
|
238 |
+
df = pd.DataFrame({
|
239 |
+
'id': data['ids'],
|
240 |
+
'documents': data['documents'],
|
241 |
+
'metadatas': data['metadatas']
|
242 |
+
})
|
243 |
+
return df.to_dict(orient='records')
|
main.py
CHANGED
@@ -2,12 +2,13 @@
|
|
2 |
import logging
|
3 |
from fastapi import FastAPI
|
4 |
from fastapi.middleware.cors import CORSMiddleware
|
5 |
-
from router import
|
6 |
|
7 |
|
8 |
app = FastAPI(docs_url="/")
|
9 |
|
10 |
-
app.include_router(
|
|
|
11 |
|
12 |
origins = [
|
13 |
"*"
|
|
|
2 |
import logging
|
3 |
from fastapi import FastAPI
|
4 |
from fastapi.middleware.cors import CORSMiddleware
|
5 |
+
from router import content, mail
|
6 |
|
7 |
|
8 |
app = FastAPI(docs_url="/")
|
9 |
|
10 |
+
app.include_router(content.router, tags=["content"])
|
11 |
+
app.include_router(mail.router, tags=["mail"])
|
12 |
|
13 |
origins = [
|
14 |
"*"
|
retriever/__init__.py
CHANGED
@@ -1,9 +1,7 @@
|
|
1 |
"""Module for retrievers that fetch documents from various sources."""
|
2 |
-
from importlib import metadata
|
3 |
from venv import logger
|
4 |
from langchain_core.retrievers import BaseRetriever
|
5 |
from langchain_core.vectorstores import VectorStoreRetriever
|
6 |
-
from langchain_core.documents import Document
|
7 |
from models.chroma import vectorstore
|
8 |
|
9 |
class DocRetriever(BaseRetriever):
|
@@ -47,6 +45,7 @@ class DocRetriever(BaseRetriever):
|
|
47 |
retrieved_docs = self.retriever.invoke(query)
|
48 |
# doc_lst = []
|
49 |
for doc in retrieved_docs:
|
|
|
50 |
# date = str(doc.metadata['publishDate'])
|
51 |
doc.metadata['content'] = doc.page_content
|
52 |
# doc_lst.append(Document(
|
|
|
1 |
"""Module for retrievers that fetch documents from various sources."""
|
|
|
2 |
from venv import logger
|
3 |
from langchain_core.retrievers import BaseRetriever
|
4 |
from langchain_core.vectorstores import VectorStoreRetriever
|
|
|
5 |
from models.chroma import vectorstore
|
6 |
|
7 |
class DocRetriever(BaseRetriever):
|
|
|
45 |
retrieved_docs = self.retriever.invoke(query)
|
46 |
# doc_lst = []
|
47 |
for doc in retrieved_docs:
|
48 |
+
doc.metadata['id'] = doc.id
|
49 |
# date = str(doc.metadata['publishDate'])
|
50 |
doc.metadata['content'] = doc.page_content
|
51 |
# doc_lst.append(Document(
|
router/{main.py → content.py}
RENAMED
@@ -19,6 +19,19 @@ async def stream(query: ReqData):
|
|
19 |
"""
|
20 |
return StreamingResponse(generate(query), media_type='text/event-stream')
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
# # @router.post("/followup")
|
23 |
# # def follow_up(req: ReqFollowUp):
|
24 |
# # """
|
|
|
19 |
"""
|
20 |
return StreamingResponse(generate(query), media_type='text/event-stream')
|
21 |
|
22 |
+
# @router.post("/list")
|
23 |
+
# def chat(query: ReqData):
|
24 |
+
# """
|
25 |
+
# Handles the chat POST request.
|
26 |
+
|
27 |
+
# Args:
|
28 |
+
# query (ReqData): The request data containing the query parameters.
|
29 |
+
|
30 |
+
# Returns:
|
31 |
+
# str: The generated response from the chat function.
|
32 |
+
# """
|
33 |
+
# return generate(query)
|
34 |
+
|
35 |
# # @router.post("/followup")
|
36 |
# # def follow_up(req: ReqFollowUp):
|
37 |
# # """
|
router/mail.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Module for defining the main routes of the API."""
|
2 |
+
from fastapi import APIRouter
|
3 |
+
from fastapi.responses import JSONResponse
|
4 |
+
from controllers import mail
|
5 |
+
|
6 |
+
router = APIRouter(prefix="/mail", tags=["Mail"])
|
7 |
+
|
8 |
+
@router.post("")
|
9 |
+
def collect():
|
10 |
+
"""
|
11 |
+
Handles the chat POST request.
|
12 |
+
|
13 |
+
Args:
|
14 |
+
query (ReqData): The request data containing the query parameters.
|
15 |
+
|
16 |
+
Returns:
|
17 |
+
str: The generated response from the chat function.
|
18 |
+
"""
|
19 |
+
mail.collect()
|
20 |
+
return JSONResponse(content={"message": "Mail collected successfully."})
|
21 |
+
|
22 |
+
@router.get("")
|
23 |
+
def get():
|
24 |
+
"""
|
25 |
+
Handles the chat POST request.
|
26 |
+
|
27 |
+
Args:
|
28 |
+
query (ReqData): The request data containing the query parameters.
|
29 |
+
|
30 |
+
Returns:
|
31 |
+
str: The generated response from the chat function.
|
32 |
+
"""
|
33 |
+
result = mail.get()
|
34 |
+
return JSONResponse(content={"message": result})
|
token.pickle
CHANGED
Binary files a/token.pickle and b/token.pickle differ
|
|