gavinzli commited on
Commit
e3a12d5
·
1 Parent(s): 8470dde

Refactor API routing to integrate new mail and content modules; remove obsolete main router

Browse files
controllers/mail.py CHANGED
@@ -39,7 +39,37 @@ def search_emails(query):
39
  return messages
40
 
41
  def list_emails(messages):
42
- """List emails from the search results and download attachments."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  ids = []
44
  documents = []
45
  for message in messages:
@@ -57,30 +87,29 @@ def list_emails(messages):
57
  metadata['date'] = datetime.fromtimestamp(
58
  int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
59
  metadata['msg_id'] = msg['id']
60
- print(metadata)
61
- print(msg['payload']['mimeType'])
62
- # body = ""
63
  ids = []
64
  documents = []
 
65
  if msg['payload']['mimeType'] in ['multipart/alternative', 'multipart/related', 'multipart/mixed']:
66
- minetype = []
67
- # attach_docs = []
68
  for part in msg['payload']['parts']:
69
- print("minetype: ", part['mimeType'])
70
- minetype.append(part['mimeType'])
71
- if part['mimeType'] == 'text/plain' and 'text/html' not in minetype:
72
  body = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
73
  body = re.sub(r'<[^>]+>', '', body) # Remove HTML tags
74
- metadata['minetype'] = part['mimeType']
75
  documents.append(Document(
76
  page_content=body,
77
  metadata=metadata
78
  ))
79
  ids.append(msg['id'])
80
- elif part['mimeType'] == 'text/html' and 'text/plain' not in minetype:
81
  body = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
82
  body = re.sub(r'<[^>]+>', '', body)
83
- metadata['minetype'] = part['mimeType']
84
  documents.append(Document(
85
  page_content=body,
86
  metadata=metadata
@@ -96,24 +125,22 @@ def list_emails(messages):
96
  with open(path, 'wb') as f:
97
  f.write(file_data)
98
  if part['mimeType'] == 'application/pdf':
99
- # attach_docs = attach_docs + PyPDFLoader(path).load()
100
  attach_docs = PyPDFLoader(path).load()
101
  elif part['mimeType'] == 'image/png' or part['mimeType'] == 'image/jpeg':
102
- # attach_docs = attach_docs + UnstructuredImageLoader(path).load()
103
  attach_docs = UnstructuredImageLoader(path).load()
104
  elif part['filename'].endswith('.csv'):
105
- # attach_docs = attach_docs + CSVLoader(path).load()
106
  attach_docs = CSVLoader(path).load()
107
  elif part['mimeType'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
108
- # attach_docs = attach_docs + UnstructuredExcelLoader(path).load()
109
  attach_docs = UnstructuredExcelLoader(path).load()
110
  elif part['mimeType'] == 'application/ics':
111
  with open(path, 'r', encoding='utf-8') as f:
112
  calendar = Calendar(f.read())
113
  for event in calendar.events:
114
- attach_docs.append(Document(
115
  page_content = f"Event: {event.name}\nDescription: {event.description}\nStart: {event.begin}\nEnd: {event.end}",
116
  metadata = {
 
 
117
  "location": event.location,
118
  "created": event.created.strftime("%d/%m/%Y %H:%M:%S"),
119
  "last_modified": event.last_modified.strftime("%d/%m/%Y %H:%M:%S"),
@@ -121,34 +148,22 @@ def list_emails(messages):
121
  "end": event.end.strftime("%d/%m/%Y %H:%M:%S")
122
  }
123
  ))
 
124
  if os.path.exists(path):
125
  os.remove(path)
126
- for index, document in enumerate(attach_docs):
127
- _id = f"{msg['id']}_{attachment_id}_{index}"
128
- print(document.metadata)
129
- document.metadata['minetype'] = part['mimeType']
130
  if 'page_label' in document.metadata:
131
  document.metadata['page'] = document.metadata['page_label']
132
  document.metadata['attachment'] = part['filename']
133
- for key in ['creationdate', 'total_pages', 'creator', 'producer', 'moddate', 'page_label', 'source']:
134
- document.metadata.pop(key, None)
135
  document.metadata.update(metadata)
136
- print(document.metadata)
137
- print("-"*100)
138
  documents.append(document)
139
- ids.append(_id)
140
- # for index, document in enumerate(attach_docs):
141
- # _id = f"{msg['id']}_{index}"
142
- # # if 'source' in document.metadata:
143
- # # document.metadata['source'] = document.metadata['source'].replace(f"./{ATTACHMENTS_DIR}/", "")
144
- # # document.metadata['minetype'] = part['mimeType']
145
- # document.metadata.update(metadata)
146
- # documents.append(document)
147
- # ids.append(_id)
148
  elif msg['payload']['mimeType'] == 'text/plain' and 'data' in msg['payload']['body']:
149
  body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
150
  body = re.sub(r'<[^>]+>', '', body)
151
- metadata['minetype'] = msg['payload']['mimeType']
152
  documents.append(Document(
153
  page_content=body,
154
  metadata=metadata
@@ -157,16 +172,15 @@ def list_emails(messages):
157
  elif msg['payload']['mimeType'] == 'text/html' and 'data' in msg['payload']['body']:
158
  body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
159
  body = re.sub(r'<[^>]+>', '', body)
160
- metadata['minetype'] = msg['payload']['mimeType']
161
  documents.append(Document(
162
  page_content=body,
163
  metadata=metadata
164
  ))
165
  ids.append(msg['id'])
166
- if 'multipart/alternative' in minetype and len(minetype) == 1:
167
  print("Only multipart/alternative found in the email.")
168
  else:
169
- print(documents)
170
  vectorstore.add_documents(documents=documents, ids=ids)
171
 
172
  def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
@@ -210,3 +224,20 @@ def get_documents():
210
  df = pd.concat(
211
  [df.drop('metadatas', axis=1), df['metadatas'].apply(pd.Series)],
212
  axis=1).to_excel('collection_data_expand.xlsx', index=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  return messages
40
 
41
  def list_emails(messages):
42
+ """
43
+ Processes a list of email messages, extracts metadata, decodes content, and handles attachments.
44
+
45
+ Args:
46
+ messages (list): A list of email message dictionaries, where each dictionary contains
47
+ at least an 'id' key representing the email's unique identifier.
48
+
49
+ Returns:
50
+ None: The function processes the emails and adds the extracted documents to a vector store.
51
+
52
+ Functionality:
53
+ - Retrieves email details using the Gmail API.
54
+ - Extracts metadata such as sender, recipient, subject, CC, and date.
55
+ - Decodes email content in plain text or HTML format.
56
+ - Handles multipart emails, including attachments.
57
+ - Processes attachments based on their MIME type:
58
+ - PDF files are loaded using PyPDFLoader.
59
+ - Images (PNG, JPEG) are loaded using UnstructuredImageLoader.
60
+ - CSV files are loaded using CSVLoader.
61
+ - Excel files are loaded using UnstructuredExcelLoader.
62
+ - Calendar files (ICS) are parsed to extract event details.
63
+ - Removes HTML tags from email content.
64
+ - Stores processed documents and metadata in a vector store.
65
+ - Deletes temporary files created during attachment processing.
66
+
67
+ Notes:
68
+ - The function assumes the existence of a global `service` object for Gmail API interactions.
69
+ - The `vectorstore.add_documents` method is used to store the processed documents.
70
+ - Attachments are temporarily saved in a directory specified by `ATTACHMENTS_DIR` and deleted after processing.
71
+ - The function logs information about attachments being downloaded.
72
+ """
73
  ids = []
74
  documents = []
75
  for message in messages:
 
87
  metadata['date'] = datetime.fromtimestamp(
88
  int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
89
  metadata['msg_id'] = msg['id']
90
+ print(metadata, msg['payload']['mimeType'])
 
 
91
  ids = []
92
  documents = []
93
+ mimeType = []
94
  if msg['payload']['mimeType'] in ['multipart/alternative', 'multipart/related', 'multipart/mixed']:
95
+ mimeType = []
96
+ attach_docs = []
97
  for part in msg['payload']['parts']:
98
+ print("mimeType: ", part['mimeType'])
99
+ mimeType.append(part['mimeType'])
100
+ if part['mimeType'] == 'text/plain' and 'text/html' not in mimeType:
101
  body = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
102
  body = re.sub(r'<[^>]+>', '', body) # Remove HTML tags
103
+ metadata['mimeType'] = part['mimeType']
104
  documents.append(Document(
105
  page_content=body,
106
  metadata=metadata
107
  ))
108
  ids.append(msg['id'])
109
+ elif part['mimeType'] == 'text/html' and 'text/plain' not in mimeType:
110
  body = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
111
  body = re.sub(r'<[^>]+>', '', body)
112
+ metadata['mimeType'] = part['mimeType']
113
  documents.append(Document(
114
  page_content=body,
115
  metadata=metadata
 
125
  with open(path, 'wb') as f:
126
  f.write(file_data)
127
  if part['mimeType'] == 'application/pdf':
 
128
  attach_docs = PyPDFLoader(path).load()
129
  elif part['mimeType'] == 'image/png' or part['mimeType'] == 'image/jpeg':
 
130
  attach_docs = UnstructuredImageLoader(path).load()
131
  elif part['filename'].endswith('.csv'):
 
132
  attach_docs = CSVLoader(path).load()
133
  elif part['mimeType'] == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
 
134
  attach_docs = UnstructuredExcelLoader(path).load()
135
  elif part['mimeType'] == 'application/ics':
136
  with open(path, 'r', encoding='utf-8') as f:
137
  calendar = Calendar(f.read())
138
  for event in calendar.events:
139
+ documents.append(Document(
140
  page_content = f"Event: {event.name}\nDescription: {event.description}\nStart: {event.begin}\nEnd: {event.end}",
141
  metadata = {
142
+ "attachment": part['filename'],
143
+ "mimeType": part['mimeType'],
144
  "location": event.location,
145
  "created": event.created.strftime("%d/%m/%Y %H:%M:%S"),
146
  "last_modified": event.last_modified.strftime("%d/%m/%Y %H:%M:%S"),
 
148
  "end": event.end.strftime("%d/%m/%Y %H:%M:%S")
149
  }
150
  ))
151
+ ids.append(f"{msg['id']}_{attachment_id}")
152
  if os.path.exists(path):
153
  os.remove(path)
154
+ for index, document in enumerate(attach_docs or []):
155
+ document.metadata['mimeType'] = part['mimeType']
 
 
156
  if 'page_label' in document.metadata:
157
  document.metadata['page'] = document.metadata['page_label']
158
  document.metadata['attachment'] = part['filename']
159
+ document.metadata = {key: value for key, value in document.metadata.items() if key in ['attachment', 'page']}
 
160
  document.metadata.update(metadata)
 
 
161
  documents.append(document)
162
+ ids.append(f"{msg['id']}_{attachment_id}_{index}")
 
 
 
 
 
 
 
 
163
  elif msg['payload']['mimeType'] == 'text/plain' and 'data' in msg['payload']['body']:
164
  body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
165
  body = re.sub(r'<[^>]+>', '', body)
166
+ metadata['mimeType'] = msg['payload']['mimeType']
167
  documents.append(Document(
168
  page_content=body,
169
  metadata=metadata
 
172
  elif msg['payload']['mimeType'] == 'text/html' and 'data' in msg['payload']['body']:
173
  body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
174
  body = re.sub(r'<[^>]+>', '', body)
175
+ metadata['mimeType'] = msg['payload']['mimeType']
176
  documents.append(Document(
177
  page_content=body,
178
  metadata=metadata
179
  ))
180
  ids.append(msg['id'])
181
+ if 'multipart/alternative' in mimeType and len(mimeType) == 1:
182
  print("Only multipart/alternative found in the email.")
183
  else:
 
184
  vectorstore.add_documents(documents=documents, ids=ids)
185
 
186
  def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
 
224
  df = pd.concat(
225
  [df.drop('metadatas', axis=1), df['metadatas'].apply(pd.Series)],
226
  axis=1).to_excel('collection_data_expand.xlsx', index=False)
227
+
228
+ def get():
229
+ """
230
+ Main function to list emails from the database.
231
+
232
+ This function lists all emails stored in the database.
233
+
234
+ Returns:
235
+ None
236
+ """
237
+ data = vectorstore.get()
238
+ df = pd.DataFrame({
239
+ 'id': data['ids'],
240
+ 'documents': data['documents'],
241
+ 'metadatas': data['metadatas']
242
+ })
243
+ return df.to_dict(orient='records')
main.py CHANGED
@@ -2,12 +2,13 @@
2
  import logging
3
  from fastapi import FastAPI
4
  from fastapi.middleware.cors import CORSMiddleware
5
- from router import main
6
 
7
 
8
  app = FastAPI(docs_url="/")
9
 
10
- app.include_router(main.router, tags=["content"])
 
11
 
12
  origins = [
13
  "*"
 
2
  import logging
3
  from fastapi import FastAPI
4
  from fastapi.middleware.cors import CORSMiddleware
5
+ from router import content, mail
6
 
7
 
8
  app = FastAPI(docs_url="/")
9
 
10
+ app.include_router(content.router, tags=["content"])
11
+ app.include_router(mail.router, tags=["mail"])
12
 
13
  origins = [
14
  "*"
retriever/__init__.py CHANGED
@@ -1,9 +1,7 @@
1
  """Module for retrievers that fetch documents from various sources."""
2
- from importlib import metadata
3
  from venv import logger
4
  from langchain_core.retrievers import BaseRetriever
5
  from langchain_core.vectorstores import VectorStoreRetriever
6
- from langchain_core.documents import Document
7
  from models.chroma import vectorstore
8
 
9
  class DocRetriever(BaseRetriever):
@@ -47,6 +45,7 @@ class DocRetriever(BaseRetriever):
47
  retrieved_docs = self.retriever.invoke(query)
48
  # doc_lst = []
49
  for doc in retrieved_docs:
 
50
  # date = str(doc.metadata['publishDate'])
51
  doc.metadata['content'] = doc.page_content
52
  # doc_lst.append(Document(
 
1
  """Module for retrievers that fetch documents from various sources."""
 
2
  from venv import logger
3
  from langchain_core.retrievers import BaseRetriever
4
  from langchain_core.vectorstores import VectorStoreRetriever
 
5
  from models.chroma import vectorstore
6
 
7
  class DocRetriever(BaseRetriever):
 
45
  retrieved_docs = self.retriever.invoke(query)
46
  # doc_lst = []
47
  for doc in retrieved_docs:
48
+ doc.metadata['id'] = doc.id
49
  # date = str(doc.metadata['publishDate'])
50
  doc.metadata['content'] = doc.page_content
51
  # doc_lst.append(Document(
router/{main.py → content.py} RENAMED
@@ -19,6 +19,19 @@ async def stream(query: ReqData):
19
  """
20
  return StreamingResponse(generate(query), media_type='text/event-stream')
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  # # @router.post("/followup")
23
  # # def follow_up(req: ReqFollowUp):
24
  # # """
 
19
  """
20
  return StreamingResponse(generate(query), media_type='text/event-stream')
21
 
22
+ # @router.post("/list")
23
+ # def chat(query: ReqData):
24
+ # """
25
+ # Handles the chat POST request.
26
+
27
+ # Args:
28
+ # query (ReqData): The request data containing the query parameters.
29
+
30
+ # Returns:
31
+ # str: The generated response from the chat function.
32
+ # """
33
+ # return generate(query)
34
+
35
  # # @router.post("/followup")
36
  # # def follow_up(req: ReqFollowUp):
37
  # # """
router/mail.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module for defining the main routes of the API."""
2
+ from fastapi import APIRouter
3
+ from fastapi.responses import JSONResponse
4
+ from controllers import mail
5
+
6
+ router = APIRouter(prefix="/mail", tags=["Mail"])
7
+
8
+ @router.post("")
9
+ def collect():
10
+ """
11
+ Handles the chat POST request.
12
+
13
+ Args:
14
+ query (ReqData): The request data containing the query parameters.
15
+
16
+ Returns:
17
+ str: The generated response from the chat function.
18
+ """
19
+ mail.collect()
20
+ return JSONResponse(content={"message": "Mail collected successfully."})
21
+
22
+ @router.get("")
23
+ def get():
24
+ """
25
+ Handles the chat POST request.
26
+
27
+ Args:
28
+ query (ReqData): The request data containing the query parameters.
29
+
30
+ Returns:
31
+ str: The generated response from the chat function.
32
+ """
33
+ result = mail.get()
34
+ return JSONResponse(content={"message": result})
token.pickle CHANGED
Binary files a/token.pickle and b/token.pickle differ