gavinzli commited on
Commit
8dae98c
·
1 Parent(s): 46459dd

Refactor email metadata handling in list_emails function and update vectorstore initialization with Pinecone integration

Browse files
Files changed (2) hide show
  1. app/controllers/mail.py +12 -10
  2. app/models/db/__init__.py +27 -9
app/controllers/mail.py CHANGED
@@ -77,11 +77,13 @@ def list_emails(service, messages):
77
  for message in messages:
78
  msg = service.users().messages().get(userId="me", id=message["id"], format="full").execute()
79
  metadata = {}
80
- metadata["msg_id"] = f"{msg['threadId']}-{msg['id']}"
81
- for docstore_id in list(vectorstore.index_to_docstore_id.values()):
82
- if docstore_id.startswith(metadata["msg_id"]):
83
- logger.info("Already indexed: %s", metadata["msg_id"])
84
- continue
 
 
85
  for header in msg["payload"]["headers"]:
86
  if header["name"] == "From":
87
  metadata["from"] = header["value"]
@@ -95,7 +97,7 @@ def list_emails(service, messages):
95
  metadata["date"] = datetime.fromtimestamp(int(msg["internalDate"]) / 1000).strftime(
96
  "%d/%m/%Y %H:%M:%S"
97
  )
98
- metadata["user_id"] = service.users().getProfile(userId="me").execute().get("emailAddress")
99
  # print(metadata, msg["payload"]["mimeType"])
100
  ids = []
101
  documents = []
@@ -170,7 +172,7 @@ def list_emails(service, messages):
170
  },
171
  )
172
  )
173
- ids.append(f"{metadata['msg_id']}-{part['filename']}-{hashlib.sha256(file_data).hexdigest()}")
174
  if os.path.exists(path):
175
  os.remove(path)
176
  for index, document in enumerate(attach_docs or []):
@@ -185,19 +187,19 @@ def list_emails(service, messages):
185
  }
186
  document.metadata.update(metadata)
187
  documents.append(document)
188
- ids.append(f"{metadata['msg_id']}-{hashlib.sha256(file_data).hexdigest()}-{index}")
189
  elif msg["payload"]["mimeType"] == "text/plain" and "data" in msg["payload"]["body"]:
190
  body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
191
  body = re.sub(r"<[^>]+>", "", body)
192
  metadata["mimeType"] = msg["payload"]["mimeType"]
193
  documents.append(Document(page_content=body, metadata=metadata))
194
- ids.append(metadata["msg_id"])
195
  elif msg["payload"]["mimeType"] == "text/html" and "data" in msg["payload"]["body"]:
196
  body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
197
  body = re.sub(r"<[^>]+>", "", body)
198
  metadata["mimeType"] = msg["payload"]["mimeType"]
199
  documents.append(Document(page_content=body, metadata=metadata))
200
- ids.append(metadata["msg_id"])
201
  if "multipart/alternative" in mime_types and len(mime_types) == 1:
202
  print("Only multipart/alternative found in the email.")
203
  else:
 
77
  for message in messages:
78
  msg = service.users().messages().get(userId="me", id=message["id"], format="full").execute()
79
  metadata = {}
80
+ metadata["threadId"] = msg["threadId"]
81
+ metadata["msgId"] = msg["id"]
82
+ msgId = f"{msg['threadId']}-{msg['id']}"
83
+ # for docstore_id in list(vectorstore.index_to_docstore_id.values()):
84
+ # if docstore_id.startswith(msgId):
85
+ # logger.info("Already indexed: %s", msgId)
86
+ # continue
87
  for header in msg["payload"]["headers"]:
88
  if header["name"] == "From":
89
  metadata["from"] = header["value"]
 
97
  metadata["date"] = datetime.fromtimestamp(int(msg["internalDate"]) / 1000).strftime(
98
  "%d/%m/%Y %H:%M:%S"
99
  )
100
+ metadata["userId"] = service.users().getProfile(userId="me").execute().get("emailAddress")
101
  # print(metadata, msg["payload"]["mimeType"])
102
  ids = []
103
  documents = []
 
172
  },
173
  )
174
  )
175
+ ids.append(f"{msgId}-{part['filename']}-{hashlib.sha256(file_data).hexdigest()}")
176
  if os.path.exists(path):
177
  os.remove(path)
178
  for index, document in enumerate(attach_docs or []):
 
187
  }
188
  document.metadata.update(metadata)
189
  documents.append(document)
190
+ ids.append(f"{msgId}-{hashlib.sha256(file_data).hexdigest()}-{index}")
191
  elif msg["payload"]["mimeType"] == "text/plain" and "data" in msg["payload"]["body"]:
192
  body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
193
  body = re.sub(r"<[^>]+>", "", body)
194
  metadata["mimeType"] = msg["payload"]["mimeType"]
195
  documents.append(Document(page_content=body, metadata=metadata))
196
+ ids.append(msgId)
197
  elif msg["payload"]["mimeType"] == "text/html" and "data" in msg["payload"]["body"]:
198
  body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
199
  body = re.sub(r"<[^>]+>", "", body)
200
  metadata["mimeType"] = msg["payload"]["mimeType"]
201
  documents.append(Document(page_content=body, metadata=metadata))
202
+ ids.append(msgId)
203
  if "multipart/alternative" in mime_types and len(mime_types) == 1:
204
  print("Only multipart/alternative found in the email.")
205
  else:
app/models/db/__init__.py CHANGED
@@ -1,14 +1,32 @@
1
  """This module is responsible for initializing the database connection and creating the necessary tables."""
2
- import faiss
3
- from langchain_community.vectorstores import FAISS
4
- from langchain_community.docstore.in_memory import InMemoryDocstore
 
 
5
  from models.llm import EmbeddingsModel
6
 
7
  embeddings = EmbeddingsModel("all-MiniLM-L6-v2")
8
 
9
- vectorstore = FAISS(
10
- embedding_function=embeddings,
11
- index=faiss.IndexFlatL2(len(embeddings.embed_query("hello world"))),
12
- docstore=InMemoryDocstore(),
13
- index_to_docstore_id={}
14
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """This module is responsible for initializing the database connection and creating the necessary tables."""
2
+ # import faiss
3
+ from pinecone import Pinecone, ServerlessSpec
4
+ # from langchain_community.vectorstores import FAISS
5
+ # from langchain_community.docstore.in_memory import InMemoryDocstore
6
+ from langchain_pinecone import PineconeVectorStore
7
  from models.llm import EmbeddingsModel
8
 
9
  embeddings = EmbeddingsModel("all-MiniLM-L6-v2")
10
 
11
+ # vectorstore = FAISS(
12
+ # embedding_function=embeddings,
13
+ # index=faiss.IndexFlatL2(len(embeddings.embed_query("hello world"))),
14
+ # docstore=InMemoryDocstore(),
15
+ # index_to_docstore_id={}
16
+ # )
17
+
18
+ pc = Pinecone()
19
+ index_name = "mails"
20
+ embedding_dim = len(embeddings.embed_query("hello world"))
21
+ if not pc.has_index(index_name):
22
+ pc.create_index(
23
+ name=index_name,
24
+ dimension=embedding_dim, # Replace with your model dimensions
25
+ metric="cosine", # Replace with your model metric
26
+ spec=ServerlessSpec(
27
+ cloud="aws",
28
+ region="us-east-1"
29
+ )
30
+ )
31
+ index = pc.Index(index_name)
32
+ vectorstore = PineconeVectorStore(index=index, embedding=embeddings)