gavinzli committed on
Commit
ad04a72
·
1 Parent(s): 1031c5b

Refactor email processing in mail.py, enhance document handling, and remove obsolete binary files; update embedding model integration in llm module.

Browse files
controllers/mail.py CHANGED
@@ -54,7 +54,8 @@ def list_emails(messages):
54
  metadata['cc'] = header['value']
55
  metadata['date'] = datetime.fromtimestamp(
56
  int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
57
- logger.info(metadata, msg['id'])
 
58
  body = ""
59
  if 'parts' in msg['payload']:
60
  attachment_documents = []
@@ -74,13 +75,22 @@ def list_emails(messages):
74
  attachment_documents = attachment_documents + UnstructuredImageLoader(path).load()
75
  if part['filename'].endswith('.csv'):
76
  attachment_documents = attachment_documents + CSVLoader(path).load()
 
 
77
  for index, document in enumerate(attachment_documents):
78
  _id = f"{msg['id']}_{index}"
79
  if 'source' in document.metadata:
80
  document.metadata['source'] = document.metadata['source'].replace(f"./{ATTACHMENTS_DIR}/", "")
 
81
  document.metadata.update(metadata)
 
82
  ids.append(_id)
 
 
 
83
  else:
 
 
84
  body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
85
  body = re.sub(r'<[^>]+>', '', body) # Remove HTML tags
86
  documents.append(Document(
@@ -88,7 +98,9 @@ def list_emails(messages):
88
  metadata=metadata
89
  ))
90
  ids.append(msg['id'])
91
- return vectorstore.add_documents(documents=documents, ids=ids)
 
 
92
 
93
  def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
94
  """
@@ -105,7 +117,7 @@ def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%
105
  if emails:
106
  print("Found %d emails:\n", len(emails))
107
  logger.info("Found %d emails after two_weeks_ago:\n", len(emails))
108
- return f"{len(list_emails(emails))} emails added to the collection."
109
  else:
110
  logger.info("No emails found after two weeks ago.")
111
 
@@ -124,6 +136,7 @@ def get_documents():
124
  'documents': data['documents'],
125
  'metadatas': data['metadatas']
126
  })
 
127
  df = pd.concat(
128
  [df.drop('metadatas', axis=1), df['metadatas'].apply(pd.Series)],
129
- axis=1).to_csv('collection_data.csv', index=False)
 
54
  metadata['cc'] = header['value']
55
  metadata['date'] = datetime.fromtimestamp(
56
  int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
57
+ print.info(metadata, msg['id'])
58
+ print("-"*100)
59
  body = ""
60
  if 'parts' in msg['payload']:
61
  attachment_documents = []
 
75
  attachment_documents = attachment_documents + UnstructuredImageLoader(path).load()
76
  if part['filename'].endswith('.csv'):
77
  attachment_documents = attachment_documents + CSVLoader(path).load()
78
+ ids = []
79
+ documents = []
80
  for index, document in enumerate(attachment_documents):
81
  _id = f"{msg['id']}_{index}"
82
  if 'source' in document.metadata:
83
  document.metadata['source'] = document.metadata['source'].replace(f"./{ATTACHMENTS_DIR}/", "")
84
+ print(document.metadata)
85
  document.metadata.update(metadata)
86
+ print(document.metadata)
87
  ids.append(_id)
88
+ print(_id)
89
+ print("*"*100)
90
+ vectorstore.add_documents(documents=documents, ids=ids)
91
  else:
92
+ ids = []
93
+ documents = []
94
  body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
95
  body = re.sub(r'<[^>]+>', '', body) # Remove HTML tags
96
  documents.append(Document(
 
98
  metadata=metadata
99
  ))
100
  ids.append(msg['id'])
101
+ print(msg['id'])
102
+ print("!"*100)
103
+ vectorstore.add_documents(documents=documents, ids=ids)
104
 
105
  def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
106
  """
 
117
  if emails:
118
  print("Found %d emails:\n", len(emails))
119
  logger.info("Found %d emails after two_weeks_ago:\n", len(emails))
120
+ return f"{len(emails)} emails added to the collection."
121
  else:
122
  logger.info("No emails found after two weeks ago.")
123
 
 
136
  'documents': data['documents'],
137
  'metadatas': data['metadatas']
138
  })
139
+ df.to_excel('collection_data.xlsx', index=False)
140
  df = pd.concat(
141
  [df.drop('metadatas', axis=1), df['metadatas'].apply(pd.Series)],
142
+ axis=1).to_excel('collection_data_expand.xlsx', index=False)
models/chroma/__init__.py CHANGED
@@ -1,48 +1,6 @@
1
  """Module for the Vector Database."""
2
- from typing import List
3
  from langchain_chroma import Chroma
4
- from langchain.embeddings.base import Embeddings
5
- from sentence_transformers import SentenceTransformer
6
-
7
- class EmbeddingsModel(Embeddings):
8
- """
9
- A model for generating embeddings using SentenceTransformer.
10
-
11
- Attributes:
12
- model (SentenceTransformer): The SentenceTransformer model used for generating embeddings.
13
- """
14
- def __init__(self, model_name: str):
15
- """
16
- Initializes the Chroma model with the specified model name.
17
-
18
- Args:
19
- model_name (str): The name of the model to be used for sentence transformation.
20
- """
21
- self.model = SentenceTransformer(model_name)
22
-
23
- def embed_documents(self, documents: List[str]) -> List[List[float]]:
24
- """
25
- Embed a list of documents into a list of vectors.
26
-
27
- Args:
28
- documents (List[str]): A list of documents to be embedded.
29
-
30
- Returns:
31
- List[List[float]]: A list of vectors representing the embedded documents.
32
- """
33
- return self.model.encode(documents).tolist()
34
-
35
- def embed_query(self, query: str) -> List[float]:
36
- """
37
- Embed a query string into a list of floats using the model's encoding.
38
-
39
- Args:
40
- query (str): The query string to be embedded.
41
-
42
- Returns:
43
- List[float]: The embedded representation of the query as a list of floats.
44
- """
45
- return self.model.encode([query]).tolist()[0]
46
 
47
  vectorstore = Chroma(
48
  embedding_function=EmbeddingsModel("all-MiniLM-L6-v2"),
 
1
  """Module for the Vector Database."""
 
2
  from langchain_chroma import Chroma
3
+ from models.llm import EmbeddingsModel
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  vectorstore = Chroma(
6
  embedding_function=EmbeddingsModel("all-MiniLM-L6-v2"),
models/llm/__init__.py CHANGED
@@ -1,6 +1,9 @@
1
  """Module for OpenAI model and embeddings."""
2
  import os
 
3
  import onnxruntime as ort
 
 
4
  from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
5
  from langchain_huggingface import HuggingFacePipeline
6
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
@@ -98,6 +101,46 @@ class HuggingfaceModel(HuggingFacePipeline):
98
  )
99
  )
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  # model_name = "microsoft/phi-1_5"
102
  # tokenizer = AutoTokenizer.from_pretrained(model_name)
103
  # model = AutoModelForCausalLM.from_pretrained(model_name)
 
1
  """Module for OpenAI model and embeddings."""
2
  import os
3
+ from typing import List
4
  import onnxruntime as ort
5
+ from langchain.embeddings.base import Embeddings
6
+ from sentence_transformers import SentenceTransformer
7
  from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
8
  from langchain_huggingface import HuggingFacePipeline
9
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
101
  )
102
  )
103
 
104
+ class EmbeddingsModel(Embeddings):
105
+ """
106
+ A model for generating embeddings using SentenceTransformer.
107
+
108
+ Attributes:
109
+ model (SentenceTransformer): The SentenceTransformer model used for generating embeddings.
110
+ """
111
+ def __init__(self, model_name: str):
112
+ """
113
+ Initializes the Chroma model with the specified model name.
114
+
115
+ Args:
116
+ model_name (str): The name of the model to be used for sentence transformation.
117
+ """
118
+ self.model = SentenceTransformer(model_name)
119
+
120
+ def embed_documents(self, documents: List[str]) -> List[List[float]]:
121
+ """
122
+ Embed a list of documents into a list of vectors.
123
+
124
+ Args:
125
+ documents (List[str]): A list of documents to be embedded.
126
+
127
+ Returns:
128
+ List[List[float]]: A list of vectors representing the embedded documents.
129
+ """
130
+ return self.model.encode(documents).tolist()
131
+
132
+ def embed_query(self, query: str) -> List[float]:
133
+ """
134
+ Embed a query string into a list of floats using the model's encoding.
135
+
136
+ Args:
137
+ query (str): The query string to be embedded.
138
+
139
+ Returns:
140
+ List[float]: The embedded representation of the query as a list of floats.
141
+ """
142
+ return self.model.encode([query]).tolist()[0]
143
+
144
  # model_name = "microsoft/phi-1_5"
145
  # tokenizer = AutoTokenizer.from_pretrained(model_name)
146
  # model = AutoModelForCausalLM.from_pretrained(model_name)
test.py CHANGED
@@ -1,4 +1,5 @@
1
  from controllers import mail
2
 
3
  if __name__ == "__main__":
4
- mail.collect()
 
 
1
  from controllers import mail
2
 
3
  if __name__ == "__main__":
4
+ mail.collect()
5
+ mail.get_documents()
token.pickle CHANGED
Binary files a/token.pickle and b/token.pickle differ