Spaces:
Sleeping
Sleeping
Refactor email processing in mail.py, enhance document handling, and remove obsolete binary files; update embedding model integration in llm module.
Browse files
- controllers/mail.py +17 -4
- models/chroma/__init__.py +1 -43
- models/llm/__init__.py +43 -0
- test.py +2 -1
- token.pickle +0 -0
controllers/mail.py
CHANGED
@@ -54,7 +54,8 @@ def list_emails(messages):
|
|
54 |
metadata['cc'] = header['value']
|
55 |
metadata['date'] = datetime.fromtimestamp(
|
56 |
int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
|
57 |
-
|
|
|
58 |
body = ""
|
59 |
if 'parts' in msg['payload']:
|
60 |
attachment_documents = []
|
@@ -74,13 +75,22 @@ def list_emails(messages):
|
|
74 |
attachment_documents = attachment_documents + UnstructuredImageLoader(path).load()
|
75 |
if part['filename'].endswith('.csv'):
|
76 |
attachment_documents = attachment_documents + CSVLoader(path).load()
|
|
|
|
|
77 |
for index, document in enumerate(attachment_documents):
|
78 |
_id = f"{msg['id']}_{index}"
|
79 |
if 'source' in document.metadata:
|
80 |
document.metadata['source'] = document.metadata['source'].replace(f"./{ATTACHMENTS_DIR}/", "")
|
|
|
81 |
document.metadata.update(metadata)
|
|
|
82 |
ids.append(_id)
|
|
|
|
|
|
|
83 |
else:
|
|
|
|
|
84 |
body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
|
85 |
body = re.sub(r'<[^>]+>', '', body) # Remove HTML tags
|
86 |
documents.append(Document(
|
@@ -88,7 +98,9 @@ def list_emails(messages):
|
|
88 |
metadata=metadata
|
89 |
))
|
90 |
ids.append(msg['id'])
|
91 |
-
|
|
|
|
|
92 |
|
93 |
def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
|
94 |
"""
|
@@ -105,7 +117,7 @@ def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%
|
|
105 |
if emails:
|
106 |
print("Found %d emails:\n", len(emails))
|
107 |
logger.info("Found %d emails after two_weeks_ago:\n", len(emails))
|
108 |
-
return f"{len(
|
109 |
else:
|
110 |
logger.info("No emails found after two weeks ago.")
|
111 |
|
@@ -124,6 +136,7 @@ def get_documents():
|
|
124 |
'documents': data['documents'],
|
125 |
'metadatas': data['metadatas']
|
126 |
})
|
|
|
127 |
df = pd.concat(
|
128 |
[df.drop('metadatas', axis=1), df['metadatas'].apply(pd.Series)],
|
129 |
-
axis=1).
|
|
|
54 |
metadata['cc'] = header['value']
|
55 |
metadata['date'] = datetime.fromtimestamp(
|
56 |
int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
|
57 |
+
print.info(metadata, msg['id'])
|
58 |
+
print("-"*100)
|
59 |
body = ""
|
60 |
if 'parts' in msg['payload']:
|
61 |
attachment_documents = []
|
|
|
75 |
attachment_documents = attachment_documents + UnstructuredImageLoader(path).load()
|
76 |
if part['filename'].endswith('.csv'):
|
77 |
attachment_documents = attachment_documents + CSVLoader(path).load()
|
78 |
+
ids = []
|
79 |
+
documents = []
|
80 |
for index, document in enumerate(attachment_documents):
|
81 |
_id = f"{msg['id']}_{index}"
|
82 |
if 'source' in document.metadata:
|
83 |
document.metadata['source'] = document.metadata['source'].replace(f"./{ATTACHMENTS_DIR}/", "")
|
84 |
+
print(document.metadata)
|
85 |
document.metadata.update(metadata)
|
86 |
+
print(document.metadata)
|
87 |
ids.append(_id)
|
88 |
+
print(_id)
|
89 |
+
print("*"*100)
|
90 |
+
vectorstore.add_documents(documents=documents, ids=ids)
|
91 |
else:
|
92 |
+
ids = []
|
93 |
+
documents = []
|
94 |
body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
|
95 |
body = re.sub(r'<[^>]+>', '', body) # Remove HTML tags
|
96 |
documents.append(Document(
|
|
|
98 |
metadata=metadata
|
99 |
))
|
100 |
ids.append(msg['id'])
|
101 |
+
print(msg['id'])
|
102 |
+
print("!"*100)
|
103 |
+
vectorstore.add_documents(documents=documents, ids=ids)
|
104 |
|
105 |
def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
|
106 |
"""
|
|
|
117 |
if emails:
|
118 |
print("Found %d emails:\n", len(emails))
|
119 |
logger.info("Found %d emails after two_weeks_ago:\n", len(emails))
|
120 |
+
return f"{len(emails)} emails added to the collection."
|
121 |
else:
|
122 |
logger.info("No emails found after two weeks ago.")
|
123 |
|
|
|
136 |
'documents': data['documents'],
|
137 |
'metadatas': data['metadatas']
|
138 |
})
|
139 |
+
df.to_excel('collection_data.xlsx', index=False)
|
140 |
df = pd.concat(
|
141 |
[df.drop('metadatas', axis=1), df['metadatas'].apply(pd.Series)],
|
142 |
+
axis=1).to_excel('collection_data_expand.xlsx', index=False)
|
models/chroma/__init__.py
CHANGED
@@ -1,48 +1,6 @@
|
|
1 |
"""Module for the Vector Database."""
|
2 |
-
from typing import List
|
3 |
from langchain_chroma import Chroma
|
4 |
-
from langchain.embeddings.base import Embeddings
|
5 |
-
from sentence_transformers import SentenceTransformer
|
6 |
-
|
7 |
-
class EmbeddingsModel(Embeddings):
|
8 |
-
"""
|
9 |
-
A model for generating embeddings using SentenceTransformer.
|
10 |
-
|
11 |
-
Attributes:
|
12 |
-
model (SentenceTransformer): The SentenceTransformer model used for generating embeddings.
|
13 |
-
"""
|
14 |
-
def __init__(self, model_name: str):
|
15 |
-
"""
|
16 |
-
Initializes the Chroma model with the specified model name.
|
17 |
-
|
18 |
-
Args:
|
19 |
-
model_name (str): The name of the model to be used for sentence transformation.
|
20 |
-
"""
|
21 |
-
self.model = SentenceTransformer(model_name)
|
22 |
-
|
23 |
-
def embed_documents(self, documents: List[str]) -> List[List[float]]:
|
24 |
-
"""
|
25 |
-
Embed a list of documents into a list of vectors.
|
26 |
-
|
27 |
-
Args:
|
28 |
-
documents (List[str]): A list of documents to be embedded.
|
29 |
-
|
30 |
-
Returns:
|
31 |
-
List[List[float]]: A list of vectors representing the embedded documents.
|
32 |
-
"""
|
33 |
-
return self.model.encode(documents).tolist()
|
34 |
-
|
35 |
-
def embed_query(self, query: str) -> List[float]:
|
36 |
-
"""
|
37 |
-
Embed a query string into a list of floats using the model's encoding.
|
38 |
-
|
39 |
-
Args:
|
40 |
-
query (str): The query string to be embedded.
|
41 |
-
|
42 |
-
Returns:
|
43 |
-
List[float]: The embedded representation of the query as a list of floats.
|
44 |
-
"""
|
45 |
-
return self.model.encode([query]).tolist()[0]
|
46 |
|
47 |
vectorstore = Chroma(
|
48 |
embedding_function=EmbeddingsModel("all-MiniLM-L6-v2"),
|
|
|
1 |
"""Module for the Vector Database."""
|
|
|
2 |
from langchain_chroma import Chroma
|
3 |
+
from models.llm import EmbeddingsModel
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
vectorstore = Chroma(
|
6 |
embedding_function=EmbeddingsModel("all-MiniLM-L6-v2"),
|
models/llm/__init__.py
CHANGED
@@ -1,6 +1,9 @@
|
|
1 |
"""Module for OpenAI model and embeddings."""
|
2 |
import os
|
|
|
3 |
import onnxruntime as ort
|
|
|
|
|
4 |
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
|
5 |
from langchain_huggingface import HuggingFacePipeline
|
6 |
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
@@ -98,6 +101,46 @@ class HuggingfaceModel(HuggingFacePipeline):
|
|
98 |
)
|
99 |
)
|
100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
# model_name = "microsoft/phi-1_5"
|
102 |
# tokenizer = AutoTokenizer.from_pretrained(model_name)
|
103 |
# model = AutoModelForCausalLM.from_pretrained(model_name)
|
|
|
1 |
"""Module for OpenAI model and embeddings."""
|
2 |
import os
|
3 |
+
from typing import List
|
4 |
import onnxruntime as ort
|
5 |
+
from langchain.embeddings.base import Embeddings
|
6 |
+
from sentence_transformers import SentenceTransformer
|
7 |
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
|
8 |
from langchain_huggingface import HuggingFacePipeline
|
9 |
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
|
|
101 |
)
|
102 |
)
|
103 |
|
104 |
+
class EmbeddingsModel(Embeddings):
|
105 |
+
"""
|
106 |
+
A model for generating embeddings using SentenceTransformer.
|
107 |
+
|
108 |
+
Attributes:
|
109 |
+
model (SentenceTransformer): The SentenceTransformer model used for generating embeddings.
|
110 |
+
"""
|
111 |
+
def __init__(self, model_name: str):
|
112 |
+
"""
|
113 |
+
Initializes the Chroma model with the specified model name.
|
114 |
+
|
115 |
+
Args:
|
116 |
+
model_name (str): The name of the model to be used for sentence transformation.
|
117 |
+
"""
|
118 |
+
self.model = SentenceTransformer(model_name)
|
119 |
+
|
120 |
+
def embed_documents(self, documents: List[str]) -> List[List[float]]:
|
121 |
+
"""
|
122 |
+
Embed a list of documents into a list of vectors.
|
123 |
+
|
124 |
+
Args:
|
125 |
+
documents (List[str]): A list of documents to be embedded.
|
126 |
+
|
127 |
+
Returns:
|
128 |
+
List[List[float]]: A list of vectors representing the embedded documents.
|
129 |
+
"""
|
130 |
+
return self.model.encode(documents).tolist()
|
131 |
+
|
132 |
+
def embed_query(self, query: str) -> List[float]:
|
133 |
+
"""
|
134 |
+
Embed a query string into a list of floats using the model's encoding.
|
135 |
+
|
136 |
+
Args:
|
137 |
+
query (str): The query string to be embedded.
|
138 |
+
|
139 |
+
Returns:
|
140 |
+
List[float]: The embedded representation of the query as a list of floats.
|
141 |
+
"""
|
142 |
+
return self.model.encode([query]).tolist()[0]
|
143 |
+
|
144 |
# model_name = "microsoft/phi-1_5"
|
145 |
# tokenizer = AutoTokenizer.from_pretrained(model_name)
|
146 |
# model = AutoModelForCausalLM.from_pretrained(model_name)
|
test.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
from controllers import mail
|
2 |
|
3 |
if __name__ == "__main__":
|
4 |
-
mail.collect()
|
|
|
|
1 |
from controllers import mail
|
2 |
|
3 |
if __name__ == "__main__":
|
4 |
+
mail.collect()
|
5 |
+
mail.get_documents()
|
token.pickle
CHANGED
Binary files a/token.pickle and b/token.pickle differ
|
|