Spaces:

Oxbridge-Economics
/

Mailbox

Sleeping

App Files Files Community

gavinzli committed on Mar 29

Commit

ad04a72

1 Parent(s): 1031c5b

Refactor email processing in mail.py, enhance document handling, and remove obsolete binary files; update embedding model integration in llm module.

Browse files

Files changed (5) hide show

controllers/mail.py +17 -4
models/chroma/__init__.py +1 -43
models/llm/__init__.py +43 -0
test.py +2 -1
token.pickle +0 -0

controllers/mail.py CHANGED Viewed

@@ -54,7 +54,8 @@ def list_emails(messages):
                 metadata['cc'] = header['value']
         metadata['date'] = datetime.fromtimestamp(
             int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
-        logger.info(metadata, msg['id'])
         body = ""
         if 'parts' in msg['payload']:
             attachment_documents = []
@@ -74,13 +75,22 @@ def list_emails(messages):
                         attachment_documents = attachment_documents + UnstructuredImageLoader(path).load()
                     if part['filename'].endswith('.csv'):
                         attachment_documents = attachment_documents + CSVLoader(path).load()
             for index, document in enumerate(attachment_documents):
                 _id = f"{msg['id']}_{index}"
                 if 'source' in document.metadata:
                     document.metadata['source'] = document.metadata['source'].replace(f"./{ATTACHMENTS_DIR}/", "")
                 document.metadata.update(metadata)
                 ids.append(_id)
         else:
             body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
             body = re.sub(r'<[^>]+>', '', body)  # Remove HTML tags
             documents.append(Document(
@@ -88,7 +98,9 @@ def list_emails(messages):
                 metadata=metadata
             ))
             ids.append(msg['id'])
-    return vectorstore.add_documents(documents=documents, ids=ids)
 def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
     """
@@ -105,7 +117,7 @@ def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%
     if emails:
         print("Found %d emails:\n", len(emails))
         logger.info("Found %d emails after two_weeks_ago:\n", len(emails))
-        return f"{len(list_emails(emails))} emails added to the collection."
     else:
         logger.info("No emails found after two weeks ago.")
@@ -124,6 +136,7 @@ def get_documents():
         'documents': data['documents'],
         'metadatas': data['metadatas']
     })
     df = pd.concat(
         [df.drop('metadatas', axis=1), df['metadatas'].apply(pd.Series)],
-        axis=1).to_csv('collection_data.csv', index=False)

                 metadata['cc'] = header['value']
         metadata['date'] = datetime.fromtimestamp(
             int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
+        print.info(metadata, msg['id'])
+        print("-"*100)
         body = ""
         if 'parts' in msg['payload']:
             attachment_documents = []
                         attachment_documents = attachment_documents + UnstructuredImageLoader(path).load()
                     if part['filename'].endswith('.csv'):
                         attachment_documents = attachment_documents + CSVLoader(path).load()
+            ids = []
+            documents = []
             for index, document in enumerate(attachment_documents):
                 _id = f"{msg['id']}_{index}"
                 if 'source' in document.metadata:
                     document.metadata['source'] = document.metadata['source'].replace(f"./{ATTACHMENTS_DIR}/", "")
+                print(document.metadata)
                 document.metadata.update(metadata)
+                print(document.metadata)
                 ids.append(_id)
+                print(_id)
+                print("*"*100)
+                vectorstore.add_documents(documents=documents, ids=ids)
         else:
+            ids = []
+            documents = []
             body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
             body = re.sub(r'<[^>]+>', '', body)  # Remove HTML tags
             documents.append(Document(
                 metadata=metadata
             ))
             ids.append(msg['id'])
+            print(msg['id'])
+            print("!"*100)
+            vectorstore.add_documents(documents=documents, ids=ids)
 def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%m/%d')):
     """
     if emails:
         print("Found %d emails:\n", len(emails))
         logger.info("Found %d emails after two_weeks_ago:\n", len(emails))
+        return f"{len(emails)} emails added to the collection."
     else:
         logger.info("No emails found after two weeks ago.")
         'documents': data['documents'],
         'metadatas': data['metadatas']
     })
+    df.to_excel('collection_data.xlsx', index=False)
     df = pd.concat(
         [df.drop('metadatas', axis=1), df['metadatas'].apply(pd.Series)],
+        axis=1).to_excel('collection_data_expand.xlsx', index=False)

models/chroma/__init__.py CHANGED Viewed

@@ -1,48 +1,6 @@
 """Module for the Vector Database."""
-from typing import List
 from langchain_chroma import Chroma
-from langchain.embeddings.base import Embeddings
-from sentence_transformers import SentenceTransformer
-class EmbeddingsModel(Embeddings):
-    """
-    A model for generating embeddings using SentenceTransformer.
-    Attributes:
-        model (SentenceTransformer): The SentenceTransformer model used for generating embeddings.
-    """
-    def __init__(self, model_name: str):
-        """
-        Initializes the Chroma model with the specified model name.
-        Args:
-            model_name (str): The name of the model to be used for sentence transformation.
-        """
-        self.model = SentenceTransformer(model_name)
-    def embed_documents(self, documents: List[str]) -> List[List[float]]:
-        """
-        Embed a list of documents into a list of vectors.
-        Args:
-            documents (List[str]): A list of documents to be embedded.
-        Returns:
-            List[List[float]]: A list of vectors representing the embedded documents.
-        """
-        return self.model.encode(documents).tolist()
-    def embed_query(self, query: str) -> List[float]:
-        """
-        Embed a query string into a list of floats using the model's encoding.
-        Args:
-            query (str): The query string to be embedded.
-        Returns:
-            List[float]: The embedded representation of the query as a list of floats.
-        """
-        return self.model.encode([query]).tolist()[0]
 vectorstore = Chroma(
     embedding_function=EmbeddingsModel("all-MiniLM-L6-v2"),

 """Module for the Vector Database."""
 from langchain_chroma import Chroma
+from models.llm import EmbeddingsModel
 vectorstore = Chroma(
     embedding_function=EmbeddingsModel("all-MiniLM-L6-v2"),

models/llm/__init__.py CHANGED Viewed

@@ -1,6 +1,9 @@
 """Module for OpenAI model and embeddings."""
 import os
 import onnxruntime as ort
 from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
 from langchain_huggingface import HuggingFacePipeline
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
@@ -98,6 +101,46 @@ class HuggingfaceModel(HuggingFacePipeline):
             )
         )
 # model_name = "microsoft/phi-1_5"
 # tokenizer = AutoTokenizer.from_pretrained(model_name)
 # model = AutoModelForCausalLM.from_pretrained(model_name)

 """Module for OpenAI model and embeddings."""
 import os
+from typing import List
 import onnxruntime as ort
+from langchain.embeddings.base import Embeddings
+from sentence_transformers import SentenceTransformer
 from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
 from langchain_huggingface import HuggingFacePipeline
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
             )
         )
+class EmbeddingsModel(Embeddings):
+    """
+    A model for generating embeddings using SentenceTransformer.
+    Attributes:
+        model (SentenceTransformer): The SentenceTransformer model used for generating embeddings.
+    """
+    def __init__(self, model_name: str):
+        """
+        Initializes the Chroma model with the specified model name.
+        Args:
+            model_name (str): The name of the model to be used for sentence transformation.
+        """
+        self.model = SentenceTransformer(model_name)
+    def embed_documents(self, documents: List[str]) -> List[List[float]]:
+        """
+        Embed a list of documents into a list of vectors.
+        Args:
+            documents (List[str]): A list of documents to be embedded.
+        Returns:
+            List[List[float]]: A list of vectors representing the embedded documents.
+        """
+        return self.model.encode(documents).tolist()
+    def embed_query(self, query: str) -> List[float]:
+        """
+        Embed a query string into a list of floats using the model's encoding.
+        Args:
+            query (str): The query string to be embedded.
+        Returns:
+            List[float]: The embedded representation of the query as a list of floats.
+        """
+        return self.model.encode([query]).tolist()[0]
 # model_name = "microsoft/phi-1_5"
 # tokenizer = AutoTokenizer.from_pretrained(model_name)
 # model = AutoModelForCausalLM.from_pretrained(model_name)

test.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from controllers import mail
 if __name__ == "__main__":
-    mail.collect()

 from controllers import mail
 if __name__ == "__main__":
+    mail.collect()
+    mail.get_documents()

token.pickle CHANGED Viewed

Binary files a/token.pickle and b/token.pickle differ