gavinzli committed
Commit 1031c5b · 1 Parent(s): c7426d8

Add new test script, update email processing, and adjust retriever settings

Files changed (6)
  1. controllers/mail.py +8 -8
  2. retriever/__init__.py +20 -17
  3. router/main.py +2 -2
  4. test.py +4 -0
  5. token.pickle +0 -0
  6. utils.py +1 -0
controllers/mail.py CHANGED
@@ -40,7 +40,7 @@ def list_emails(messages):
     """List emails from the search results and download attachments."""
     ids = []
     documents = []
-    for message in messages[:100]:
+    for message in messages:
         msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
         metadata = {}
         for header in msg['payload']['headers']:
@@ -54,8 +54,10 @@ def list_emails(messages):
                 metadata['cc'] = header['value']
         metadata['date'] = datetime.fromtimestamp(
             int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
+        logger.info(metadata, msg['id'])
         body = ""
         if 'parts' in msg['payload']:
+            attachment_documents = []
             for part in msg['payload']['parts']:
                 if part['filename']:
                     attachment_id = part['body']['attachmentId']
@@ -67,15 +69,12 @@ def list_emails(messages):
                     with open(path, 'wb') as f:
                         f.write(file_data)
                     if part['filename'].endswith('.pdf'):
-                        attachment_documents = PyPDFLoader(path).load()
-                        documents = documents + attachment_documents
+                        attachment_documents = attachment_documents + PyPDFLoader(path).load()
                     if part['filename'].endswith('.png'):
-                        attachment_documents = UnstructuredImageLoader(path).load()
-                        documents = documents + attachment_documents
+                        attachment_documents = attachment_documents + UnstructuredImageLoader(path).load()
                     if part['filename'].endswith('.csv'):
-                        attachment_documents = CSVLoader(path).load()
-                        documents = documents + attachment_documents
-            for index, document in enumerate(documents):
+                        attachment_documents = attachment_documents + CSVLoader(path).load()
+            for index, document in enumerate(attachment_documents):
                 _id = f"{msg['id']}_{index}"
                 if 'source' in document.metadata:
                     document.metadata['source'] = document.metadata['source'].replace(f"./{ATTACHMENTS_DIR}/", "")
@@ -104,6 +103,7 @@ def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%
     """
     emails = search_emails(query)
     if emails:
+        print("Found %d emails:\n", len(emails))
         logger.info("Found %d emails after two_weeks_ago:\n", len(emails))
         return f"{len(list_emails(emails))} emails added to the collection."
     else:
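
The substantive change in list_emails is that attachment documents are now accumulated in a fresh attachment_documents list per message instead of being appended to the shared documents list, so the f"{msg['id']}_{index}" ids are computed per message rather than over everything collected so far. A minimal sketch of that per-message pattern; the import path and the helper names below are assumptions for illustration, not code from this repository:

# Sketch only: loader-by-extension plus per-message id assignment, mirroring the hunks above.
from langchain_community.document_loaders import CSVLoader, PyPDFLoader, UnstructuredImageLoader

def load_attachment_documents(path: str) -> list:
    """Pick a loader by file extension; unsupported types contribute nothing."""
    if path.endswith('.pdf'):
        return PyPDFLoader(path).load()
    if path.endswith('.png'):
        return UnstructuredImageLoader(path).load()
    if path.endswith('.csv'):
        return CSVLoader(path).load()
    return []

def index_attachments(msg_id: str, paths: list) -> dict:
    """Collect one message's attachment documents and key them as '<msg_id>_<index>'."""
    attachment_documents = []  # reset for every message, as in the updated list_emails
    for path in paths:
        attachment_documents = attachment_documents + load_attachment_documents(path)
    return {f"{msg_id}_{index}": doc for index, doc in enumerate(attachment_documents)}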
retriever/__init__.py CHANGED
@@ -1,4 +1,5 @@
 """Module for retrievers that fetch documents from various sources."""
+from importlib import metadata
 from venv import logger
 from langchain_core.retrievers import BaseRetriever
 from langchain_core.vectorstores import VectorStoreRetriever
@@ -23,9 +24,9 @@ class DocRetriever(BaseRetriever):
         list: A list of Document objects with relevant metadata.
     """
     retriever: VectorStoreRetriever = None
-    k: int = 3
+    k: int = 10

-    def __init__(self, req, k: int = 3) -> None:
+    def __init__(self, req, k: int = 10) -> None:
         super().__init__()
         # _filter={}
         # if req.site != []:
@@ -44,24 +45,26 @@ class DocRetriever(BaseRetriever):
     def _get_relevant_documents(self, query: str, *, run_manager) -> list:
         try:
             retrieved_docs = self.retriever.invoke(query)
-            doc_lst = []
+            # doc_lst = []
             for doc in retrieved_docs:
                 # date = str(doc.metadata['publishDate'])
-                doc_lst.append(Document(
-                    page_content = doc.page_content,
-                    metadata = {
-                        "content": doc.page_content,
-                        # "id": doc.metadata['id'],
-                        "title": doc.metadata['subject'],
-                        # "site": doc.metadata['site'],
-                        # "link": doc.metadata['link'],
-                        # "publishDate": doc.metadata['publishDate'].strftime('%Y-%m-%d'),
-                        # 'web': False,
-                        # "source": "Finfast"
-                    }
-                ))
+                doc.metadata['content'] = doc.page_content
+                # doc_lst.append(Document(
+                #     page_content = doc.page_content,
+                #     metadata = doc.metadata
+                #     # metadata = {
+                #     #     "content": doc.page_content,
+                #     #     # "id": doc.metadata['id'],
+                #     #     "title": doc.metadata['subject'],
+                #     #     # "site": doc.metadata['site'],
+                #     #     # "link": doc.metadata['link'],
+                #     #     # "publishDate": doc.metadata['publishDate'].strftime('%Y-%m-%d'),
+                #     #     # 'web': False,
+                #     #     # "source": "Finfast"
+                #     # }
+                # ))
             # print(doc_lst)
-            return doc_lst
+            return retrieved_docs
         except RuntimeError as e:
             logger.error("Error retrieving documents: %s", e)
             return []
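
Net effect of the retriever change: DocRetriever now defaults to k = 10, no longer rebuilds Document objects, and instead mirrors each hit's page_content into metadata['content'] before returning the retrieved list unchanged. Stripped of the commented-out code, the method reduces to roughly this (a condensed reading of the diff above, not new behaviour):

def _get_relevant_documents(self, query: str, *, run_manager) -> list:
    """Query the wrapped vector-store retriever and expose page_content via metadata."""
    try:
        retrieved_docs = self.retriever.invoke(query)
        for doc in retrieved_docs:
            # Downstream consumers read the text from metadata['content'].
            doc.metadata['content'] = doc.page_content
        return retrieved_docs
    except RuntimeError as e:
        logger.error("Error retrieving documents: %s", e)
        return []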
router/main.py CHANGED
@@ -37,11 +37,11 @@ async def stream(query: ReqData):
     # """
     # Endpoint to retrieve chat history.

-    # This endpoint handles POST requests to the "/chat/history" URL. It accepts a
+    # This endpoint handles POST requests to the "/chat/history" URL. It accepts a
     # ChatHistory object as input and returns the chat history.

     # Args:
-    #     chat_history (ChatHistory): The chat history object containing the details
+    #     chat_history (ChatHistory): The chat history object containing the details
     #     of the chat to be retrieved.

     # Returns:
test.py ADDED
@@ -0,0 +1,4 @@
+from controllers import mail
+
+if __name__ == "__main__":
+    mail.collect()
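
The new test.py is a manual smoke test: it imports the mail controller and runs one collection pass with the default query, which covers roughly the last 21 days of mail. An equivalent invocation with an explicit Gmail search string (the date below is only an example, not taken from the commit) would be:

from controllers import mail

if __name__ == "__main__":
    # collect() accepts a Gmail search query; 'after:YYYY/MM/DD' narrows the date range.
    result = mail.collect(query="after:2024/01/01")
    print(result)  # e.g. "<n> emails added to the collection."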
token.pickle CHANGED
Binary files a/token.pickle and b/token.pickle differ
 
utils.py CHANGED
@@ -28,6 +28,7 @@ async def generate(req: ReqData):
         if 'answer' in chunk:
             yield "event: answer\n"
             yield f"data: {json.dumps(chunk)}\n\n"
+            print(chunk['answer'], end="", flush=True)
         elif 'context' in chunk:
             for context in chunk['context']:
                 yield "event: context\n"