Add new test script, update email processing, and adjust retriever settings
- controllers/mail.py +8 -8
- retriever/__init__.py +20 -17
- router/main.py +2 -2
- test.py +4 -0
- token.pickle +0 -0
- utils.py +1 -0
controllers/mail.py
CHANGED
@@ -40,7 +40,7 @@ def list_emails(messages):
     """List emails from the search results and download attachments."""
     ids = []
     documents = []
-    for message in messages
+    for message in messages:
         msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
         metadata = {}
         for header in msg['payload']['headers']:
@@ -54,8 +54,10 @@ def list_emails(messages):
                 metadata['cc'] = header['value']
         metadata['date'] = datetime.fromtimestamp(
             int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
+        logger.info(metadata, msg['id'])
         body = ""
         if 'parts' in msg['payload']:
+            attachment_documents = []
             for part in msg['payload']['parts']:
                 if part['filename']:
                     attachment_id = part['body']['attachmentId']
@@ -67,15 +69,12 @@ def list_emails(messages):
                     with open(path, 'wb') as f:
                         f.write(file_data)
                     if part['filename'].endswith('.pdf'):
-                        attachment_documents = PyPDFLoader(path).load()
-                        documents = documents + attachment_documents
+                        attachment_documents = attachment_documents + PyPDFLoader(path).load()
                     if part['filename'].endswith('.png'):
-                        attachment_documents = UnstructuredImageLoader(path).load()
-                        documents = documents + attachment_documents
+                        attachment_documents = attachment_documents + UnstructuredImageLoader(path).load()
                     if part['filename'].endswith('.csv'):
-                        attachment_documents = CSVLoader(path).load()
-                        documents = documents + attachment_documents
-            for index, document in enumerate(documents):
+                        attachment_documents = attachment_documents + CSVLoader(path).load()
+            for index, document in enumerate(attachment_documents):
                 _id = f"{msg['id']}_{index}"
                 if 'source' in document.metadata:
                     document.metadata['source'] = document.metadata['source'].replace(f"./{ATTACHMENTS_DIR}/", "")
@@ -104,6 +103,7 @@ def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%
     """
     emails = search_emails(query)
     if emails:
+        print("Found %d emails:\n", len(emails))
         logger.info("Found %d emails after two_weeks_ago:\n", len(emails))
         return f"{len(list_emails(emails))} emails added to the collection."
     else:
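The substantive change in this file is that each message now accumulates loader output into its own attachment_documents list and enumerates that list, instead of overwriting the list per attachment and indexing the shared documents list. Below is a minimal sketch of that accumulation pattern; the loader classes are assumed to come from langchain_community (the file's actual import block is not visible in this diff), and load_attachments is an illustrative helper, not a function in the repository.

# Sketch only: accumulate documents from already-downloaded attachment files.
# Assumes the loaders live in langchain_community.document_loaders.
from langchain_community.document_loaders import CSVLoader, PyPDFLoader, UnstructuredImageLoader

LOADERS = {".pdf": PyPDFLoader, ".png": UnstructuredImageLoader, ".csv": CSVLoader}

def load_attachments(paths):
    """Extend one list per message rather than overwriting it per attachment."""
    attachment_documents = []
    for path in paths:
        for suffix, loader_cls in LOADERS.items():
            if path.endswith(suffix):
                attachment_documents = attachment_documents + loader_cls(path).load()
    return attachment_documents

With this shape, enumerate(attachment_documents) sees every document from every attachment of the message, which is what the new for index, document in enumerate(attachment_documents) loop above relies on.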
retriever/__init__.py
CHANGED
@@ -1,4 +1,5 @@
 """Module for retrievers that fetch documents from various sources."""
+from importlib import metadata
 from venv import logger
 from langchain_core.retrievers import BaseRetriever
 from langchain_core.vectorstores import VectorStoreRetriever
@@ -23,9 +24,9 @@ class DocRetriever(BaseRetriever):
         list: A list of Document objects with relevant metadata.
     """
     retriever: VectorStoreRetriever = None
-    k: int =
+    k: int = 10
 
-    def __init__(self, req, k: int =
+    def __init__(self, req, k: int = 10) -> None:
         super().__init__()
         # _filter={}
         # if req.site != []:
@@ -44,24 +45,26 @@ class DocRetriever(BaseRetriever):
     def _get_relevant_documents(self, query: str, *, run_manager) -> list:
         try:
            retrieved_docs = self.retriever.invoke(query)
-            doc_lst = []
+            # doc_lst = []
             for doc in retrieved_docs:
                 # date = str(doc.metadata['publishDate'])
-                [removed lines 50-62 not captured in this view]
+                doc.metadata['content'] = doc.page_content
+                # doc_lst.append(Document(
+                #     page_content = doc.page_content,
+                #     metadata = doc.metadata
+                #     # metadata = {
+                #     #     "content": doc.page_content,
+                #     #     # "id": doc.metadata['id'],
+                #     #     "title": doc.metadata['subject'],
+                #     #     # "site": doc.metadata['site'],
+                #     #     # "link": doc.metadata['link'],
+                #     #     # "publishDate": doc.metadata['publishDate'].strftime('%Y-%m-%d'),
+                #     #     # 'web': False,
+                #     #     # "source": "Finfast"
+                #     # }
+                # ))
             # print(doc_lst)
-            return
+            return retrieved_docs
         except RuntimeError as e:
             logger.error("Error retrieving documents: %s", e)
             return []
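Functionally, the retriever change amounts to three things: a default of k = 10, mirroring each document's page_content into metadata['content'], and returning the retrieved documents directly instead of rebuilding a doc_lst. A stripped-down sketch of that behaviour follows; the class name and the missing vector-store setup are illustrative assumptions, not the project's actual DocRetriever.

# Sketch only: the minimal retriever behaviour after this commit.
from langchain_core.retrievers import BaseRetriever
from langchain_core.vectorstores import VectorStoreRetriever

class ContentEchoRetriever(BaseRetriever):
    """Illustrative retriever: returns raw hits with page_content mirrored into metadata."""
    retriever: VectorStoreRetriever = None
    k: int = 10  # default top-k, matching the value introduced in this commit

    def _get_relevant_documents(self, query: str, *, run_manager) -> list:
        docs = self.retriever.invoke(query)
        for doc in docs:
            # Same idea as the new doc.metadata['content'] line above.
            doc.metadata["content"] = doc.page_content
        return docs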
router/main.py
CHANGED
@@ -37,11 +37,11 @@ async def stream(query: ReqData):
     # """
     # Endpoint to retrieve chat history.
 
-    # This endpoint handles POST requests to the "/chat/history" URL. It accepts a
+    # This endpoint handles POST requests to the "/chat/history" URL. It accepts a
     # ChatHistory object as input and returns the chat history.
 
     # Args:
-    #     chat_history (ChatHistory): The chat history object containing the details
+    #     chat_history (ChatHistory): The chat history object containing the details
     #         of the chat to be retrieved.
 
     # Returns:
test.py
ADDED
@@ -0,0 +1,4 @@
+from controllers import mail
+
+if __name__ == "__main__":
+    mail.collect()
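The new script simply drives mail.collect() outside the API. Since collect() takes a Gmail search query that defaults to the last 21 days (per the signature in the hunk header above), a hypothetical variation, not part of this commit, could narrow the window:

# Hypothetical variation of test.py: collect only the last 7 days by passing
# an explicit 'after:' query to mail.collect().
from datetime import datetime, timedelta
from controllers import mail

if __name__ == "__main__":
    query = (datetime.today() - timedelta(days=7)).strftime("after:%Y/%m/%d")
    print(mail.collect(query))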
token.pickle
CHANGED
Binary files a/token.pickle and b/token.pickle differ
utils.py
CHANGED
@@ -28,6 +28,7 @@ async def generate(req: ReqData):
         if 'answer' in chunk:
             yield "event: answer\n"
             yield f"data: {json.dumps(chunk)}\n\n"
+            print(chunk['answer'], end="", flush=True)
         elif 'context' in chunk:
             for context in chunk['context']:
                 yield "event: context\n"
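The added print() only echoes each streamed token to the server console; the bytes sent to the client are unchanged. For reference, assuming a chunk shaped like {"answer": "Hello"}, the two yield lines above frame it as a standard SSE event:

# Illustration of the SSE framing produced by the generator (assumed chunk shape).
import json

chunk = {"answer": "Hello"}
event = "event: answer\n" + f"data: {json.dumps(chunk)}\n\n"
print(repr(event))  # -> 'event: answer\ndata: {"answer": "Hello"}\n\n'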