gavinzli committed
Commit 1031c5b · 1 Parent(s): c7426d8

Add new test script, update email processing, and adjust retriever settings

Files changed (6)
  1. controllers/mail.py +8 -8
  2. retriever/__init__.py +20 -17
  3. router/main.py +2 -2
  4. test.py +4 -0
  5. token.pickle +0 -0
  6. utils.py +1 -0
controllers/mail.py CHANGED
@@ -40,7 +40,7 @@ def list_emails(messages):
     """List emails from the search results and download attachments."""
     ids = []
     documents = []
-    for message in messages[:100]:
+    for message in messages:
         msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
         metadata = {}
         for header in msg['payload']['headers']:
@@ -54,8 +54,10 @@ def list_emails(messages):
                 metadata['cc'] = header['value']
         metadata['date'] = datetime.fromtimestamp(
             int(msg['internalDate']) / 1000).strftime("%d/%m/%Y %H:%M:%S")
+        logger.info(metadata, msg['id'])
         body = ""
         if 'parts' in msg['payload']:
+            attachment_documents = []
             for part in msg['payload']['parts']:
                 if part['filename']:
                     attachment_id = part['body']['attachmentId']
@@ -67,15 +69,12 @@ def list_emails(messages):
                     with open(path, 'wb') as f:
                         f.write(file_data)
                     if part['filename'].endswith('.pdf'):
-                        attachment_documents = PyPDFLoader(path).load()
-                        documents = documents + attachment_documents
+                        attachment_documents = attachment_documents + PyPDFLoader(path).load()
                     if part['filename'].endswith('.png'):
-                        attachment_documents = UnstructuredImageLoader(path).load()
-                        documents = documents + attachment_documents
+                        attachment_documents = attachment_documents + UnstructuredImageLoader(path).load()
                     if part['filename'].endswith('.csv'):
-                        attachment_documents = CSVLoader(path).load()
-                        documents = documents + attachment_documents
-            for index, document in enumerate(documents):
+                        attachment_documents = attachment_documents + CSVLoader(path).load()
+            for index, document in enumerate(attachment_documents):
                 _id = f"{msg['id']}_{index}"
                 if 'source' in document.metadata:
                     document.metadata['source'] = document.metadata['source'].replace(f"./{ATTACHMENTS_DIR}/", "")
@@ -104,6 +103,7 @@ def collect(query = (datetime.today() - timedelta(days=21)).strftime('after:%Y/%
     """
     emails = search_emails(query)
     if emails:
+        print("Found %d emails:\n", len(emails))
         logger.info("Found %d emails after two_weeks_ago:\n", len(emails))
         return f"{len(list_emails(emails))} emails added to the collection."
     else:
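
The substantive change in list_emails is that attachment documents are now accumulated in a fresh attachment_documents list per message instead of being appended to the shared documents list, so the f"{msg['id']}_{index}" ids are computed per message rather than over everything collected so far. A minimal sketch of that per-message pattern; the import path and the helper names below are assumptions for illustration, not code from this repository:

# Sketch only: loader-by-extension plus per-message id assignment, mirroring the hunks above.
from langchain_community.document_loaders import CSVLoader, PyPDFLoader, UnstructuredImageLoader

def load_attachment_documents(path: str) -> list:
    """Pick a loader by file extension; unsupported types contribute nothing."""
    if path.endswith('.pdf'):
        return PyPDFLoader(path).load()
    if path.endswith('.png'):
        return UnstructuredImageLoader(path).load()
    if path.endswith('.csv'):
        return CSVLoader(path).load()
    return []

def index_attachments(msg_id: str, paths: list) -> dict:
    """Collect one message's attachment documents and key them as '<msg_id>_<index>'."""
    attachment_documents = []  # reset for every message, as in the updated list_emails
    for path in paths:
        attachment_documents = attachment_documents + load_attachment_documents(path)
    return {f"{msg_id}_{index}": doc for index, doc in enumerate(attachment_documents)}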
retriever/__init__.py CHANGED
@@ -1,4 +1,5 @@
 """Module for retrievers that fetch documents from various sources."""
+from importlib import metadata
 from venv import logger
 from langchain_core.retrievers import BaseRetriever
 from langchain_core.vectorstores import VectorStoreRetriever
@@ -23,9 +24,9 @@ class DocRetriever(BaseRetriever):
         list: A list of Document objects with relevant metadata.
     """
     retriever: VectorStoreRetriever = None
-    k: int = 3
+    k: int = 10

-    def __init__(self, req, k: int = 3) -> None:
+    def __init__(self, req, k: int = 10) -> None:
         super().__init__()
         # _filter={}
         # if req.site != []:
@@ -44,24 +45,26 @@ class DocRetriever(BaseRetriever):
     def _get_relevant_documents(self, query: str, *, run_manager) -> list:
         try:
             retrieved_docs = self.retriever.invoke(query)
-            doc_lst = []
+            # doc_lst = []
             for doc in retrieved_docs:
                 # date = str(doc.metadata['publishDate'])
-                doc_lst.append(Document(
-                    page_content = doc.page_content,
-                    metadata = {
-                        "content": doc.page_content,
-                        # "id": doc.metadata['id'],
-                        "title": doc.metadata['subject'],
-                        # "site": doc.metadata['site'],
-                        # "link": doc.metadata['link'],
-                        # "publishDate": doc.metadata['publishDate'].strftime('%Y-%m-%d'),
-                        # 'web': False,
-                        # "source": "Finfast"
-                    }
-                ))
+                doc.metadata['content'] = doc.page_content
+                # doc_lst.append(Document(
+                #     page_content = doc.page_content,
+                #     metadata = doc.metadata
+                #     # metadata = {
+                #     #     "content": doc.page_content,
+                #     #     # "id": doc.metadata['id'],
+                #     #     "title": doc.metadata['subject'],
+                #     #     # "site": doc.metadata['site'],
+                #     #     # "link": doc.metadata['link'],
+                #     #     # "publishDate": doc.metadata['publishDate'].strftime('%Y-%m-%d'),
+                #     #     # 'web': False,
+                #     #     # "source": "Finfast"
+                #     # }
+                # ))
             # print(doc_lst)
-            return doc_lst
+            return retrieved_docs
         except RuntimeError as e:
             logger.error("Error retrieving documents: %s", e)
             return []
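
Net effect of the retriever change: DocRetriever now defaults to k = 10, no longer rebuilds Document objects, and instead mirrors each hit's page_content into metadata['content'] before returning the retrieved list unchanged. Stripped of the commented-out code, the method reduces to roughly this (a condensed reading of the diff above, not new behaviour):

def _get_relevant_documents(self, query: str, *, run_manager) -> list:
    """Query the wrapped vector-store retriever and expose page_content via metadata."""
    try:
        retrieved_docs = self.retriever.invoke(query)
        for doc in retrieved_docs:
            # Downstream consumers read the text from metadata['content'].
            doc.metadata['content'] = doc.page_content
        return retrieved_docs
    except RuntimeError as e:
        logger.error("Error retrieving documents: %s", e)
        return []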
router/main.py CHANGED
@@ -37,11 +37,11 @@ async def stream(query: ReqData):
     # """
     # Endpoint to retrieve chat history.

-    # This endpoint handles POST requests to the "/chat/history" URL. It accepts a
+    # This endpoint handles POST requests to the "/chat/history" URL. It accepts a
     # ChatHistory object as input and returns the chat history.

     # Args:
-    #     chat_history (ChatHistory): The chat history object containing the details
+    #     chat_history (ChatHistory): The chat history object containing the details
     #     of the chat to be retrieved.

     # Returns:
test.py ADDED
@@ -0,0 +1,4 @@
+from controllers import mail
+
+if __name__ == "__main__":
+    mail.collect()
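
The new test.py is a manual smoke test: it imports the mail controller and runs one collection pass with the default query, which covers roughly the last 21 days of mail. An equivalent invocation with an explicit Gmail search string (the date below is only an example, not taken from the commit) would be:

from controllers import mail

if __name__ == "__main__":
    # collect() accepts a Gmail search query; 'after:YYYY/MM/DD' narrows the date range.
    result = mail.collect(query="after:2024/01/01")
    print(result)  # e.g. "<n> emails added to the collection."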
token.pickle CHANGED
Binary files a/token.pickle and b/token.pickle differ
 
utils.py CHANGED
@@ -28,6 +28,7 @@ async def generate(req: ReqData):
         if 'answer' in chunk:
             yield "event: answer\n"
             yield f"data: {json.dumps(chunk)}\n\n"
+            print(chunk['answer'], end="", flush=True)
         elif 'context' in chunk:
             for context in chunk['context']:
                 yield "event: context\n"