Spaces:

Oxbridge-Economics
/

knowledge-base

Running

App Files Community

gavinzli committed 7 days ago

Commit

af61c79

1 Parent(s): a111c12

Refactor email handling: remove mail module, update service routes, and enhance EmailQuery model with additional parameters

Browse files

Files changed (11) hide show

.gitignore +1 -0
app/controllers/__init__.py +0 -0
app/controllers/mail.py +0 -318
app/main.py +2 -2
app/models/db/__init__.py +7 -5
app/models/llm/__init__.py +1 -82
app/router/mail.py +0 -75
app/router/service.py +40 -0
app/schema/__init__.py +44 -5
app/services/__init__.py +6 -0
app/services/gmail.py +322 -0

.gitignore CHANGED Viewed

@@ -183,3 +183,4 @@ models/chroma/data/*.bin
 models/chroma/_data/chroma.sqlite3
 models/chroma/data/chroma.sqlite3
 cache

 models/chroma/_data/chroma.sqlite3
 models/chroma/data/chroma.sqlite3
 cache
+_cache

app/controllers/__init__.py DELETED Viewed

File without changes

app/controllers/mail.py DELETED Viewed

@@ -1,318 +0,0 @@
-"""Module to search and list emails from Gmail."""
-import base64
-import hashlib
-import os
-import re
-from datetime import datetime, timedelta
-from venv import logger
-from ics import Calendar
-from langchain_community.document_loaders import (
-    CSVLoader,
-    PyPDFLoader,
-    UnstructuredExcelLoader,
-    UnstructuredImageLoader,
-)
-from langchain_core.documents import Document
-from models.db import vectorstore
-SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
-EMAIL_PATTERN = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
-ATTACHMENTS_DIR = "cache"
-os.makedirs(ATTACHMENTS_DIR, exist_ok=True)
-def build_query(params):
-    """
-    Constructs a query string based on the provided parameters.
-    Args:
-        params (dict): A dictionary containing optional query parameters.
-            Supported keys include:
-                - 'subject' (str): The subject of the email.
-                - 'from' (str): The sender's email address.
-                - 'to' (str): The recipient's email address.
-                - 'cc' (str): The CC recipient's email address.
-                - 'after' (str): A date string to filter emails sent after this date.
-                - 'before' (str): A date string to filter emails sent before this date.
-    Returns:
-        str: A query string constructed from the provided parameters. Each parameter
-        is formatted as a key-value pair and joined by spaces. If a parameter is not
-        provided or is empty, it is excluded from the query string.
-    """
-    query_parts = []
-    if 'subject' in params and params['subject']:
-        query_parts.append(f'subject:"{params["subject"]}"')
-    if 'from' in params and params['from']:
-        query_parts.append(f'from:{params["from"]}')
-    if 'to' in params and params['to']:
-        query_parts.append(f'to:{params["to"]}')
-    if 'cc' in params and params['cc']:
-        query_parts.append(f'cc:{params["cc"]}')
-    if 'after' in params and params['after']:
-        query_parts.append(f'after:{params["after"]}')
-    if 'before' in params and params['before']:
-        query_parts.append(f'before:{params["before"]}')
-    return ' '.join(query_parts)
-def search_emails(service, query):
-    """Search emails based on a query."""
-    result = service.users().messages().list(userId="me", q=query).execute()
-    messages = []
-    if "messages" in result:
-        messages.extend(result["messages"])
-    while "nextPageToken" in result:
-        page_token = result["nextPageToken"]
-        result = (
-            service.users().messages().list(userId="me", q=query, pageToken=page_token).execute()
-        )
-        if "messages" in result:
-            messages.extend(result["messages"])
-    return messages
-def list_emails(service, messages):
-    """
-    Processes a list of email messages, extracts metadata, decodes content, and handles attachments.
-    Args:
-        messages (list): A list of email message dictionaries, where each dictionary contains
-                        at least an 'id' key representing the email's unique identifier.
-    Returns:
-        None: The function processes the emails and adds the extracted documents to a vector store.
-    Functionality:
-        - Retrieves email details using the Gmail API.
-        - Extracts metadata such as sender, recipient, subject, CC, and date.
-        - Decodes email content in plain text or HTML format.
-        - Handles multipart emails, including attachments.
-        - Processes attachments based on their MIME type:
-            - PDF files are loaded using PyPDFLoader.
-            - Images (PNG, JPEG) are loaded using UnstructuredImageLoader.
-            - CSV files are loaded using CSVLoader.
-            - Excel files are loaded using UnstructuredExcelLoader.
-            - Calendar files (ICS) are parsed to extract event details.
-        - Removes HTML tags from email content.
-        - Stores processed documents and metadata in a vector store.
-        - Deletes temporary files created during attachment processing.
-    Notes:
-        - The function assumes the existence of a global `service` object for Gmail API.
-        - The `vectorstore.add_documents` method is used to store the processed documents.
-        - Attachments are temporarily saved in `ATTACHMENTS_DIR` and deleted after processing.
-        - The function logs information about attachments being downloaded.
-    """
-    ids = []
-    documents = []
-    for message in messages:
-        msg = service.users().messages().get(userId="me", id=message["id"], format="full").execute()
-        metadata = {}
-        metadata["threadId"] = msg["threadId"]
-        metadata["msgId"] = msg["id"]
-        msgId = f"{msg['threadId']}-{msg['id']}"
-        for header in msg["payload"]["headers"]:
-            if header["name"] == "From":
-                metadata["from"] = header["value"]
-            elif header["name"] == "To":
-                metadata["to"] = header["value"]
-            elif header["name"] == "Subject":
-                metadata["subject"] = header["value"]
-                logger.info("subject: %s", metadata["subject"])
-            elif header["name"] == "Cc":
-                metadata["cc"] = header["value"]
-        metadata["date"] = datetime.fromtimestamp(int(msg["internalDate"]) / 1000).strftime(
-            "%d/%m/%Y %H:%M:%S"
-        )
-        metadata["userId"] = service.users().getProfile(userId="me").execute().get("emailAddress")
-        ids = []
-        documents = []
-        mime_types = []
-        if msg["payload"]["mimeType"] in [
-            "multipart/alternative",
-            "multipart/related",
-            "multipart/mixed",
-        ]:
-            mime_types = []
-            attach_docs = []
-            for part in msg["payload"]["parts"]:
-                print("mimeType: ", part["mimeType"])
-                mime_types.append(part["mimeType"])
-                if part["mimeType"] == "text/plain" and "text/html" not in mime_types:
-                    body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
-                    body = re.sub(r"<[^>]+>", "", body)  # Remove HTML tags
-                    metadata["mimeType"] = part["mimeType"]
-                    documents.append(Document(page_content=body, metadata=metadata))
-                    ids.append(msg["id"])
-                elif part["mimeType"] == "text/html" and "text/plain" not in mime_types:
-                    body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
-                    body = re.sub(r"<[^>]+>", "", body)
-                    metadata["mimeType"] = part["mimeType"]
-                    documents.append(Document(page_content=body, metadata=metadata))
-                    ids.append(msg["id"])
-                if part["filename"]:
-                    attachment_id = part["body"]["attachmentId"]
-                    logger.info("Downloading attachment: %s", part["filename"])
-                    attachment = (
-                        service.users()
-                        .messages()
-                        .attachments()
-                        .get(userId="me", messageId=message["id"], id=attachment_id)
-                        .execute()
-                    )
-                    file_data = base64.urlsafe_b64decode(attachment["data"].encode("UTF-8"))
-                    path = os.path.join(".", ATTACHMENTS_DIR, part["filename"])
-                    with open(path, "wb") as f:
-                        f.write(file_data)
-                    if part["mimeType"] == "application/pdf":
-                        attach_docs = PyPDFLoader(path).load()
-                    elif part["mimeType"] == "image/png" or part["mimeType"] == "image/jpeg":
-                        try:
-                            attach_docs = UnstructuredImageLoader(path).load()
-                        except Exception as e:
-                            logger.error("Error loading image: %s", e)
-                    elif part["filename"].endswith(".csv"):
-                        attach_docs = CSVLoader(path).load()
-                    elif (
-                        part["mimeType"]
-                        == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-                    ):
-                        attach_docs = UnstructuredExcelLoader(path).load()
-                    elif part["mimeType"] == "application/ics":
-                        with open(path, "r", encoding="utf-8") as f:
-                            calendar = Calendar(f.read())
-                        for event in calendar.events:
-                            documents.append(
-                                Document(
-                                    page_content=f"Event: {event.name}\n\Description: {event.description}\nStart: {event.begin}\nEnd: {event.end}",
-                                    metadata={
-                                        "attachment": part["filename"],
-                                        "mimeType": part["mimeType"],
-                                        "location": event.location,
-                                        "created": event.created.strftime("%d/%m/%Y %H:%M:%S"),
-                                        "last_modified": event.last_modified.strftime(
-                                            "%d/%m/%Y %H:%M:%S"
-                                        ),
-                                        "start": event.begin.strftime("%d/%m/%Y %H:%M:%S"),
-                                        "end": event.end.strftime("%d/%m/%Y %H:%M:%S"),
-                                    },
-                                )
-                            )
-                            ids.append(f"{msgId}-{part['filename']}-{hashlib.sha256(file_data).hexdigest()}")
-                    if os.path.exists(path):
-                        os.remove(path)
-                    for index, document in enumerate(attach_docs or []):
-                        document.metadata["mimeType"] = part["mimeType"]
-                        if "page_label" in document.metadata:
-                            document.metadata["page"] = document.metadata["page_label"]
-                        document.metadata["attachment"] = part["filename"]
-                        document.metadata = {
-                            key: value
-                            for key, value in document.metadata.items()
-                            if key in ["attachment", "page"]
-                        }
-                        document.metadata.update(metadata)
-                        documents.append(document)
-                        ids.append(f"{msgId}-{hashlib.sha256(file_data).hexdigest()}-{index}")
-        elif msg["payload"]["mimeType"] == "text/plain" and "data" in msg["payload"]["body"]:
-            body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
-            body = re.sub(r"<[^>]+>", "", body)
-            metadata["mimeType"] = msg["payload"]["mimeType"]
-            documents.append(Document(page_content=body, metadata=metadata))
-            ids.append(msgId)
-        elif msg["payload"]["mimeType"] == "text/html" and "data" in msg["payload"]["body"]:
-            body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
-            body = re.sub(r"<[^>]+>", "", body)
-            metadata["mimeType"] = msg["payload"]["mimeType"]
-            documents.append(Document(page_content=body, metadata=metadata))
-            ids.append(msgId)
-        if "multipart/alternative" in mime_types and len(mime_types) == 1:
-            print("Only multipart/alternative found in the email.")
-        else:
-            try:
-                vectorstore.add_documents(documents=documents, ids=ids)
-            except Exception as e:
-                logger.error("Error adding documents to vectorstore: %s", e)
-def collect(service, query=(datetime.today() - timedelta(days=10)).strftime("after:%Y/%m/%d")):
-    """
-    Main function to search and list emails from Gmail.
-    This function builds a Gmail service, constructs a query to search for emails
-    received in the last 14 days, and lists the found emails. If no emails are found,
-    it prints a message indicating so.
-    Returns:
-        None
-    """
-    # query = "subject:Re: Smartcareers algorithm debug and improvement'"
-    emails = search_emails(service, query)
-    if emails:
-        logger.info("Found %d emails:\n", len(emails))
-        logger.info("Found %d emails after two_weeks_ago:\n", len(emails))
-        list_emails(service, emails)
-        logger.info("Listing emails...")
-        return f"{len(emails)} emails added to the collection."
-    else:
-        logger.info("No emails found after two weeks ago.")
-def get_emails(service, query, max_results=10):
-    """
-    Retrieve a list of emails with subject, to, from, cc, and content.
-    Args:
-        mailservice: Authenticated Gmail API service instance
-        max_results: Maximum number of emails to retrieve
-    Returns:
-        List of dictionaries containing email details
-    """
-    try:
-        # List messages
-        query = build_query(query.dict())
-        response = service.users().messages().list(
-            userId='me', q=query, maxResults=max_results).execute()
-        messages = response.get('messages', [])
-        email_list = []
-        if not messages:
-            return email_list
-        for message in messages:
-            msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
-            headers = msg['payload']['headers']
-            email_data = {
-                'subject': '',
-                'from': '',
-                'to': '',
-                'cc': '',
-                'content': '',
-                'snippet': msg['snippet'] if 'snippet' in msg else '',
-            }
-            for header in headers:
-                name = header['name'].lower()
-                if name == 'subject':
-                    email_data['subject'] = header['value']
-                elif name == 'from':
-                    email_data['from'] = header['value']
-                elif name == 'to':
-                    email_data['to'] = header['value']
-                elif name == 'cc':
-                    email_data['cc'] = header['value']
-            if 'parts' in msg['payload']:
-                for part in msg['payload']['parts']:
-                    if part['mimeType'] == 'text/plain':
-                        email_data['content'] = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
-                        break
-                    elif part['mimeType'] == 'text/html':
-                        email_data['content'] = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
-                        break
-            elif 'data' in msg['payload']['body']:
-                email_data['content'] = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode('utf-8')
-            email_list.append(email_data)
-        return email_list
-    except Exception as e:
-        print(f"An error occurred: {e}")
-        return []

app/main.py CHANGED Viewed

@@ -4,7 +4,7 @@ import logging
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from jose import jwt
-from router import auth, content, mail
 from starlette.middleware.base import BaseHTTPMiddleware
 SECRET_KEY = "your-secret-key"
@@ -65,7 +65,7 @@ logging.getLogger().setLevel(logging.INFO)
 app = FastAPI(docs_url="/")
 app.include_router(content.router)
-app.include_router(mail.router)
 app.include_router(auth.router)
 origins = [

 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from jose import jwt
+from router import auth, content, service
 from starlette.middleware.base import BaseHTTPMiddleware
 SECRET_KEY = "your-secret-key"
 app = FastAPI(docs_url="/")
 app.include_router(content.router)
+app.include_router(service.router)
 app.include_router(auth.router)
 origins = [

app/models/db/__init__.py CHANGED Viewed

@@ -1,12 +1,14 @@
 """This module is responsible for initializing the database connection and creating the necessary tables."""
 from pinecone import Pinecone, ServerlessSpec
 from langchain_pinecone import PineconeVectorStore
-from models.llm import EmbeddingsModel
-embeddings = EmbeddingsModel("all-MiniLM-L6-v2")
 pc = Pinecone()
-INDEX_NAME = "mails"
 if not pc.has_index(INDEX_NAME):
     pc.create_index(
         name=INDEX_NAME,
@@ -15,7 +17,7 @@ if not pc.has_index(INDEX_NAME):
         spec=ServerlessSpec(
             cloud="aws",
             region="us-east-1"
-        )
     )
 index = pc.Index(INDEX_NAME)
-vectorstore = PineconeVectorStore(index=index, embedding=embeddings)

 """This module is responsible for initializing the database connection and creating the necessary tables."""
 from pinecone import Pinecone, ServerlessSpec
 from langchain_pinecone import PineconeVectorStore
+# from torch import embedding
+from models.llm import GPTEmbeddings
+# embeddings = EmbeddingsModel("all-MiniLM-L6-v2")
+embeddings = GPTEmbeddings()
 pc = Pinecone()
+INDEX_NAME = "gmails"
 if not pc.has_index(INDEX_NAME):
     pc.create_index(
         name=INDEX_NAME,
         spec=ServerlessSpec(
             cloud="aws",
             region="us-east-1"
+        )
     )
 index = pc.Index(INDEX_NAME)
+vectorstore = PineconeVectorStore(index=index, embedding=embeddings)

app/models/llm/__init__.py CHANGED Viewed

@@ -1,13 +1,8 @@
 """Module for OpenAI model and embeddings."""
-# import os
 from typing import List
-# import onnxruntime as ort
 from langchain.embeddings.base import Embeddings
 from sentence_transformers import SentenceTransformer
 from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
-# from langchain_huggingface import HuggingFacePipeline
-# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-# from huggingface_hub import hf_hub_download
 class GPTModel(AzureChatOpenAI):
     """
@@ -40,67 +35,6 @@ class GPTEmbeddings(AzureOpenAIEmbeddings):
         Inherits all methods from AzureOpenAIEmbeddings.
     """
-# class Phi4MiniONNXLLM:
-#     """
-#     A class for interfacing with a pre-trained ONNX model for inference.
-#     Attributes:
-#         session (onnxruntime.InferenceSession): The ONNX runtime inference session for the model.
-#         input_name (str): The name of the input node in the ONNX model.
-#         output_name (str): The name of the output node in the ONNX model.
-#     Methods:
-#         __init__(model_path):
-#             Initializes the Phi4MiniONNXLLM instance by loading the ONNX model from specified path.
-#         __call__(input_ids):
-#             Performs inference on the given input data and returns the model's output.
-#     """
-#     def __init__(self, repo_id, subfolder, onnx_file="model.onnx", weights_file="model.onnx.data"):
-#         self.repo_id = repo_id
-#         model_path = hf_hub_download(repo_id=repo_id, filename=f"{subfolder}/{onnx_file}")
-#         weights_path = hf_hub_download(repo_id=repo_id, filename=f"{subfolder}/{weights_file}")
-#         self.session = ort.InferenceSession(model_path)
-#         # Verify both files exist
-#         print(f"Model path: {model_path}, Exists: {os.path.exists(model_path)}")
-#         print(f"Weights path: {weights_path}, Exists: {os.path.exists(weights_path)}")
-#         self.input_name = self.session.get_inputs()[0].name
-#         self.output_name = self.session.get_outputs()[0].name
-#     def __call__(self, input_text):
-#         # Assuming input_ids is a tensor or numpy array
-#         tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-mini-instruct-onnx")
-#         inputs = tokenizer(input_text, return_tensors="pt")
-#         input_feed = {
-#             self.input_name: inputs["input_ids"].numpy(),
-#             "attention_mask": inputs["attention_mask"].numpy(),
-#             # Add past_key_values if applicable
-#         }
-#         outputs = self.session.run([self.output_name], input_feed)
-#         return outputs
-# class HuggingfaceModel(HuggingFacePipeline):
-#     """
-#     HuggingfaceModel is a wrapper class for the Hugging Face text-generation pipeline.
-#     Attributes:
-#         name (str): The name or path of the pre-trained model to load from Hugging Face.
-#         max_tokens (int): The maximum number of new tokens to generate in the text output.
-#         Defaults to 200.
-#     Methods:
-#         __init__(name, max_tokens=200):
-#             Initializes the HuggingfaceModel with the specified model name and maximum token limit.
-#     """
-#     def __init__(self, name, max_tokens=500):
-#         super().__init__(pipeline=pipeline(
-#             "text-generation",
-#             model=AutoModelForCausalLM.from_pretrained(name),
-#             tokenizer=AutoTokenizer.from_pretrained(name),
-#             max_new_tokens=max_tokens
-#             )
-#         )
 class EmbeddingsModel(Embeddings):
     """
     A model for generating embeddings using SentenceTransformer.
@@ -113,7 +47,7 @@ class EmbeddingsModel(Embeddings):
         Initializes the Chroma model with the specified model name.
         Args:
-            model_name (str): The name of the model to be used for sentence transformation.
         """
         self.model = SentenceTransformer(model_name)
@@ -140,18 +74,3 @@ class EmbeddingsModel(Embeddings):
             List[float]: The embedded representation of the query as a list of floats.
         """
         return self.model.encode([query]).tolist()[0]
-# model_name = "microsoft/phi-1_5"
-# tokenizer = AutoTokenizer.from_pretrained(model_name)
-# model = AutoModelForCausalLM.from_pretrained(model_name)
-# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
-# phi4_llm = HuggingFacePipeline(pipeline=pipe)
-# tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", pad_token_id=50256)
-# model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
-# pipe = pipeline(
-#     "text-generation", model=model, tokenizer=tokenizer,
-#       max_new_tokens=10, truncation=True,  # Truncate input sequences
-# )
-# phi4_llm = HuggingFacePipeline(pipeline=pipe)

 """Module for OpenAI model and embeddings."""
 from typing import List
 from langchain.embeddings.base import Embeddings
 from sentence_transformers import SentenceTransformer
 from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
 class GPTModel(AzureChatOpenAI):
     """
         Inherits all methods from AzureOpenAIEmbeddings.
     """
 class EmbeddingsModel(Embeddings):
     """
     A model for generating embeddings using SentenceTransformer.
         Initializes the Chroma model with the specified model name.
         Args:
+            model_name (str): The name of the model to be used for embedding.
         """
         self.model = SentenceTransformer(model_name)
             List[float]: The embedded representation of the query as a list of floats.
         """
         return self.model.encode([query]).tolist()[0]

app/router/mail.py DELETED Viewed

@@ -1,75 +0,0 @@
-"""Module for defining the main routes of the API."""
-import os
-import pickle
-import threading
-from venv import logger
-from fastapi import APIRouter, Request
-from fastapi.responses import JSONResponse
-from controllers import mail
-from google.oauth2.credentials import Credentials
-from googleapiclient.discovery import build
-from schema import MailReqData
-router = APIRouter(prefix="/mail", tags=["mail"])
-@router.post("")
-def collect(query: MailReqData, request: Request):
-    """
-    Handles the chat POST request.
-    Args:
-        query (ReqData): The request data containing the query parameters.
-    Returns:
-        str: The generated response from the chat function.
-    """
-    try:
-        if os.path.exists(f"cache/{query.email}.pickle"):
-            with open(f"cache/{query.email}.pickle", "rb") as token:
-                credentials = pickle.load(token)
-        else:
-            cred_dict = request.state.session.get("credential")
-            credentials = Credentials(
-                token=cred_dict["token"],
-                refresh_token=cred_dict["refresh_token"],
-                token_uri=cred_dict["token_uri"],
-                client_id=cred_dict["client_id"],
-                client_secret=cred_dict["client_secret"],
-                scopes=cred_dict["scopes"],
-            )
-        mailservice = build("gmail", "v1", credentials=credentials)
-        threading.Thread(target=mail.collect, args=(mailservice, query.query)).start()
-        return JSONResponse(content={"message": "Mail collection in progress."})
-    except Exception as e:
-        logger.error("Error collecting mail: %s", e)
-        return JSONResponse(content={"error": str(e)}, status_code=500)
-@router.get("")
-def get(query: MailReqData, request: Request):
-    """
-    Handles the chat POST request.
-    Args:
-        query (ReqData): The request data containing the query parameters.
-    Returns:
-        str: The generated response from the chat function.
-    """
-    if os.path.exists(f"cache/{query.email}.pickle"):
-            with open(f"cache/{query.email}.pickle", "rb") as token:
-                credentials = pickle.load(token)
-    else:
-        cred_dict = request.state.session.get("credential")
-        credentials = Credentials(
-            token=cred_dict["token"],
-            refresh_token=cred_dict["refresh_token"],
-            token_uri=cred_dict["token_uri"],
-            client_id=cred_dict["client_id"],
-            client_secret=cred_dict["client_secret"],
-            scopes=cred_dict["scopes"],
-        )
-    mailservice = build("gmail", "v1", credentials=credentials)
-    result = mail.get_emails(mailservice, query.query, query.query.max_results)
-    return JSONResponse(content= result)

app/router/service.py ADDED Viewed

	@@ -0,0 +1,40 @@

+"""Module for defining the main routes of the API."""
+import threading
+from fastapi import APIRouter, Request
+from fastapi.responses import JSONResponse
+from services import GmailService
+from schema import EmailQuery
+router = APIRouter(prefix="/service", tags=["mail"])
+@router.post("/gmail")
+def collect(query: EmailQuery, request: Request) -> JSONResponse:
+    """
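+    Handles the POST request that starts a Gmail collection job.
+    The collection runs in a background thread, so the response is returned
+    without waiting for the collection to finish.
+    Args:
+        query (EmailQuery): The search parameters passed to the Gmail service.
+        request (Request): The incoming request; its "Google-Token" header is
+            used to build the Gmail service.
+    Returns:
+        JSONResponse: A message stating that mail collection is in progress.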
+    """
+    service = GmailService(request.headers.get("Google-Token"))
+    threading.Thread(target=service.collect, args=[query]).start()
+    return JSONResponse(content={"message": "Mail collection in progress."})
+@router.get("/gmail")
+def get(query: EmailQuery, request: Request) -> JSONResponse:
+    """
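+    Handles the GET request that retrieves emails from Gmail.
+    Args:
+        query (EmailQuery): The search parameters; its `max_results` value is
+            passed to the Gmail service alongside the query.
+        request (Request): The incoming request; its "Google-Token" header is
+            used to build the Gmail service.
+    Returns:
+        JSONResponse: The result returned by the Gmail service for the query.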
+    """
+    service = GmailService(request.headers.get("Google-Token"))
+    result = service.get(query, query.max_results)
+    return JSONResponse(content = result)

app/schema/__init__.py CHANGED Viewed

@@ -1,7 +1,10 @@
 """Module containing the data models for the application."""
-from typing import Optional, List
 from pydantic import BaseModel, Field
 class EmailQuery(BaseModel):
     """
     EmailQuery model representing the structure of an email query.
@@ -11,14 +14,52 @@ class EmailQuery(BaseModel):
         from_email (Optional[str]): The sender's email address.
         to_email (Optional[str]): The recipient's email address.
         cc_email (Optional[str]): The CC email address.
         after (Optional[str]): The date after which to search for emails.
         max_results (Optional[int]): The maximum number of results to return.
     """
-    subject: Optional[str]
     from_email: Optional[str] = Field(None, alias="from")
     to_email: Optional[str] = Field(None, alias="to")
     cc_email: Optional[str] = Field(None, alias="cc")
-    after: Optional[str]
     max_results: Optional[int] = 10
 class ReqData(BaseModel):
@@ -43,10 +84,8 @@ class MailReqData(BaseModel):
     MailReqData is a data model representing the structure of a mail request.
     Attributes:
-        email (str): The email address of the sender.
         query (str): The query or message content sent by the user.
     """
-    email: str
     query: EmailQuery
 class ReqFollowUp(BaseModel):

 """Module containing the data models for the application."""
+from datetime import datetime, timedelta
+from typing import List, Optional
 from pydantic import BaseModel, Field
 class EmailQuery(BaseModel):
     """
     EmailQuery model representing the structure of an email query.
         from_email (Optional[str]): The sender's email address.
         to_email (Optional[str]): The recipient's email address.
         cc_email (Optional[str]): The CC email address.
+        has_words (Optional[str]): Words that the email must contain.
+        not_has_words (Optional[str]): Words that the email must not contain.
+        size (Optional[int]): The size of the email in bytes.
+        date_within (Optional[str]): The date within which to search for emails.
         after (Optional[str]): The date after which to search for emails.
         max_results (Optional[int]): The maximum number of results to return.
     """
+    subject: Optional[str] = None
     from_email: Optional[str] = Field(None, alias="from")
     to_email: Optional[str] = Field(None, alias="to")
     cc_email: Optional[str] = Field(None, alias="cc")
+    has_words: Optional[str] = None
+    not_has_words: Optional[str] = None
+    size: Optional[int] = None
+    before: Optional[str] = None
+    after: Optional[str] = None
+    @classmethod
+    def validate_before_after(
+        cls, before: Optional[str], after: Optional[str]) -> tuple[Optional[str], Optional[str]]:
+        """
+        Validates and adjusts the 'before' and 'after' date parameters.
+        This method ensures that the 'before' date is greater than the 'after' date.
+        If 'before' is not provided, it defaults to six months prior to the current date.
+        Args:
+            before (Optional[str]): The 'before' date in the format "YYYY/MM/DD". Defaults to None.
+            after (Optional[str]): The 'after' date in the format "YYYY/MM/DD". Defaults to None.
+        Returns:
+            tuple[Optional[str], Optional[str]]:
+            A tuple containing the validated 'before' and 'after' dates.
+        Raises:
+            ValueError: If the 'before' date is not greater than the 'after' date.
+        """
+        if after is None:
+            after = (datetime.now() - timedelta(days=6 * 30)).strftime("%Y/%m/%d")
+        if before and before >= after:
+            raise ValueError("The 'before' date must be greater than the 'after' date.")
+        return before, after
+    def __init__(self, **data):
+        super().__init__(**data)
+        self.before, self.after = self.validate_before_after(self.before, self.after)
     max_results: Optional[int] = 10
 class ReqData(BaseModel):
     MailReqData is a data model representing the structure of a mail request.
     Attributes:
         query (str): The query or message content sent by the user.
     """
     query: EmailQuery
 class ReqFollowUp(BaseModel):

app/services/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""Package exposing the service classes of the application."""
+from services.gmail import GmailService
+__all__ = [
+    'GmailService'
+]

app/services/gmail.py ADDED Viewed

	@@ -0,0 +1,322 @@

+"""
+This module provides a utility class, `GmailService`, for interacting with the Gmail API.
+"""
+import base64
+import hashlib
+import os
+import re
+from datetime import datetime
+from venv import logger
+from google.oauth2.credentials import Credentials
+from googleapiclient.discovery import build
+from ics import Calendar
+from langchain_community.document_loaders import (
+    CSVLoader,
+    PyPDFLoader,
+    UnstructuredExcelLoader,
+    UnstructuredImageLoader,
+)
+from langchain_core.documents import Document
+from models.db import vectorstore
+SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
+EMAIL_PATTERN = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
+ATTACHMENTS_DIR = "cache"
+os.makedirs(ATTACHMENTS_DIR, exist_ok=True)
+class GmailService():
+    """
+    GmailService is a utility class for interacting with the Gmail API. It provides methods to
+    construct query strings, search for emails, and retrieve detailed email information.
+    Methods:
+        __init__(token):
+            Initializes the GmailService instance with an authenticated Gmail API service.
+        parse_query(params):
+            Constructs a query string based on the provided parameters for filtering emails.
+        collect(query):
+            Fetches the matching emails and their attachments and adds them to the vector store.
+        search(query, max_results=10, check_next_page=False):
+            Searches for emails matching an email query and returns a list of message metadata.
+        get(query, max_results=10):
+            Retrieves a list of emails with detailed information such as subject, sender,
+            recipients, and content.
+    Attributes:
+        service:
+            An authenticated Gmail API service instance used to interact with the Gmail API.
+    """
+    def __init__(self, token):
+        """
+        Initializes the Gmail controller with the provided token.
+        Args:
+            token (str): The auth token used to create credentials for accessing the Gmail API.
+        """
+        self.service = build("gmail", "v1", credentials=Credentials(token=token))
+    def parse_query(self, params) -> str:
+        """
+        Constructs a query string based on the provided parameters.
+        Args:
+            params (dict): A dictionary containing optional query parameters.
+                Supported keys include:
+                    - 'subject' (str): The subject of the email.
+                    - 'from' (str): The sender's email address.
+                    - 'to' (str): The recipient's email address.
+                    - 'cc' (str): The CC recipient's email address.
+                    - 'after' (str): A date string to filter emails sent after this date.
+                    - 'before' (str): A date string to filter emails sent before this date.
+        Returns:
+            str: A query string constructed from the provided parameters. Each parameter
+            is formatted as a key-value pair and joined by spaces. If a parameter is not
+            provided or is empty, it is excluded from the query string.
+        """
+        query_parts = []
+        if 'subject' in params and params['subject']:
+            query_parts.append(f'subject:({params["subject"]})')
+        if 'from_email' in params and params['from_email']:
+            query_parts.append(f'from:({params["from_email"]})')
+        if 'to_email' in params and params['to_email']:
+            query_parts.append(f'to:({params["to_email"]})')
+        if 'cc_email' in params and params['cc_email']:
+            query_parts.append(f'cc:({params["cc_email"]})')
+        if 'after' in params and params['after']:
+            query_parts.append(f'after:{params["after"]}')
+        if 'before' in params and params['before']:
+            query_parts.append(f'before:{params["before"]}')
+        return ' '.join(query_parts)
+    def collect(self, query):
+        """
+        Main function to search and list emails from Gmail.
+        This function builds a Gmail service, constructs a query to search for emails
+        received in the last 14 days, and lists the found emails. If no emails are found,
+        it prints a message indicating so.
+        Returns:
+            None
+        """
+        ids = []
+        documents = []
+        for message in self.search(query):
+            msg = self.service.users().messages().get(
+                userId="me", id=message["id"], format="full").execute()
+            metadata = {}
+            metadata["threadId"] = msg["threadId"]
+            metadata["msgId"] = msg["id"]
+            msg_id = f"{msg['threadId']}-{msg['id']}"
+            for header in msg["payload"]["headers"]:
+                if header["name"] == "From":
+                    metadata["from"] = header["value"]
+                elif header["name"] == "To":
+                    metadata["to"] = header["value"]
+                elif header["name"] == "Subject":
+                    metadata["subject"] = header["value"]
+                    logger.info("subject: %s", metadata["subject"])
+                elif header["name"] == "Cc":
+                    metadata["cc"] = header["value"]
+            metadata["date"] = datetime.fromtimestamp(int(msg["internalDate"]) / 1000).strftime(
+                "%d/%m/%Y %H:%M:%S"
+            )
+            metadata["userId"] = self.service.users().getProfile(
+                userId="me").execute().get("emailAddress")
+            ids = []
+            documents = []
+            mime_types = []
+            if msg["payload"]["mimeType"] in [
+                "multipart/alternative",
+                "multipart/related",
+                "multipart/mixed",
+            ]:
+                mime_types = []
+                attach_docs = []
+                for part in msg["payload"]["parts"]:
+                    mime_types.append(part["mimeType"])
+                    if part["mimeType"] == "text/plain" and "text/html" not in mime_types:
+                        body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
+                        body = re.sub(r"<[^>]+>", "", body)  # Remove HTML tags
+                        metadata["mimeType"] = part["mimeType"]
+                        documents.append(Document(page_content=body, metadata=metadata))
+                        ids.append(msg["id"])
+                    elif part["mimeType"] == "text/html" and "text/plain" not in mime_types:
+                        body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
+                        body = re.sub(r"<[^>]+>", "", body)
+                        metadata["mimeType"] = part["mimeType"]
+                        documents.append(Document(page_content=body, metadata=metadata))
+                        ids.append(msg["id"])
+                    if part["filename"]:
+                        attachment_id = part["body"]["attachmentId"]
+                        logger.info("Downloading attachment: %s", part["filename"])
+                        attachment = (
+                            self.service.users()
+                            .messages()
+                            .attachments()
+                            .get(userId="me", messageId=message["id"], id=attachment_id)
+                            .execute()
+                        )
+                        file_data = base64.urlsafe_b64decode(attachment["data"].encode("UTF-8"))
+                        path = os.path.join(".", ATTACHMENTS_DIR, part["filename"])
+                        with open(path, "wb") as f:
+                            f.write(file_data)
+                        if part["mimeType"] == "application/pdf":
+                            attach_docs = PyPDFLoader(path).load()
+                        elif part["mimeType"] == "image/png" or part["mimeType"] == "image/jpeg":
+                            try:
+                                attach_docs = UnstructuredImageLoader(path).load()
+                            except ValueError as e:  # Replace with the specific exception type
+                                logger.error("Error loading image: %s", e)
+                        elif part["filename"].endswith(".csv"):
+                            attach_docs = CSVLoader(path).load()
+                        elif (
+                            part["mimeType"]
+                            == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+                        ):
+                            attach_docs = UnstructuredExcelLoader(path).load()
+                        elif part["mimeType"] == "application/ics":
+                            with open(path, "r", encoding="utf-8") as f:
+                                calendar = Calendar(f.read())
+                            for event in calendar.events:
+                                documents.append(
+                                    Document(
+                                        page_content=f"Event: {event.name}\n\Description: {event.description}\nStart: {event.begin}\nEnd: {event.end}",
+                                        metadata={
+                                            "attachment": part["filename"],
+                                            "mimeType": part["mimeType"],
+                                            "location": event.location,
+                                            "created": event.created.strftime("%d/%m/%Y %H:%M:%S"),
+                                            "last_modified": event.last_modified.strftime(
+                                                "%d/%m/%Y %H:%M:%S"
+                                            ),
+                                            "start": event.begin.strftime("%d/%m/%Y %H:%M:%S"),
+                                            "end": event.end.strftime("%d/%m/%Y %H:%M:%S"),
+                                        },
+                                    )
+                                )
+                                ids.append(
+                                    f"{msg_id}-{part['filename']}-{hashlib.sha256(file_data).hexdigest()}")
+                        if os.path.exists(path):
+                            os.remove(path)
+                        for index, document in enumerate(attach_docs or []):
+                            document.metadata["mimeType"] = part["mimeType"]
+                            if "page_label" in document.metadata:
+                                document.metadata["page"] = document.metadata["page_label"]
+                            document.metadata["attachment"] = part["filename"]
+                            document.metadata = {
+                                key: value
+                                for key, value in document.metadata.items()
+                                if key in ["attachment", "page"]
+                            }
+                            document.metadata.update(metadata)
+                            documents.append(document)
+                            ids.append(f"{msg_id}-{hashlib.sha256(file_data).hexdigest()}-{index}")
+            elif msg["payload"]["mimeType"] == "text/plain" and "data" in msg["payload"]["body"]:
+                body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
+                body = re.sub(r"<[^>]+>", "", body)
+                metadata["mimeType"] = msg["payload"]["mimeType"]
+                documents.append(Document(page_content=body, metadata=metadata))
+                ids.append(msg_id)
+            elif msg["payload"]["mimeType"] == "text/html" and "data" in msg["payload"]["body"]:
+                body = base64.urlsafe_b64decode(msg["payload"]["body"]["data"]).decode("utf-8")
+                body = re.sub(r"<[^>]+>", "", body)
+                metadata["mimeType"] = msg["payload"]["mimeType"]
+                documents.append(Document(page_content=body, metadata=metadata))
+                ids.append(msg_id)
+            if "multipart/alternative" in mime_types and len(mime_types) == 1:
+                logger.info("Only multipart/alternative found in the email.")
+            else:
+                try:
+                    vectorstore.add_documents(documents=documents, ids=ids)
+                except ValueError as e:
+                    logger.error("Error adding documents to vectorstore: %s", e)
+    def search(self, query, max_results=10, check_next_page=False) -> list:
+        """
+        Searches for Gmail messages based on a query string.
+        Args:
+            query (str): The search query string to filter messages.
+            max_results (int, optional): The maximum number of results to retrieve per page.
+            check_next_page (bool, optional): if to fetch additional pages of results if available.
+        Returns:
+            list: A list of message metadata dict. Each dictionary contains info about a message.
+        Notes:
+            - The `query` parameter supports Gmail's advanced search operators.
+            - If `check_next_page` is True, will continue fetching messages until all are retrieved.
+        """
+        query = self.parse_query(query.dict())
+        result = self.service.users().messages().list(
+            userId='me', q=query, maxResults=max_results).execute()
+        messages = []
+        if "messages" in result:
+            messages.extend(result["messages"])
+        while "nextPageToken" in result and check_next_page:
+            page_token = result["nextPageToken"]
+            result = (
+                self.service.users().messages().list(
+                    userId="me", q=query, maxResults=max_results, pageToken=page_token).execute()
+            )
+            if "messages" in result:
+                messages.extend(result["messages"])
+        return messages
+    def get(self, query, max_results=10) -> list:
+        """
+        Retrieve a list of emails with subject, to, from, cc, and content.
+        Args:
+            mailservice: Authenticated Gmail API service instance
+            max_results: Maximum number of emails to retrieve
+        Returns:
+            List of dictionaries containing email details
+        """
+        try:
+            messages = self.search(query, max_results)
+            email_list = []
+            if not messages:
+                return email_list
+            for message in messages:
+                msg = self.service.users().messages().get(
+                    userId='me', id=message['id'], format='full').execute()
+                headers = msg['payload']['headers']
+                email_data = {
+                    'subject': '',
+                    'from': '',
+                    'to': '',
+                    'cc': '',
+                    'content': '',
+                    'snippet': msg['snippet'] if 'snippet' in msg else '',
+                }
+                for header in headers:
+                    name = header['name'].lower()
+                    if name == 'subject':
+                        email_data['subject'] = header['value']
+                    elif name == 'from':
+                        email_data['from'] = header['value']
+                    elif name == 'to':
+                        email_data['to'] = header['value']
+                    elif name == 'cc':
+                        email_data['cc'] = header['value']
+                if 'parts' in msg['payload']:
+                    for part in msg['payload']['parts']:
+                        if part['mimeType'] == 'text/plain':
+                            email_data['content'] = base64.urlsafe_b64decode(
+                                part['body']['data']).decode('utf-8')
+                            break
+                        elif part['mimeType'] == 'text/html':
+                            email_data['content'] = base64.urlsafe_b64decode(
+                                part['body']['data']).decode('utf-8')
+                            break
+                elif 'data' in msg['payload']['body']:
+                    email_data['content'] = base64.urlsafe_b64decode(
+                        msg['payload']['body']['data']).decode('utf-8')
+                email_list.append(email_data)
+            return email_list
+        except (KeyError, ValueError, TypeError) as e:
+            logger.info("An error occurred: %s", e)
+            return []