"""Module to extract text from PDF files and images using Azure OpenAI's GPT-4o-mini model.""" import base64 import hashlib import json import os from io import BytesIO from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.document_loaders import Docx2txtLoader from langchain_core.documents import Document from pdf2image import convert_from_path from pydantic import BaseModel from pypdf import PdfReader from models.llm import client from models.db import vectorstore text_splitter = RecursiveCharacterTextSplitter() class ExtractionResult(BaseModel): """ ExtractionResult is a data model that represents the result of an extraction process. Attributes: content (str): The extracted content as a string. """ content: str def check_image(page): """ Checks if a given PDF page contains any images. This function examines the /Resources dictionary of the provided PDF page to determine if it contains any XObjects of subtype /Image. Args: page: A dictionary-like object representing a PDF page. Returns: bool: True if the page contains at least one image, False otherwise. """ # Get the /Resources dictionary resources = page.get("/Resources") if resources is None: return False # Check for /XObject in resources xobjects = resources.get("/XObject") if xobjects is None: return False # Iterate through XObjects to find images for obj in xobjects.values(): if obj.get("/Subtype") == "/Image": return True return False def extract_text_from_image(image): """ Extracts text content from an image of a document page and returns it as structured JSON. Args: image (PIL.Image.Image): The image object representing the document page. Returns: str: The extracted plain text content of the page in JSON format. Raises: Exception: If the response from the AI model is invalid or cannot be parsed. Dependencies: - Requires the `BytesIO` module for handling image byte streams. - Requires the `base64` module for encoding the image in Base64 format. - Requires a client instance capable of interacting with the GPT-4o-mini model. """ image_bytes = BytesIO() image.save(image_bytes, format="PNG") image_bytes = image_bytes.getvalue() base64_image = base64.b64encode(image_bytes).decode("utf-8") prompt = """ You are an AI assistant that extracts data from documents and returns it as structured JSON. Analyze the provided image of a document page and extract the following: - Content of the page (plain text) """ response = client.beta.chat.completions.parse( model="gpt-4o-mini", response_format = ExtractionResult, messages=[ { "role": "user", "content": [ {"type": "text", "text": prompt}, { "type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"} } ] } ] ) return json.loads(response.choices[0].message.content)["content"] def load_pdf(content: bytes, filename: str): """ Loads and processes PDF files from a specified directory. This function iterates through all PDF files in the given directory, extracts text from each page, and creates a list of Document objects containing the extracted text and metadata. If a page contains an image, the text is extracted from the image using OCR. Args: directory (str): The path to the directory containing the PDF files. Returns: list: A list of Document objects, where each object contains the page content and metadata (filename and page number). Raises: FileNotFoundError: If a specified PDF file is not found. Exception: For any other errors encountered during processing. 

    Notes:
        - The function relies on the helper functions `check_image`,
          `convert_from_path`, and `extract_text_from_image`.
        - The `Document` class is used to store the page content and metadata.
    """
    documents = []
    path = os.path.join("/tmp", filename)
    with open(path, "wb") as f:
        f.write(content)
    try:
        pdf = PdfReader(path)
        for page_num, page in enumerate(pdf.pages):
            contains_image = check_image(page)
            if contains_image:
                # Render only this page as an image and extract its text with the vision model
                images = convert_from_path(
                    path, first_page=page_num + 1, last_page=page_num + 1)
                text = extract_text_from_image(images[0])
            else:
                text = page.extract_text()
            doc = Document(
                page_content=text,
                metadata={"source": filename, "page": page_num + 1})
            documents.append(doc.model_dump())
        os.remove(path)
        return documents
    except (FileNotFoundError, ValueError, OSError) as e:
        print(f"Error: {str(e)}")
        return documents


def load_jsonl(directory):
    """
    Reads the JSONL files in a specified directory and converts their content
    into a list of Document objects.

    Args:
        directory (str): Path to the directory containing the JSONL files.

    Returns:
        list: A list of Document objects.
    """
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".jsonl"):
            file_path = os.path.join(directory, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                for line in file:
                    # Parse each line as JSON
                    json_obj = json.loads(line.strip())
                    metadata = {
                        "id": json_obj.get("id", ""),
                        "url": json_obj.get("url", ""),
                        "title": json_obj.get("title", ""),
                        "ts": json_obj.get("ts", "")
                    }
                    # HTML records carry their text Base64-encoded
                    if json_obj.get("mime") == "text/html":
                        text = base64.urlsafe_b64decode(json_obj.get("text", "")).decode("utf-8")
                    else:
                        text = json_obj.get("text", "")
                    doc = Document(page_content=text, metadata=metadata)
                    documents.append(doc)
    return documents


def load_docx(directory):
    """
    Loads and processes all .docx files from a specified directory.

    This function iterates through the files in the given directory, identifies
    files with a .docx extension, and uses the Docx2txtLoader to load and
    extract their contents. The extracted contents are aggregated into a single
    list and uploaded to the vector store.

    Args:
        directory (str): The path to the directory containing .docx files.

    Returns:
        list: A list containing the contents of all loaded .docx files.
    """
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".docx"):
            documents.extend(
                Docx2txtLoader(file_path=os.path.join(directory, filename)).load())
    upload(documents)
    return documents


def upload(docs):
    """
    Processes a list of documents, splits them into smaller chunks, updates their
    metadata, generates unique IDs for each chunk, and adds them to a vector store.

    Args:
        docs (list): A list of Document objects to be processed.

    Metadata Processing:
        - Extracts and updates the "page" metadata if "page_label" exists.
        - Updates the "attachment" metadata by removing the "{FOLDER}/" prefix from the "source".
        - Filters metadata to retain only the "attachment" and "page" keys.
        - Generates a unique "id" for each document based on the "attachment" metadata.
        - Constructs unique IDs for each document chunk, incorporating "id", "page", and chunk index.

    Operations:
        - Splits each document into smaller chunks using `text_splitter.split_documents`.
        - Appends processed document chunks and their IDs to the `documents` and `ids` lists.
        - Adds the processed documents and their IDs to the `vectorstore`.

    Raises:
        KeyError: If required metadata keys are missing during processing.
""" documents = [] ids = [] for doc in docs: for index, document in enumerate(text_splitter.split_documents([doc])): if "page_label" in document.metadata: document.metadata["page"] = int(document.metadata["page_label"]) document.metadata["attachment"] = document.metadata["source"].replace("{FOLDER}/", "") document.metadata = { key: value for key, value in document.metadata.items() if key in ["attachment", "page"] } document.metadata["id"] = str( hashlib.sha256(document.metadata['attachment'].encode()).hexdigest()) if "page" in document.metadata: ids.append(f"{document.metadata['id']}-{document.metadata['page']}-{index}") else: ids.append(f"{document.metadata['id']}-{index}") documents.append(document) vectorstore.add_documents(documents=documents, ids=ids)