"""
contains all the functions to extract the tables, images and, text from the converted 
documents.
"""

import os
import re

from typing import List

from docling.chunking import HybridChunker
from docling_core.types.doc.document import TableItem
from langchain_core.documents import Document
from docling_core.types.doc.labels import DocItemLabel

from docling_core.types.doc.document import TableItem
from transformers import AutoTokenizer
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker

__all__ = [
    "sanitize_name",
    "rename_items",
    "find_matching_fig_ref",
    "find_image_by_number",
    "extract_images",
    "extract_tables",
    "extract_texts",
    "find_relevant_folder"
]


def sanitize_name(name:str)-> str:
    """Replace '-', '_', and '–' with a single hyphen '-' and remove extra spaces.

    Args:
        name (str): file or folder name

    Returns:
        str: processed name
    """
     # Replace -, _, – with '-'
    name = re.sub(r'[-_– ]+', '-', name) 
    # Replace multiple spaces with a single space
    name = re.sub(r'\s+', ' ', name).strip()  
    return name

def rename_items(directory:str):
    """Rename all files and folders inside the given directory.

    Args:
        directory (str): file or folder name
    """
    items = os.listdir(directory)  # Get all files and folders inside the directory
    for item in items:
        old_path = os.path.join(directory, item)
        new_name = sanitize_name(item)  # Clean up the name
        new_path = os.path.join(directory, new_name)

        if old_path != new_path:  # Rename only if the name changes
            os.rename(old_path, new_path)
            print(f"Renamed: {old_path} -> {new_path}")

def find_matching_fig_ref(doc1:dict, doc2:dict)-> str|None:
    """Check the texts ids from text chunks metadata and pictures metadata if any id 
    matches then returns the image id.

    Args:
        doc1 (dict): text chunks metadata
        doc2 (dict): picture metadata

    Returns:
        str|None: if similar text id matched in both the metadata then returns the
        figure reference which is figure number. if no match None
    """

    # Extract and split self_ref and parent_ref into sets
    doc1_self_refs = set(doc1['self_ref'].split())  # Split multiple self_refs
    doc1_parent_refs = set(doc1['parent_ref'].split())  # Split multiple parent_refs

    # Extract text_ref and fig_ref from doc2
    doc2_text_ref = doc2['text_ref']
    doc2_fig_ref = doc2['fig_ref']

    # Check if text_ref exists in self_ref or parent_ref
    if doc2_text_ref in doc1_self_refs or doc2_text_ref in doc1_parent_refs:
        return doc2_fig_ref  # Return fig_ref if there's a match
    return None  # No match found

def find_image_by_number(folder_path: str, img_number:int)-> str|None:
    """Search for an image with the specified number in the folder.

    Args:
        folder_path (str): artifacts path where all the images were stored.
        img_number (int): image id

    Returns:
        str|None: image path 
    """
    
    pattern = re.compile(rf"image-0*{img_number}-[a-fA-F0-9]+\.png")  # Regex pattern

    for filename in os.listdir(folder_path):
        if pattern.match(filename):  # Check if the filename matches the pattern
            return os.path.join(folder_path, filename)  # Return full path

    return None  # Return None if no match found

def extract_images(conv_document: Document) -> Document:
    """Extract the images from the converted document and add the metadata.

    Args:
        conv_document (Document): converted document 

    Returns:
        Document: pictures with the metadata.
    """

    pictures: list[Document] = []
    for picture in conv_document.pictures:
        figure_ref = picture.get_ref().cref
        text_ref = picture.parent.get_ref().cref
        document = Document(
                page_content="",
                metadata={
                    "fig_ref": figure_ref,
                    "text_ref": text_ref,
                },)
        pictures.append(document)
    return pictures
  
def extract_tables(document: Document,
                   file_name: str) -> list[TableItem]:
    """Extract the tables from the converted document and add metadata.

    Args:
        document (Document): converted document.
        file_name (str): file name.

    Returns:
        list[TableItem]: A list of documents containing table data with 
        reference IDs in the metadata.
    """
    tables = []
    for table in document.tables:
        if table.label in [DocItemLabel.TABLE]:

            self_refs = table.get_ref().cref
            parent_refs = table.parent.get_ref().cref if table.parent else ""

            text = table.export_to_markdown()
            document = Document(
                page_content=text,
                metadata={
                    "source": file_name,
                    "self_ref": self_refs,
                    "parent_ref": parent_refs,

                },
            )
            tables.append(document)
    return tables

def extract_texts(conv_document: Document, 
                  pictures:List[Document], 
                  images_artifacts: str, 
                  embeddings_tokenizer: AutoTokenizer,
                  file_name: str
                  )-> List[Document]:
    """Extract the text data from converted document and add the image path in the
       metadata.

    Args:
        conv_document (Document): converted document.
        pictures (List[Document]): extracted pictures list.
        images_artifacts (str): artifacts path to extact image path.
        embeddings_tokenizer (AutoTokenizer): tokenizer to chunk the texts.
        file_name (str): file name.

    Returns:
        List[Document]: chunks with updated metadata.
    """
    texts = []
    doc_id = 0
    for chunk in HybridChunker(tokenizer=embeddings_tokenizer).chunk(conv_document):
        items = chunk.meta.doc_items
        self_refs = " ".join(map(lambda item: item.get_ref().cref, items))
        parent_refs = items[0].parent.get_ref().cref if len(items) > 0 else ""
        meta_data_dict = {
            "source": file_name,
            "self_ref": self_refs,
            "parent_ref": parent_refs,
        }

        for picture in pictures:
            fig_metadata = picture.metadata
            fig_ref = find_matching_fig_ref(meta_data_dict, fig_metadata)
            if fig_ref:
                fig_number = int(fig_ref.split("/")[-1])
                image_path = find_image_by_number(images_artifacts, fig_number)
                meta_data_dict["fig_ref"] = image_path
                meta_data_dict["fig_number"] = fig_number

        text = chunk.text
        document = Document(
                page_content=text,
                metadata= meta_data_dict,
            )
        texts.append(document)
    return texts


def find_relevant_folder(folder_path:str)->dict:
    """create a dict with markdown file(key) and 
       artfacts (value).

    Args:
        folder_path (str): folder path where all the converted documents are stored.

    Returns:
        dict: dict with file with artifacts folder
    """
    # Renaming the files and folders by removing the spaces
    rename_items(folder_path)

    # Initialize the dataset dictionary
    dataset_dict = {}

    # Get all files and folders in the directory (do this only once)
    all_items = os.listdir(folder_path)

    # Split files and folders in one pass
    md_files = {file for file in all_items if file.endswith(".md")}
    folders = {folder for folder in all_items if not folder.endswith(".md")}

    # Create a dictionary of folder name splits for efficient matching
    folder_splits = {tuple(folder.split("-")[:-2]): folder for folder in folders}

    for file in md_files:
        file_split = tuple(file.split("-")[:-1])

        # Check if file_split matches any folder's split
        if file_split in folder_splits:
            dataset_dict[file] = folder_splits[file_split]

    return dataset_dict


def extract_ref_text_ids(meta_data):
    all_refs = []

    # Go through all 3 ref fields
    for key in ["self_ref", "parent_ref", "child_ref"]:
        ref_str = meta_data.get(key)
        if ref_str:
            refs = ref_str.split(",")  # split in case of multiple refs
            all_refs.extend(refs)

    # Remove duplicates
    unique_refs = set(all_refs)

    # Extract /texts/ IDs as integers
    text_refs = [int(ref.split("/")[2]) for ref in unique_refs if "/texts/" in ref]

    return text_refs