"""
contains all the functions to extract the tables, images and, text from the converted
documents.
"""
import os
import re
from typing import List

from docling.chunking import HybridChunker
from docling_core.types.doc.document import DoclingDocument
from docling_core.types.doc.labels import DocItemLabel
from langchain_core.documents import Document
from transformers import AutoTokenizer

__all__ = [
    "sanitize_name",
    "rename_items",
    "find_matching_fig_ref",
    "find_image_by_number",
    "extract_images",
    "extract_tables",
    "extract_texts",
    "find_relevant_folder",
    "extract_ref_text_ids",
]


def sanitize_name(name: str) -> str:
"""Replace '-', '_', and '–' with a single hyphen '-' and remove extra spaces.
Args:
name (str): file or folder name
Returns:
str: processed name
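
    Example (illustrative):
        >>> sanitize_name("my_file – draft")
        'my-file-draft'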
"""
# Replace -, _, – with '-'
name = re.sub(r'[-_– ]+', '-', name)
    # Collapse any remaining whitespace (defensive; the substitution above
    # already converts spaces to hyphens)
    name = re.sub(r'\s+', ' ', name).strip()
return name


def rename_items(directory: str):
"""Rename all files and folders inside the given directory.
Args:
        directory (str): path to the directory whose contents are renamed
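
    Example (illustrative):
        a file named "annual report_v1.md" is renamed to "annual-report-v1.md".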
"""
items = os.listdir(directory) # Get all files and folders inside the directory
for item in items:
old_path = os.path.join(directory, item)
new_name = sanitize_name(item) # Clean up the name
new_path = os.path.join(directory, new_name)
if old_path != new_path: # Rename only if the name changes
os.rename(old_path, new_path)
print(f"Renamed: {old_path} -> {new_path}")


def find_matching_fig_ref(doc1: dict, doc2: dict) -> str | None:
"""Check the texts ids from text chunks metadata and pictures metadata if any id
matches then returns the image id.
Args:
doc1 (dict): text chunks metadata
doc2 (dict): picture metadata
Returns:
        str | None: the figure reference (figure number) if a text id matches
            in both metadata dicts, otherwise None.
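
    Example (illustrative, using docling-style refs):
        doc1 = {"self_ref": "#/texts/10 #/texts/11", "parent_ref": "#/body"}
        doc2 = {"text_ref": "#/texts/10", "fig_ref": "#/pictures/0"}
        find_matching_fig_ref(doc1, doc2)  # -> "#/pictures/0"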
"""
# Extract and split self_ref and parent_ref into sets
doc1_self_refs = set(doc1['self_ref'].split()) # Split multiple self_refs
doc1_parent_refs = set(doc1['parent_ref'].split()) # Split multiple parent_refs
# Extract text_ref and fig_ref from doc2
doc2_text_ref = doc2['text_ref']
doc2_fig_ref = doc2['fig_ref']
# Check if text_ref exists in self_ref or parent_ref
if doc2_text_ref in doc1_self_refs or doc2_text_ref in doc1_parent_refs:
return doc2_fig_ref # Return fig_ref if there's a match
return None # No match found


def find_image_by_number(folder_path: str, img_number: int) -> str | None:
"""Search for an image with the specified number in the folder.
Args:
        folder_path (str): artifacts folder where the extracted images are stored.
        img_number (int): image number to look for.
Returns:
        str | None: full path to the matching image, or None if no match is found.
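
    Example (illustrative):
        find_image_by_number("artifacts", 7) might return
        "artifacts/image-000007-1a2b3c.png".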
"""
pattern = re.compile(rf"image-0*{img_number}-[a-fA-F0-9]+\.png") # Regex pattern
for filename in os.listdir(folder_path):
if pattern.match(filename): # Check if the filename matches the pattern
return os.path.join(folder_path, filename) # Return full path
return None # Return None if no match found


def extract_images(conv_document: DoclingDocument) -> List[Document]:
"""Extract the images from the converted document and add the metadata.
    Args:
        conv_document (DoclingDocument): converted document.
    Returns:
        List[Document]: picture placeholders with "fig_ref" and "text_ref"
            in the metadata.
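
    Example (illustrative): each returned Document looks like
        Document(page_content="",
                 metadata={"fig_ref": "#/pictures/0", "text_ref": "#/texts/42"})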
"""
pictures: list[Document] = []
for picture in conv_document.pictures:
figure_ref = picture.get_ref().cref
text_ref = picture.parent.get_ref().cref
        document = Document(
            page_content="",
            metadata={
                "fig_ref": figure_ref,
                "text_ref": text_ref,
            },
        )
pictures.append(document)
return pictures


def extract_tables(conv_document: DoclingDocument,
                   file_name: str) -> List[Document]:
"""Extract the tables from the converted document and add metadata.
Args:
        conv_document (DoclingDocument): converted document.
file_name (str): file name.
Returns:
        List[Document]: a list of documents containing table data (as markdown)
            with reference IDs in the metadata.
"""
    tables = []
    for table in conv_document.tables:
        if table.label == DocItemLabel.TABLE:
            self_refs = table.get_ref().cref
            parent_refs = table.parent.get_ref().cref if table.parent else ""
            text = table.export_to_markdown()
            # Avoid shadowing the converted document with the langchain Document
            table_doc = Document(
                page_content=text,
                metadata={
                    "source": file_name,
                    "self_ref": self_refs,
                    "parent_ref": parent_refs,
                },
            )
            tables.append(table_doc)
    return tables


def extract_texts(conv_document: DoclingDocument,
                  pictures: List[Document],
                  images_artifacts: str,
                  embeddings_tokenizer: AutoTokenizer,
                  file_name: str
                  ) -> List[Document]:
"""Extract the text data from converted document and add the image path in the
metadata.
    Args:
        conv_document (DoclingDocument): converted document.
        pictures (List[Document]): extracted pictures list.
        images_artifacts (str): artifacts path used to resolve image paths.
        embeddings_tokenizer (AutoTokenizer): tokenizer used to chunk the texts.
        file_name (str): file name.
Returns:
List[Document]: chunks with updated metadata.
"""
    texts = []
    for chunk in HybridChunker(tokenizer=embeddings_tokenizer).chunk(conv_document):
        items = chunk.meta.doc_items
        self_refs = " ".join(item.get_ref().cref for item in items)
        parent_refs = items[0].parent.get_ref().cref if items else ""
meta_data_dict = {
"source": file_name,
"self_ref": self_refs,
"parent_ref": parent_refs,
}
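        # Attach the image path for any figure whose text ref matches this
        # chunk's self/parent refs (see find_matching_fig_ref above)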
for picture in pictures:
fig_metadata = picture.metadata
fig_ref = find_matching_fig_ref(meta_data_dict, fig_metadata)
if fig_ref:
fig_number = int(fig_ref.split("/")[-1])
image_path = find_image_by_number(images_artifacts, fig_number)
meta_data_dict["fig_ref"] = image_path
meta_data_dict["fig_number"] = fig_number
text = chunk.text
        document = Document(
            page_content=text,
            metadata=meta_data_dict,
        )
texts.append(document)
return texts


def find_relevant_folder(folder_path: str) -> dict:
"""create a dict with markdown file(key) and
artfacts (value).
Args:
folder_path (str): folder path where all the converted documents are stored.
Returns:
        dict: markdown file names mapped to their artifacts folder names.
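
    Example (illustrative): a file "guide-v2.md" next to a folder
        "guide-v2-artifacts" yields {"guide-v2.md": "guide-v2-artifacts"}.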
"""
# Renaming the files and folders by removing the spaces
rename_items(folder_path)
# Initialize the dataset dictionary
dataset_dict = {}
# Get all files and folders in the directory (do this only once)
all_items = os.listdir(folder_path)
# Split files and folders in one pass
md_files = {file for file in all_items if file.endswith(".md")}
folders = {folder for folder in all_items if not folder.endswith(".md")}
# Create a dictionary of folder name splits for efficient matching
folder_splits = {tuple(folder.split("-")[:-2]): folder for folder in folders}
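    # A folder matches a file when the folder name minus its last two
    # hyphen-separated segments equals the file name minus its last one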
for file in md_files:
file_split = tuple(file.split("-")[:-1])
# Check if file_split matches any folder's split
if file_split in folder_splits:
dataset_dict[file] = folder_splits[file_split]
return dataset_dict


def extract_ref_text_ids(meta_data: dict) -> List[int]:
    """Collect the numeric ids of all '/texts/' references in the metadata.
    Args:
        meta_data (dict): metadata with optional "self_ref", "parent_ref",
            and "child_ref" fields.
    Returns:
        List[int]: the text reference ids found.
    """
all_refs = []
# Go through all 3 ref fields
for key in ["self_ref", "parent_ref", "child_ref"]:
ref_str = meta_data.get(key)
        if ref_str:
            # Refs may be comma- or space-separated (extract_texts joins them
            # with spaces), so split on both
            refs = re.split(r"[,\s]+", ref_str.strip())
            all_refs.extend(refs)
# Remove duplicates
unique_refs = set(all_refs)
# Extract /texts/ IDs as integers
text_refs = [int(ref.split("/")[2]) for ref in unique_refs if "/texts/" in ref]
    return text_refs
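

if __name__ == "__main__":
    # Illustrative wiring of the helpers above -- a sketch, not part of the
    # original pipeline. OUTPUT_DIR and EMBED_MODEL are placeholder values;
    # it assumes documents were converted with docling and that their image
    # artifacts live in sibling "<name>-artifacts" folders.
    from docling.document_converter import DocumentConverter

    OUTPUT_DIR = "converted_docs"  # hypothetical folder of converted docs
    EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # example model

    tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL)
    converter = DocumentConverter()
    for md_file, artifacts in find_relevant_folder(OUTPUT_DIR).items():
        conv = converter.convert(os.path.join(OUTPUT_DIR, md_file)).document
        pictures = extract_images(conv)
        tables = extract_tables(conv, md_file)
        texts = extract_texts(conv, pictures,
                              os.path.join(OUTPUT_DIR, artifacts),
                              tokenizer, md_file)
        print(f"{md_file}: {len(texts)} chunks, {len(tables)} tables")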