""" contains all the functions to extract the tables, images and, text from the converted documents. """ import os import re from typing import List from docling.chunking import HybridChunker from docling_core.types.doc.document import TableItem from langchain_core.documents import Document from docling_core.types.doc.labels import DocItemLabel from docling_core.types.doc.document import TableItem from transformers import AutoTokenizer from docling_core.transforms.chunker.hybrid_chunker import HybridChunker __all__ = [ "sanitize_name", "rename_items", "find_matching_fig_ref", "find_image_by_number", "extract_images", "extract_tables", "extract_texts", "find_relevant_folder" ] def sanitize_name(name:str)-> str: """Replace '-', '_', and '–' with a single hyphen '-' and remove extra spaces. Args: name (str): file or folder name Returns: str: processed name """ # Replace -, _, – with '-' name = re.sub(r'[-_– ]+', '-', name) # Replace multiple spaces with a single space name = re.sub(r'\s+', ' ', name).strip() return name def rename_items(directory:str): """Rename all files and folders inside the given directory. Args: directory (str): file or folder name """ items = os.listdir(directory) # Get all files and folders inside the directory for item in items: old_path = os.path.join(directory, item) new_name = sanitize_name(item) # Clean up the name new_path = os.path.join(directory, new_name) if old_path != new_path: # Rename only if the name changes os.rename(old_path, new_path) print(f"Renamed: {old_path} -> {new_path}") def find_matching_fig_ref(doc1:dict, doc2:dict)-> str|None: """Check the texts ids from text chunks metadata and pictures metadata if any id matches then returns the image id. Args: doc1 (dict): text chunks metadata doc2 (dict): picture metadata Returns: str|None: if similar text id matched in both the metadata then returns the figure reference which is figure number. if no match None """ # Extract and split self_ref and parent_ref into sets doc1_self_refs = set(doc1['self_ref'].split()) # Split multiple self_refs doc1_parent_refs = set(doc1['parent_ref'].split()) # Split multiple parent_refs # Extract text_ref and fig_ref from doc2 doc2_text_ref = doc2['text_ref'] doc2_fig_ref = doc2['fig_ref'] # Check if text_ref exists in self_ref or parent_ref if doc2_text_ref in doc1_self_refs or doc2_text_ref in doc1_parent_refs: return doc2_fig_ref # Return fig_ref if there's a match return None # No match found def find_image_by_number(folder_path: str, img_number:int)-> str|None: """Search for an image with the specified number in the folder. Args: folder_path (str): artifacts path where all the images were stored. img_number (int): image id Returns: str|None: image path """ pattern = re.compile(rf"image-0*{img_number}-[a-fA-F0-9]+\.png") # Regex pattern for filename in os.listdir(folder_path): if pattern.match(filename): # Check if the filename matches the pattern return os.path.join(folder_path, filename) # Return full path return None # Return None if no match found def extract_images(conv_document: Document) -> Document: """Extract the images from the converted document and add the metadata. Args: conv_document (Document): converted document Returns: Document: pictures with the metadata. 
""" pictures: list[Document] = [] for picture in conv_document.pictures: figure_ref = picture.get_ref().cref text_ref = picture.parent.get_ref().cref document = Document( page_content="", metadata={ "fig_ref": figure_ref, "text_ref": text_ref, },) pictures.append(document) return pictures def extract_tables(document: Document, file_name: str) -> list[TableItem]: """Extract the tables from the converted document and add metadata. Args: document (Document): converted document. file_name (str): file name. Returns: list[TableItem]: A list of documents containing table data with reference IDs in the metadata. """ tables = [] for table in document.tables: if table.label in [DocItemLabel.TABLE]: self_refs = table.get_ref().cref parent_refs = table.parent.get_ref().cref if table.parent else "" text = table.export_to_markdown() document = Document( page_content=text, metadata={ "source": file_name, "self_ref": self_refs, "parent_ref": parent_refs, }, ) tables.append(document) return tables def extract_texts(conv_document: Document, pictures:List[Document], images_artifacts: str, embeddings_tokenizer: AutoTokenizer, file_name: str )-> List[Document]: """Extract the text data from converted document and add the image path in the metadata. Args: conv_document (Document): converted document. pictures (List[Document]): extracted pictures list. images_artifacts (str): artifacts path to extact image path. embeddings_tokenizer (AutoTokenizer): tokenizer to chunk the texts. file_name (str): file name. Returns: List[Document]: chunks with updated metadata. """ texts = [] doc_id = 0 for chunk in HybridChunker(tokenizer=embeddings_tokenizer).chunk(conv_document): items = chunk.meta.doc_items self_refs = " ".join(map(lambda item: item.get_ref().cref, items)) parent_refs = items[0].parent.get_ref().cref if len(items) > 0 else "" meta_data_dict = { "source": file_name, "self_ref": self_refs, "parent_ref": parent_refs, } for picture in pictures: fig_metadata = picture.metadata fig_ref = find_matching_fig_ref(meta_data_dict, fig_metadata) if fig_ref: fig_number = int(fig_ref.split("/")[-1]) image_path = find_image_by_number(images_artifacts, fig_number) meta_data_dict["fig_ref"] = image_path meta_data_dict["fig_number"] = fig_number text = chunk.text document = Document( page_content=text, metadata= meta_data_dict, ) texts.append(document) return texts def find_relevant_folder(folder_path:str)->dict: """create a dict with markdown file(key) and artfacts (value). Args: folder_path (str): folder path where all the converted documents are stored. 
def find_relevant_folder(folder_path: str) -> dict:
    """Create a dict mapping each markdown file (key) to its artifacts folder (value).

    Args:
        folder_path (str): folder path where all the converted documents are stored.

    Returns:
        dict: markdown file names mapped to their artifacts folders.
    """
    # Rename the files and folders to normalise spaces and separators
    rename_items(folder_path)

    # Initialize the dataset dictionary
    dataset_dict = {}

    # Get all files and folders in the directory (do this only once)
    all_items = os.listdir(folder_path)

    # Split files and folders in one pass
    md_files = {file for file in all_items if file.endswith(".md")}
    folders = {folder for folder in all_items if not folder.endswith(".md")}

    # Create a dictionary of folder name splits for efficient matching
    folder_splits = {tuple(folder.split("-")[:-2]): folder for folder in folders}

    for file in md_files:
        file_split = tuple(file.split("-")[:-1])
        # Check if file_split matches any folder's split
        if file_split in folder_splits:
            dataset_dict[file] = folder_splits[file_split]

    return dataset_dict


def extract_ref_text_ids(meta_data: dict) -> list[int]:
    """Collect the numeric ids of all '/texts/' references found in a chunk's metadata.

    Args:
        meta_data (dict): chunk metadata holding 'self_ref', 'parent_ref', and
            optionally 'child_ref' fields with space-separated cref strings.

    Returns:
        list[int]: the ids of every unique '/texts/' reference.
    """
    all_refs = []
    # Go through all 3 ref fields
    for key in ["self_ref", "parent_ref", "child_ref"]:
        ref_str = meta_data.get(key)
        if ref_str:
            refs = ref_str.split()  # refs are space-separated (see extract_texts)
            all_refs.extend(refs)

    # Remove duplicates
    unique_refs = set(all_refs)

    # Extract /texts/ IDs as integers (cref format: "#/texts/<id>")
    text_refs = [int(ref.split("/")[2]) for ref in unique_refs if "/texts/" in ref]
    return text_refs
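
if __name__ == "__main__":
    # Minimal end-to-end sketch. The input path, artifacts folder, output file
    # name, and tokenizer checkpoint below are placeholders; substitute the
    # values your conversion pipeline actually produces.
    from docling.document_converter import DocumentConverter

    result = DocumentConverter().convert("reports/example.pdf")  # hypothetical input
    conv_doc = result.document

    tokenizer = AutoTokenizer.from_pretrained(
        "sentence-transformers/all-MiniLM-L6-v2"  # example embedding tokenizer
    )

    pictures = extract_images(conv_doc)
    tables = extract_tables(conv_doc, "example.md")
    texts = extract_texts(
        conv_doc,
        pictures,
        images_artifacts="converted/example-artifacts",  # hypothetical folder
        embeddings_tokenizer=tokenizer,
        file_name="example.md",
    )
    print(f"{len(texts)} chunks, {len(tables)} tables, {len(pictures)} pictures")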