File size: 8,565 Bytes
5e433de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
"""
contains all the functions to extract the tables, images and, text from the converted 
documents.
"""

import os
import re

from typing import List

from docling.chunking import HybridChunker
from docling_core.types.doc.document import TableItem
from langchain_core.documents import Document
from docling_core.types.doc.labels import DocItemLabel

from docling_core.types.doc.document import TableItem
from transformers import AutoTokenizer
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker

__all__ = [
    "sanitize_name",
    "rename_items",
    "find_matching_fig_ref",
    "find_image_by_number",
    "extract_images",
    "extract_tables",
    "extract_texts",
    "find_relevant_folder"
]


def sanitize_name(name: str) -> str:
    """Normalize a file or folder name for matching.

    Runs of '-', '_', '–' (en dash), and plain spaces are collapsed into a
    single hyphen '-'; any remaining whitespace (tabs, newlines) is then
    collapsed into a single space and the ends are stripped.

    Args:
        name (str): file or folder name.

    Returns:
        str: sanitized name.
    """
    # Collapse runs of hyphen-like separators (including spaces) into '-'.
    name = re.sub(r'[-_– ]+', '-', name)
    # Collapse any leftover whitespace (tabs/newlines) and trim the ends.
    name = re.sub(r'\s+', ' ', name).strip()
    return name

def rename_items(directory: str):
    """Rename all files and folders inside *directory* to sanitized names.

    Each entry's name is passed through ``sanitize_name``; the entry is
    renamed only when the sanitized name differs. If the target name already
    exists the rename is skipped to avoid silently overwriting another entry
    (``os.rename`` clobbers an existing target on POSIX).

    Args:
        directory (str): path of the directory whose entries are renamed.
    """
    for item in os.listdir(directory):
        old_path = os.path.join(directory, item)
        new_path = os.path.join(directory, sanitize_name(item))

        if old_path == new_path:
            continue  # Name is already clean.
        if os.path.exists(new_path):
            # Two entries sanitize to the same name; keep both untouched.
            print(f"Skipped (target exists): {old_path} -> {new_path}")
            continue
        os.rename(old_path, new_path)
        print(f"Renamed: {old_path} -> {new_path}")

def find_matching_fig_ref(doc1:dict, doc2:dict)-> str|None:
    """Check the texts ids from text chunks metadata and pictures metadata if any id 
    matches then returns the image id.

    Args:
        doc1 (dict): text chunks metadata
        doc2 (dict): picture metadata

    Returns:
        str|None: if similar text id matched in both the metadata then returns the
        figure reference which is figure number. if no match None
    """

    # Extract and split self_ref and parent_ref into sets
    doc1_self_refs = set(doc1['self_ref'].split())  # Split multiple self_refs
    doc1_parent_refs = set(doc1['parent_ref'].split())  # Split multiple parent_refs

    # Extract text_ref and fig_ref from doc2
    doc2_text_ref = doc2['text_ref']
    doc2_fig_ref = doc2['fig_ref']

    # Check if text_ref exists in self_ref or parent_ref
    if doc2_text_ref in doc1_self_refs or doc2_text_ref in doc1_parent_refs:
        return doc2_fig_ref  # Return fig_ref if there's a match
    return None  # No match found

def find_image_by_number(folder_path: str, img_number:int)-> str|None:
    """Search for an image with the specified number in the folder.

    Args:
        folder_path (str): artifacts path where all the images were stored.
        img_number (int): image id

    Returns:
        str|None: image path 
    """
    
    pattern = re.compile(rf"image-0*{img_number}-[a-fA-F0-9]+\.png")  # Regex pattern

    for filename in os.listdir(folder_path):
        if pattern.match(filename):  # Check if the filename matches the pattern
            return os.path.join(folder_path, filename)  # Return full path

    return None  # Return None if no match found

def extract_images(conv_document: Document) -> list[Document]:
    """Extract the pictures from the converted document as metadata stubs.

    NOTE(review): ``conv_document`` is only used via ``.pictures``, which is
    a docling converted-document attribute — the langchain ``Document``
    annotation looks inherited from elsewhere; confirm the intended type.

    Args:
        conv_document (Document): converted document exposing ``.pictures``.

    Returns:
        list[Document]: one empty-content document per picture, carrying
        ``fig_ref`` (the picture's own ref) and ``text_ref`` (its parent's
        ref) in the metadata.  (Original return annotation said ``Document``;
        a list is actually returned.)
    """
    pictures: list[Document] = []
    for picture in conv_document.pictures:
        stub = Document(
            page_content="",
            metadata={
                "fig_ref": picture.get_ref().cref,
                "text_ref": picture.parent.get_ref().cref,
            },
        )
        pictures.append(stub)
    return pictures
  
def extract_tables(document: Document,
                   file_name: str) -> list[Document]:
    """Extract the tables from the converted document and add metadata.

    Args:
        document (Document): converted document exposing ``.tables``.
        file_name (str): file name stored as ``source`` in the metadata.

    Returns:
        list[Document]: one markdown-rendered document per table, with
        ``source``/``self_ref``/``parent_ref`` in the metadata.  (Original
        annotation said ``list[TableItem]``; langchain ``Document`` objects
        are actually built.)
    """
    tables: list[Document] = []
    for table in document.tables:
        # Keep only items labeled as real tables.
        if table.label != DocItemLabel.TABLE:
            continue

        # Use a distinct name — the original shadowed the `document` param.
        table_doc = Document(
            page_content=table.export_to_markdown(),
            metadata={
                "source": file_name,
                "self_ref": table.get_ref().cref,
                "parent_ref": table.parent.get_ref().cref if table.parent else "",
            },
        )
        tables.append(table_doc)
    return tables

def extract_texts(conv_document: Document,
                  pictures: List[Document],
                  images_artifacts: str,
                  embeddings_tokenizer: AutoTokenizer,
                  file_name: str
                  ) -> List[Document]:
    """Extract the text chunks from a converted document and attach the
    matching image path to each chunk's metadata.

    Args:
        conv_document (Document): converted document to chunk.
        pictures (List[Document]): picture stubs from ``extract_images``.
        images_artifacts (str): artifacts path used to resolve image paths.
        embeddings_tokenizer (AutoTokenizer): tokenizer driving the chunker.
        file_name (str): file name stored as ``source`` in the metadata.

    Returns:
        List[Document]: chunks with updated metadata.
    """
    texts: List[Document] = []
    for chunk in HybridChunker(tokenizer=embeddings_tokenizer).chunk(conv_document):
        items = chunk.meta.doc_items
        # Space-separated ref list — this is the format split() by
        # find_matching_fig_ref.
        self_refs = " ".join(item.get_ref().cref for item in items)
        parent_refs = items[0].parent.get_ref().cref if items else ""
        metadata = {
            "source": file_name,
            "self_ref": self_refs,
            "parent_ref": parent_refs,
        }

        # Attach the image path of any picture anchored to this chunk.
        # If several pictures match, the last one wins (original behavior).
        for picture in pictures:
            fig_ref = find_matching_fig_ref(metadata, picture.metadata)
            if fig_ref:
                fig_number = int(fig_ref.split("/")[-1])
                metadata["fig_ref"] = find_image_by_number(images_artifacts,
                                                           fig_number)
                metadata["fig_number"] = fig_number

        texts.append(Document(page_content=chunk.text, metadata=metadata))
    return texts




def find_relevant_folder(folder_path: str) -> dict:
    """Map each markdown file to its matching artifacts folder.

    A folder matches a markdown file when the folder name minus its last two
    '-'-separated parts equals the file name minus its last part.

    Args:
        folder_path (str): folder where all the converted documents are
            stored.

    Returns:
        dict: {markdown file name: artifacts folder name}.
    """
    # Normalize names first so files and folders split consistently.
    rename_items(folder_path)

    entries = os.listdir(folder_path)

    # Partition the directory listing in a single pass.
    markdown_files = [name for name in entries if name.endswith(".md")]

    # Key every non-markdown entry by its name minus the last two parts,
    # for O(1) matching below.
    folder_by_key = {
        tuple(name.split("-")[:-2]): name
        for name in entries
        if not name.endswith(".md")
    }

    mapping = {}
    for md_file in markdown_files:
        key = tuple(md_file.split("-")[:-1])
        if key in folder_by_key:
            mapping[md_file] = folder_by_key[key]

    return mapping


def extract_ref_text_ids(meta_data: dict) -> list[int]:
    """Collect the numeric ids of all ``/texts/`` refs in chunk metadata.

    Refs may be stored comma- or space-separated (``extract_texts`` joins
    them with spaces, while the original only split on commas), so both
    delimiters are accepted.

    Args:
        meta_data (dict): chunk metadata possibly containing ``self_ref``,
            ``parent_ref``, and ``child_ref`` strings.

    Returns:
        list[int]: sorted, de-duplicated text ids — e.g. ``[1, 5]`` for
        refs ``"#/texts/1 #/texts/5"``.
    """
    all_refs: set[str] = set()

    # Go through all 3 ref fields.
    for key in ("self_ref", "parent_ref", "child_ref"):
        ref_str = meta_data.get(key)
        if ref_str:
            # Accept both comma- and whitespace-separated ref lists.
            all_refs.update(re.split(r"[,\s]+", ref_str.strip()))

    # Sorted for a deterministic result (set iteration order is arbitrary).
    return sorted(
        int(ref.split("/")[2]) for ref in all_refs if "/texts/" in ref
    )
    return text_refs