"""
contains all the functions to extract the tables, images and, text from the converted
documents.
"""
import os
import re
from typing import List

from docling.chunking import HybridChunker
from docling_core.types.doc.document import DoclingDocument
from docling_core.types.doc.labels import DocItemLabel
from langchain_core.documents import Document
from transformers import AutoTokenizer

__all__ = [
    "sanitize_name",
    "rename_items",
    "find_matching_fig_ref",
    "find_image_by_number",
    "extract_images",
    "extract_tables",
    "extract_texts",
    "find_relevant_folder",
    "extract_ref_text_ids",
]


def sanitize_name(name: str) -> str:
"""Replace '-', '_', and '–' with a single hyphen '-' and remove extra spaces.
Args:
name (str): file or folder name
Returns:
str: processed name
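
    Example (illustrative):
        >>> sanitize_name("my_file – draft")
        'my-file-draft'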
"""
# Replace -, _, – with '-'
name = re.sub(r'[-_– ]+', '-', name)
    # Collapse any remaining whitespace (defensive; the substitution above
    # already converts spaces to hyphens)
    name = re.sub(r'\s+', ' ', name).strip()
return name


def rename_items(directory: str):
"""Rename all files and folders inside the given directory.
Args:
        directory (str): path to the directory whose contents are renamed
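
    Example (illustrative):
        a file named "annual report_v1.md" is renamed to "annual-report-v1.md".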
"""
items = os.listdir(directory) # Get all files and folders inside the directory
for item in items:
old_path = os.path.join(directory, item)
new_name = sanitize_name(item) # Clean up the name
new_path = os.path.join(directory, new_name)
if old_path != new_path: # Rename only if the name changes
os.rename(old_path, new_path)
print(f"Renamed: {old_path} -> {new_path}")


def find_matching_fig_ref(doc1: dict, doc2: dict) -> str | None:
"""Check the texts ids from text chunks metadata and pictures metadata if any id
matches then returns the image id.
Args:
doc1 (dict): text chunks metadata
doc2 (dict): picture metadata
Returns:
        str | None: the figure reference (figure number) if a text id matches
            in both metadata dicts, otherwise None.
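
    Example (illustrative, using docling-style refs):
        doc1 = {"self_ref": "#/texts/10 #/texts/11", "parent_ref": "#/body"}
        doc2 = {"text_ref": "#/texts/10", "fig_ref": "#/pictures/0"}
        find_matching_fig_ref(doc1, doc2)  # -> "#/pictures/0"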
"""
# Extract and split self_ref and parent_ref into sets
doc1_self_refs = set(doc1['self_ref'].split()) # Split multiple self_refs
doc1_parent_refs = set(doc1['parent_ref'].split()) # Split multiple parent_refs
# Extract text_ref and fig_ref from doc2
doc2_text_ref = doc2['text_ref']
doc2_fig_ref = doc2['fig_ref']
# Check if text_ref exists in self_ref or parent_ref
if doc2_text_ref in doc1_self_refs or doc2_text_ref in doc1_parent_refs:
return doc2_fig_ref # Return fig_ref if there's a match
return None # No match found


def find_image_by_number(folder_path: str, img_number: int) -> str | None:
"""Search for an image with the specified number in the folder.
Args:
        folder_path (str): artifacts folder where the extracted images are stored.
        img_number (int): image number to look for.
Returns:
        str | None: full path to the matching image, or None if no match is found.
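
    Example (illustrative):
        find_image_by_number("artifacts", 7) might return
        "artifacts/image-000007-1a2b3c.png".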
"""
pattern = re.compile(rf"image-0*{img_number}-[a-fA-F0-9]+\.png") # Regex pattern
for filename in os.listdir(folder_path):
if pattern.match(filename): # Check if the filename matches the pattern
return os.path.join(folder_path, filename) # Return full path
return None # Return None if no match found


def extract_images(conv_document: DoclingDocument) -> List[Document]:
"""Extract the images from the converted document and add the metadata.
    Args:
        conv_document (DoclingDocument): converted document.
    Returns:
        List[Document]: picture placeholders with "fig_ref" and "text_ref"
            in the metadata.
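
    Example (illustrative): each returned Document looks like
        Document(page_content="",
                 metadata={"fig_ref": "#/pictures/0", "text_ref": "#/texts/42"})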
"""
pictures: list[Document] = []
for picture in conv_document.pictures:
figure_ref = picture.get_ref().cref
text_ref = picture.parent.get_ref().cref
        document = Document(
            page_content="",
            metadata={
                "fig_ref": figure_ref,
                "text_ref": text_ref,
            },
        )
pictures.append(document)
return pictures


def extract_tables(conv_document: DoclingDocument,
                   file_name: str) -> List[Document]:
"""Extract the tables from the converted document and add metadata.
Args:
        conv_document (DoclingDocument): converted document.
file_name (str): file name.
Returns:
        List[Document]: a list of documents containing table data (as markdown)
            with reference IDs in the metadata.
"""
    tables = []
    for table in conv_document.tables:
        if table.label == DocItemLabel.TABLE:
            self_refs = table.get_ref().cref
            parent_refs = table.parent.get_ref().cref if table.parent else ""
            text = table.export_to_markdown()
            # Avoid shadowing the converted document with the langchain Document
            table_doc = Document(
                page_content=text,
                metadata={
                    "source": file_name,
                    "self_ref": self_refs,
                    "parent_ref": parent_refs,
                },
            )
            tables.append(table_doc)
    return tables


def extract_texts(conv_document: DoclingDocument,
                  pictures: List[Document],
                  images_artifacts: str,
                  embeddings_tokenizer: AutoTokenizer,
                  file_name: str
                  ) -> List[Document]:
"""Extract the text data from converted document and add the image path in the
metadata.
    Args:
        conv_document (DoclingDocument): converted document.
        pictures (List[Document]): extracted pictures list.
        images_artifacts (str): artifacts path used to resolve image paths.
        embeddings_tokenizer (AutoTokenizer): tokenizer used to chunk the texts.
        file_name (str): file name.
Returns:
List[Document]: chunks with updated metadata.
"""
    texts = []
    for chunk in HybridChunker(tokenizer=embeddings_tokenizer).chunk(conv_document):
        items = chunk.meta.doc_items
        self_refs = " ".join(item.get_ref().cref for item in items)
        parent_refs = items[0].parent.get_ref().cref if items else ""
meta_data_dict = {
"source": file_name,
"self_ref": self_refs,
"parent_ref": parent_refs,
}
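        # Attach the image path for any figure whose text ref matches this
        # chunk's self/parent refs (see find_matching_fig_ref above)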
for picture in pictures:
fig_metadata = picture.metadata
fig_ref = find_matching_fig_ref(meta_data_dict, fig_metadata)
if fig_ref:
fig_number = int(fig_ref.split("/")[-1])
image_path = find_image_by_number(images_artifacts, fig_number)
meta_data_dict["fig_ref"] = image_path
meta_data_dict["fig_number"] = fig_number
text = chunk.text
        document = Document(
            page_content=text,
            metadata=meta_data_dict,
        )
texts.append(document)
return texts


def find_relevant_folder(folder_path: str) -> dict:
"""create a dict with markdown file(key) and
artfacts (value).
Args:
folder_path (str): folder path where all the converted documents are stored.
Returns:
        dict: markdown file names mapped to their artifacts folder names.
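
    Example (illustrative): a file "guide-v2.md" next to a folder
        "guide-v2-artifacts" yields {"guide-v2.md": "guide-v2-artifacts"}.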
"""
# Renaming the files and folders by removing the spaces
rename_items(folder_path)
# Initialize the dataset dictionary
dataset_dict = {}
# Get all files and folders in the directory (do this only once)
all_items = os.listdir(folder_path)
# Split files and folders in one pass
md_files = {file for file in all_items if file.endswith(".md")}
folders = {folder for folder in all_items if not folder.endswith(".md")}
# Create a dictionary of folder name splits for efficient matching
folder_splits = {tuple(folder.split("-")[:-2]): folder for folder in folders}
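    # A folder matches a file when the folder name minus its last two
    # hyphen-separated segments equals the file name minus its last one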
for file in md_files:
file_split = tuple(file.split("-")[:-1])
# Check if file_split matches any folder's split
if file_split in folder_splits:
dataset_dict[file] = folder_splits[file_split]
return dataset_dict


def extract_ref_text_ids(meta_data: dict) -> List[int]:
    """Collect the numeric ids of all '/texts/' references in the metadata.
    Args:
        meta_data (dict): metadata with optional "self_ref", "parent_ref",
            and "child_ref" fields.
    Returns:
        List[int]: the text reference ids found.
    """
all_refs = []
# Go through all 3 ref fields
for key in ["self_ref", "parent_ref", "child_ref"]:
ref_str = meta_data.get(key)
        if ref_str:
            # Refs may be comma- or space-separated (extract_texts joins them
            # with spaces), so split on both
            refs = re.split(r"[,\s]+", ref_str.strip())
            all_refs.extend(refs)
# Remove duplicates
unique_refs = set(all_refs)
# Extract /texts/ IDs as integers
text_refs = [int(ref.split("/")[2]) for ref in unique_refs if "/texts/" in ref]
    return text_refs
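

if __name__ == "__main__":
    # Illustrative wiring of the helpers above -- a sketch, not part of the
    # original pipeline. OUTPUT_DIR and EMBED_MODEL are placeholder values;
    # it assumes documents were converted with docling and that their image
    # artifacts live in sibling "<name>-artifacts" folders.
    from docling.document_converter import DocumentConverter

    OUTPUT_DIR = "converted_docs"  # hypothetical folder of converted docs
    EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # example model

    tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL)
    converter = DocumentConverter()
    for md_file, artifacts in find_relevant_folder(OUTPUT_DIR).items():
        conv = converter.convert(os.path.join(OUTPUT_DIR, md_file)).document
        pictures = extract_images(conv)
        tables = extract_tables(conv, md_file)
        texts = extract_texts(conv, pictures,
                              os.path.join(OUTPUT_DIR, artifacts),
                              tokenizer, md_file)
        print(f"{md_file}: {len(texts)} chunks, {len(tables)} tables")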