# NOTE(review): the three lines below look like accidentally pasted VCS/commit
# metadata (author, message, short hash). They were bare statements that made
# the module a syntax error; kept here commented out for the record.
# kap2403
# "added files"
# 5e433de
"""
contains all the functions to extract the tables, images and, text from the converted
documents.
"""
import os
import re
from typing import List
from docling.chunking import HybridChunker
from docling_core.types.doc.document import TableItem
from langchain_core.documents import Document
from docling_core.types.doc.labels import DocItemLabel
from docling_core.types.doc.document import TableItem
from transformers import AutoTokenizer
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
# Public API of this module. `extract_ref_text_ids` is defined below but was
# missing from the original list, so `from module import *` silently hid it.
__all__ = [
    "sanitize_name",
    "rename_items",
    "find_matching_fig_ref",
    "find_image_by_number",
    "extract_images",
    "extract_tables",
    "extract_texts",
    "find_relevant_folder",
    "extract_ref_text_ids",
]
def sanitize_name(name: str) -> str:
    """Normalize a file or folder name.

    Runs of '-', '_', en-dash '–' and plain spaces collapse to a single
    hyphen; any remaining whitespace (tabs, newlines) collapses to a single
    space and the result is stripped.

    Args:
        name (str): file or folder name.

    Returns:
        str: the normalized name.
    """
    # Collapse separator runs (hyphen, underscore, en-dash, space) to one '-'.
    hyphenated = re.sub(r'[-_– ]+', '-', name)
    # Plain spaces are already gone; this tidies tabs/newlines and trims ends.
    return re.sub(r'\s+', ' ', hyphenated).strip()
def rename_items(directory: str):
    """Rename every entry inside *directory* to its sanitized form.

    Entries whose name is already clean are left untouched; each actual
    rename is reported on stdout.

    Args:
        directory (str): path of the directory whose contents are renamed.
    """
    for entry in os.listdir(directory):
        cleaned = sanitize_name(entry)
        if cleaned == entry:
            # Name is already in canonical form — skip the rename syscall.
            continue
        source = os.path.join(directory, entry)
        target = os.path.join(directory, cleaned)
        os.rename(source, target)
        print(f"Renamed: {source} -> {target}")
def find_matching_fig_ref(doc1:dict, doc2:dict)-> str|None:
"""Check the texts ids from text chunks metadata and pictures metadata if any id
matches then returns the image id.
Args:
doc1 (dict): text chunks metadata
doc2 (dict): picture metadata
Returns:
str|None: if similar text id matched in both the metadata then returns the
figure reference which is figure number. if no match None
"""
# Extract and split self_ref and parent_ref into sets
doc1_self_refs = set(doc1['self_ref'].split()) # Split multiple self_refs
doc1_parent_refs = set(doc1['parent_ref'].split()) # Split multiple parent_refs
# Extract text_ref and fig_ref from doc2
doc2_text_ref = doc2['text_ref']
doc2_fig_ref = doc2['fig_ref']
# Check if text_ref exists in self_ref or parent_ref
if doc2_text_ref in doc1_self_refs or doc2_text_ref in doc1_parent_refs:
return doc2_fig_ref # Return fig_ref if there's a match
return None # No match found
def find_image_by_number(folder_path: str, img_number:int)-> str|None:
"""Search for an image with the specified number in the folder.
Args:
folder_path (str): artifacts path where all the images were stored.
img_number (int): image id
Returns:
str|None: image path
"""
pattern = re.compile(rf"image-0*{img_number}-[a-fA-F0-9]+\.png") # Regex pattern
for filename in os.listdir(folder_path):
if pattern.match(filename): # Check if the filename matches the pattern
return os.path.join(folder_path, filename) # Return full path
return None # Return None if no match found
def extract_images(conv_document: Document) -> list[Document]:
    """Wrap every picture of a converted document in a metadata-only Document.

    Args:
        conv_document (Document): converted document — NOTE(review): this is
            actually a docling converted document (it exposes ``.pictures``),
            not a langchain ``Document``; the annotation looks wrong, verify
            against callers.

    Returns:
        list[Document]: one empty-content Document per picture, carrying the
        picture's own ref ('fig_ref') and its parent text ref ('text_ref')
        in the metadata. (Original annotation said ``Document`` but the
        function has always returned a list.)
    """
    pictures: list[Document] = []
    for picture in conv_document.pictures:
        figure_ref = picture.get_ref().cref
        text_ref = picture.parent.get_ref().cref
        pictures.append(
            Document(
                page_content="",
                metadata={
                    "fig_ref": figure_ref,
                    "text_ref": text_ref,
                },
            )
        )
    return pictures
def extract_tables(document: Document,
                   file_name: str) -> list[Document]:
    """Extract the tables from the converted document and add metadata.

    Args:
        document (Document): converted document — NOTE(review): actually a
            docling converted document (exposes ``.tables``); verify the hint.
        file_name (str): file name recorded as the 'source' metadata.

    Returns:
        list[Document]: one Document per table with its markdown rendering as
        content and reference IDs in the metadata. (Original annotation said
        ``list[TableItem]`` but the function builds langchain Documents.)
    """
    tables: list[Document] = []
    for table in document.tables:
        # Keep only genuine tables; other table-like labels are skipped.
        if table.label == DocItemLabel.TABLE:
            self_refs = table.get_ref().cref
            parent_refs = table.parent.get_ref().cref if table.parent else ""
            text = table.export_to_markdown()
            # Distinct name: the original rebound `document`, shadowing the
            # parameter inside the loop.
            table_doc = Document(
                page_content=text,
                metadata={
                    "source": file_name,
                    "self_ref": self_refs,
                    "parent_ref": parent_refs,
                },
            )
            tables.append(table_doc)
    return tables
def extract_texts(conv_document: Document,
                  pictures: List[Document],
                  images_artifacts: str,
                  embeddings_tokenizer: AutoTokenizer,
                  file_name: str
                  ) -> List[Document]:
    """Chunk the converted document and attach image paths to the metadata.

    Each chunk is matched against the extracted pictures; when a picture
    belongs to the chunk, the resolved image path and figure number are added
    to the chunk's metadata. If several pictures match one chunk, the last
    match wins (original behavior, preserved).

    Args:
        conv_document (Document): converted document — NOTE(review): actually
            a docling converted document; verify the hint.
        pictures (List[Document]): extracted pictures list (see
            ``extract_images``).
        images_artifacts (str): artifacts path used to resolve image paths.
        embeddings_tokenizer (AutoTokenizer): tokenizer used by the chunker.
        file_name (str): file name recorded as the 'source' metadata.

    Returns:
        List[Document]: chunks with updated metadata.
    """
    texts: List[Document] = []
    chunker = HybridChunker(tokenizer=embeddings_tokenizer)
    for chunk in chunker.chunk(conv_document):
        items = chunk.meta.doc_items
        # Space-separated list of every ref the chunk spans.
        self_refs = " ".join(item.get_ref().cref for item in items)
        parent_refs = items[0].parent.get_ref().cref if items else ""
        meta_data_dict = {
            "source": file_name,
            "self_ref": self_refs,
            "parent_ref": parent_refs,
        }
        for picture in pictures:
            fig_ref = find_matching_fig_ref(meta_data_dict, picture.metadata)
            if fig_ref:
                # fig_ref looks like ".../<number>"; resolve it to a file path.
                fig_number = int(fig_ref.split("/")[-1])
                meta_data_dict["fig_ref"] = find_image_by_number(
                    images_artifacts, fig_number)
                meta_data_dict["fig_number"] = fig_number
        texts.append(Document(
            page_content=chunk.text,
            metadata=meta_data_dict,
        ))
    return texts
def find_relevant_folder(folder_path: str) -> dict:
    """Map each markdown file to its artifacts folder.

    Entries are first renamed to their sanitized form; a markdown file and a
    folder are paired when the file name minus its last '-' segment equals
    the folder name minus its last two '-' segments.

    Args:
        folder_path (str): folder path where all the converted documents are
            stored.

    Returns:
        dict: markdown file name -> artifacts folder name.
    """
    # Normalize names once so the '-'-based matching below is reliable.
    rename_items(folder_path)
    entries = os.listdir(folder_path)
    markdown_files = [e for e in entries if e.endswith(".md")]
    artifact_dirs = [e for e in entries if not e.endswith(".md")]
    # Key each folder by its name without the trailing two '-' segments.
    split_to_folder = {tuple(d.split("-")[:-2]): d for d in artifact_dirs}
    mapping = {}
    for md_file in markdown_files:
        key = tuple(md_file.split("-")[:-1])
        if key in split_to_folder:
            mapping[md_file] = split_to_folder[key]
    return mapping
def extract_ref_text_ids(meta_data: dict) -> list[int]:
    """Collect the numeric IDs of every '/texts/' reference in chunk metadata.

    Args:
        meta_data (dict): chunk metadata that may contain 'self_ref',
            'parent_ref' and 'child_ref' entries, each a string holding one
            or more refs like '#/texts/5'.

    Returns:
        list[int]: de-duplicated text IDs (iteration order of the underlying
        set — not guaranteed).
    """
    all_refs = []
    for key in ["self_ref", "parent_ref", "child_ref"]:
        ref_str = meta_data.get(key)
        if ref_str:
            # extract_texts joins multiple refs with spaces, while the
            # original code here split on commas only; accept either
            # separator (backward compatible with comma-joined input).
            all_refs.extend(re.split(r"[,\s]+", ref_str.strip()))
    # Remove duplicates
    unique_refs = set(all_refs)
    # Keep only '/texts/' refs and extract their numeric ID.
    text_refs = [int(ref.split("/")[2]) for ref in unique_refs if "/texts/" in ref]
    return text_refs