""" | |
contains all the functions to extract the tables, images and, text from the converted | |
documents. | |
""" | |
import os
import re
from typing import List

from docling.chunking import HybridChunker
from docling_core.types.doc.document import DoclingDocument
from docling_core.types.doc.labels import DocItemLabel
from langchain_core.documents import Document
from transformers import AutoTokenizer

__all__ = [
    "sanitize_name",
    "rename_items",
    "find_matching_fig_ref",
    "find_image_by_number",
    "extract_images",
    "extract_tables",
    "extract_texts",
    "find_relevant_folder",
    "extract_ref_text_ids",
]


def sanitize_name(name: str) -> str:
    """Collapse runs of '-', '_', '–', and spaces into a single hyphen '-',
    then collapse any remaining whitespace.

    Args:
        name (str): file or folder name

    Returns:
        str: processed name
    """
    # Replace runs of -, _, –, and spaces with a single '-'
    name = re.sub(r'[-_– ]+', '-', name)
    # Collapse any remaining whitespace (tabs, newlines) and trim the ends
    name = re.sub(r'\s+', ' ', name).strip()
    return name
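
# Illustrative examples (hypothetical names, not part of the module's API):
#   sanitize_name("My _ Report – v2")  -> "My-Report-v2"
#   sanitize_name("data__set--final")  -> "data-set-final"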


def rename_items(directory: str):
    """Rename all files and folders inside the given directory.

    Args:
        directory (str): path to the directory whose entries are renamed
    """
    items = os.listdir(directory)  # Get all files and folders inside the directory
    for item in items:
        old_path = os.path.join(directory, item)
        new_name = sanitize_name(item)  # Clean up the name
        new_path = os.path.join(directory, new_name)
        if old_path != new_path:  # Rename only if the name changes
            os.rename(old_path, new_path)
            print(f"Renamed: {old_path} -> {new_path}")


def find_matching_fig_ref(doc1: dict, doc2: dict) -> str | None:
    """Compare the text ids in a text chunk's metadata with a picture's metadata;
    if any id matches, return the picture's figure reference.

    Args:
        doc1 (dict): text chunk metadata
        doc2 (dict): picture metadata

    Returns:
        str|None: the figure reference (figure number) if a text id appears in
            both metadata dicts; None if there is no match.
    """
    # Extract and split self_ref and parent_ref into sets
    doc1_self_refs = set(doc1['self_ref'].split())      # Split multiple self_refs
    doc1_parent_refs = set(doc1['parent_ref'].split())  # Split multiple parent_refs

    # Extract text_ref and fig_ref from doc2
    doc2_text_ref = doc2['text_ref']
    doc2_fig_ref = doc2['fig_ref']

    # Check if text_ref exists in self_ref or parent_ref
    if doc2_text_ref in doc1_self_refs or doc2_text_ref in doc1_parent_refs:
        return doc2_fig_ref  # Return fig_ref if there's a match
    return None  # No match found
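
# Illustrative example, using docling-style "#/..." cref strings (hypothetical values):
#   chunk_meta = {"self_ref": "#/texts/4 #/texts/5", "parent_ref": "#/body"}
#   pic_meta   = {"text_ref": "#/texts/5", "fig_ref": "#/pictures/2"}
#   find_matching_fig_ref(chunk_meta, pic_meta)  -> "#/pictures/2"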


def find_image_by_number(folder_path: str, img_number: int) -> str | None:
    """Search the folder for an image file with the specified number.

    Args:
        folder_path (str): artifacts path where all the images are stored.
        img_number (int): image id

    Returns:
        str|None: full image path, or None if no file matches.
    """
    pattern = re.compile(rf"image-0*{img_number}-[a-fA-F0-9]+\.png")  # Regex pattern
    for filename in os.listdir(folder_path):
        if pattern.match(filename):  # Check if the filename matches the pattern
            return os.path.join(folder_path, filename)  # Return full path
    return None  # Return None if no match found
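
# Illustrative example: for a folder containing "image-000002-a1b2c3.png"
# (hypothetical file name),
#   find_image_by_number(folder, 2) -> ".../image-000002-a1b2c3.png"
# since the "0*" in the pattern tolerates zero-padding; with no matching file
# the function returns None.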


def extract_images(conv_document: DoclingDocument) -> List[Document]:
    """Extract the images from the converted document and add the metadata.

    Args:
        conv_document (DoclingDocument): converted document

    Returns:
        List[Document]: pictures with the metadata.
    """
    pictures: List[Document] = []
    for picture in conv_document.pictures:
        figure_ref = picture.get_ref().cref
        text_ref = picture.parent.get_ref().cref
        document = Document(
            page_content="",
            metadata={
                "fig_ref": figure_ref,
                "text_ref": text_ref,
            },
        )
        pictures.append(document)
    return pictures
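
# Shape of one returned entry (cref values are illustrative):
#   Document(page_content="",
#            metadata={"fig_ref": "#/pictures/0", "text_ref": "#/body"})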


def extract_tables(conv_document: DoclingDocument,
                   file_name: str) -> List[Document]:
    """Extract the tables from the converted document and add metadata.

    Args:
        conv_document (DoclingDocument): converted document.
        file_name (str): file name.

    Returns:
        List[Document]: A list of documents containing table data with
            reference IDs in the metadata.
    """
    tables = []
    for table in conv_document.tables:
        if table.label in [DocItemLabel.TABLE]:
            self_refs = table.get_ref().cref
            parent_refs = table.parent.get_ref().cref if table.parent else ""
            text = table.export_to_markdown()
            document = Document(
                page_content=text,
                metadata={
                    "source": file_name,
                    "self_ref": self_refs,
                    "parent_ref": parent_refs,
                },
            )
            tables.append(document)
    return tables
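
# Each returned Document holds the table as markdown in page_content, with
# metadata like (illustrative values):
#   {"source": "report.md", "self_ref": "#/tables/0", "parent_ref": "#/body"}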


def extract_texts(conv_document: DoclingDocument,
                  pictures: List[Document],
                  images_artifacts: str,
                  embeddings_tokenizer: AutoTokenizer,
                  file_name: str
                  ) -> List[Document]:
    """Extract the text data from the converted document and add the image path
    to the metadata.

    Args:
        conv_document (DoclingDocument): converted document.
        pictures (List[Document]): extracted pictures list.
        images_artifacts (str): artifacts path used to resolve image paths.
        embeddings_tokenizer (AutoTokenizer): tokenizer used to chunk the texts.
        file_name (str): file name.

    Returns:
        List[Document]: chunks with updated metadata.
    """
    texts = []
    for chunk in HybridChunker(tokenizer=embeddings_tokenizer).chunk(conv_document):
        items = chunk.meta.doc_items
        self_refs = " ".join(map(lambda item: item.get_ref().cref, items))
        parent_refs = items[0].parent.get_ref().cref if len(items) > 0 else ""
        meta_data_dict = {
            "source": file_name,
            "self_ref": self_refs,
            "parent_ref": parent_refs,
        }
        # Attach the image path and figure number for any picture whose
        # text reference matches this chunk's references
        for picture in pictures:
            fig_metadata = picture.metadata
            fig_ref = find_matching_fig_ref(meta_data_dict, fig_metadata)
            if fig_ref:
                fig_number = int(fig_ref.split("/")[-1])
                image_path = find_image_by_number(images_artifacts, fig_number)
                meta_data_dict["fig_ref"] = image_path
                meta_data_dict["fig_number"] = fig_number
        text = chunk.text
        document = Document(
            page_content=text,
            metadata=meta_data_dict,
        )
        texts.append(document)
    return texts
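
# Minimal end-to-end sketch, assuming docling's DocumentConverter and a
# HuggingFace tokenizer; the model name and file paths are hypothetical:
#
#   from docling.document_converter import DocumentConverter
#
#   tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
#   conv_doc = DocumentConverter().convert("report.pdf").document
#   pictures = extract_images(conv_doc)
#   tables = extract_tables(conv_doc, "report.md")
#   chunks = extract_texts(conv_doc, pictures, "report-artifacts-1/",
#                          tokenizer, "report.md")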


def find_relevant_folder(folder_path: str) -> dict:
    """Create a dict mapping each markdown file (key) to its
    artifacts folder (value).

    Args:
        folder_path (str): folder path where all the converted documents are stored.

    Returns:
        dict: markdown file names mapped to their artifacts folders.
    """
    # Rename the files and folders, removing the spaces
    rename_items(folder_path)

    # Initialize the dataset dictionary
    dataset_dict = {}

    # Get all files and folders in the directory (do this only once)
    all_items = os.listdir(folder_path)

    # Split files and folders in one pass
    md_files = {file for file in all_items if file.endswith(".md")}
    folders = {folder for folder in all_items if not folder.endswith(".md")}

    # Create a dictionary of folder name splits for efficient matching
    folder_splits = {tuple(folder.split("-")[:-2]): folder for folder in folders}

    for file in md_files:
        file_split = tuple(file.split("-")[:-1])
        # Check if file_split matches any folder's split
        if file_split in folder_splits:
            dataset_dict[file] = folder_splits[file_split]
    return dataset_dict
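
# Illustrative example of the matching rule (names are hypothetical):
#   "paper-2024-v1.md"        -> key ("paper", "2024")  (last token dropped)
#   "paper-2024-artifacts-1"  -> key ("paper", "2024")  (last two tokens dropped)
# so dataset_dict maps "paper-2024-v1.md" -> "paper-2024-artifacts-1".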


def extract_ref_text_ids(meta_data: dict) -> List[int]:
    """Collect the numeric '/texts/' ids from a chunk's reference metadata.

    Args:
        meta_data (dict): chunk metadata holding 'self_ref', 'parent_ref',
            and optionally 'child_ref' reference strings.

    Returns:
        List[int]: unique text ids found in the reference fields.
    """
    all_refs = []
    # Go through all 3 ref fields
    for key in ["self_ref", "parent_ref", "child_ref"]:
        ref_str = meta_data.get(key)
        if ref_str:
            # Split on commas or whitespace, since refs elsewhere in this
            # module are space-joined (see extract_texts)
            refs = re.split(r"[,\s]+", ref_str.strip())
            all_refs.extend(refs)

    # Remove duplicates
    unique_refs = set(all_refs)

    # Extract /texts/ IDs as integers
    text_refs = [int(ref.split("/")[2]) for ref in unique_refs if "/texts/" in ref]
    return text_refs
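
# Illustrative example (set iteration order may vary):
#   meta = {"self_ref": "#/texts/4 #/texts/7", "parent_ref": "#/body"}
#   extract_ref_text_ids(meta) -> [4, 7]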