# NOTE(review): the three lines below look like accidentally pasted VCS/commit
# metadata (author, message, short hash). They were bare statements that made
# the module a syntax error; kept here commented out for the record.
# kap2403
# "added files"
# 5e433de
"""
contains all the functions to extract the tables, images and, text from the converted
documents.
"""
import os
import re
from typing import List
from docling.chunking import HybridChunker
from docling_core.types.doc.document import TableItem
from langchain_core.documents import Document
from docling_core.types.doc.labels import DocItemLabel
from docling_core.types.doc.document import TableItem
from transformers import AutoTokenizer
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
# Public API of this module. `extract_ref_text_ids` is defined below but was
# missing from the original list, so `from module import *` silently hid it.
__all__ = [
    "sanitize_name",
    "rename_items",
    "find_matching_fig_ref",
    "find_image_by_number",
    "extract_images",
    "extract_tables",
    "extract_texts",
    "find_relevant_folder",
    "extract_ref_text_ids",
]
def sanitize_name(name: str) -> str:
    """Normalize a file or folder name.

    Runs of '-', '_', en-dash '–' and plain spaces collapse to a single
    hyphen; any remaining whitespace (tabs, newlines) collapses to a single
    space and the result is stripped.

    Args:
        name (str): file or folder name.

    Returns:
        str: the normalized name.
    """
    # Collapse separator runs (hyphen, underscore, en-dash, space) to one '-'.
    hyphenated = re.sub(r'[-_– ]+', '-', name)
    # Plain spaces are already gone; this tidies tabs/newlines and trims ends.
    return re.sub(r'\s+', ' ', hyphenated).strip()
def rename_items(directory: str):
    """Rename every entry inside *directory* to its sanitized form.

    Entries whose name is already clean are left untouched; each actual
    rename is reported on stdout.

    Args:
        directory (str): path of the directory whose contents are renamed.
    """
    for entry in os.listdir(directory):
        cleaned = sanitize_name(entry)
        if cleaned == entry:
            # Name is already in canonical form — skip the rename syscall.
            continue
        source = os.path.join(directory, entry)
        target = os.path.join(directory, cleaned)
        os.rename(source, target)
        print(f"Renamed: {source} -> {target}")
def find_matching_fig_ref(doc1:dict, doc2:dict)-> str|None:
"""Check the texts ids from text chunks metadata and pictures metadata if any id
matches then returns the image id.
Args:
doc1 (dict): text chunks metadata
doc2 (dict): picture metadata
Returns:
str|None: if similar text id matched in both the metadata then returns the
figure reference which is figure number. if no match None
"""
# Extract and split self_ref and parent_ref into sets
doc1_self_refs = set(doc1['self_ref'].split()) # Split multiple self_refs
doc1_parent_refs = set(doc1['parent_ref'].split()) # Split multiple parent_refs
# Extract text_ref and fig_ref from doc2
doc2_text_ref = doc2['text_ref']
doc2_fig_ref = doc2['fig_ref']
# Check if text_ref exists in self_ref or parent_ref
if doc2_text_ref in doc1_self_refs or doc2_text_ref in doc1_parent_refs:
return doc2_fig_ref # Return fig_ref if there's a match
return None # No match found
def find_image_by_number(folder_path: str, img_number:int)-> str|None:
"""Search for an image with the specified number in the folder.
Args:
folder_path (str): artifacts path where all the images were stored.
img_number (int): image id
Returns:
str|None: image path
"""
pattern = re.compile(rf"image-0*{img_number}-[a-fA-F0-9]+\.png") # Regex pattern
for filename in os.listdir(folder_path):
if pattern.match(filename): # Check if the filename matches the pattern
return os.path.join(folder_path, filename) # Return full path
return None # Return None if no match found
def extract_images(conv_document: Document) -> list[Document]:
    """Wrap every picture of a converted document in a metadata-only Document.

    Args:
        conv_document (Document): converted document — NOTE(review): this is
            actually a docling converted document (it exposes ``.pictures``),
            not a langchain ``Document``; the annotation looks wrong, verify
            against callers.

    Returns:
        list[Document]: one empty-content Document per picture, carrying the
        picture's own ref ('fig_ref') and its parent text ref ('text_ref')
        in the metadata. (Original annotation said ``Document`` but the
        function has always returned a list.)
    """
    pictures: list[Document] = []
    for picture in conv_document.pictures:
        figure_ref = picture.get_ref().cref
        text_ref = picture.parent.get_ref().cref
        pictures.append(
            Document(
                page_content="",
                metadata={
                    "fig_ref": figure_ref,
                    "text_ref": text_ref,
                },
            )
        )
    return pictures
def extract_tables(document: Document,
                   file_name: str) -> list[Document]:
    """Extract the tables from the converted document and add metadata.

    Args:
        document (Document): converted document — NOTE(review): actually a
            docling converted document (exposes ``.tables``); verify the hint.
        file_name (str): file name recorded as the 'source' metadata.

    Returns:
        list[Document]: one Document per table with its markdown rendering as
        content and reference IDs in the metadata. (Original annotation said
        ``list[TableItem]`` but the function builds langchain Documents.)
    """
    tables: list[Document] = []
    for table in document.tables:
        # Keep only genuine tables; other table-like labels are skipped.
        if table.label == DocItemLabel.TABLE:
            self_refs = table.get_ref().cref
            parent_refs = table.parent.get_ref().cref if table.parent else ""
            text = table.export_to_markdown()
            # Distinct name: the original rebound `document`, shadowing the
            # parameter inside the loop.
            table_doc = Document(
                page_content=text,
                metadata={
                    "source": file_name,
                    "self_ref": self_refs,
                    "parent_ref": parent_refs,
                },
            )
            tables.append(table_doc)
    return tables
def extract_texts(conv_document: Document,
                  pictures: List[Document],
                  images_artifacts: str,
                  embeddings_tokenizer: AutoTokenizer,
                  file_name: str
                  ) -> List[Document]:
    """Chunk the converted document and attach image paths to the metadata.

    Each chunk is matched against the extracted pictures; when a picture
    belongs to the chunk, the resolved image path and figure number are added
    to the chunk's metadata. If several pictures match one chunk, the last
    match wins (original behavior, preserved).

    Args:
        conv_document (Document): converted document — NOTE(review): actually
            a docling converted document; verify the hint.
        pictures (List[Document]): extracted pictures list (see
            ``extract_images``).
        images_artifacts (str): artifacts path used to resolve image paths.
        embeddings_tokenizer (AutoTokenizer): tokenizer used by the chunker.
        file_name (str): file name recorded as the 'source' metadata.

    Returns:
        List[Document]: chunks with updated metadata.
    """
    texts: List[Document] = []
    chunker = HybridChunker(tokenizer=embeddings_tokenizer)
    for chunk in chunker.chunk(conv_document):
        items = chunk.meta.doc_items
        # Space-separated list of every ref the chunk spans.
        self_refs = " ".join(item.get_ref().cref for item in items)
        parent_refs = items[0].parent.get_ref().cref if items else ""
        meta_data_dict = {
            "source": file_name,
            "self_ref": self_refs,
            "parent_ref": parent_refs,
        }
        for picture in pictures:
            fig_ref = find_matching_fig_ref(meta_data_dict, picture.metadata)
            if fig_ref:
                # fig_ref looks like ".../<number>"; resolve it to a file path.
                fig_number = int(fig_ref.split("/")[-1])
                meta_data_dict["fig_ref"] = find_image_by_number(
                    images_artifacts, fig_number)
                meta_data_dict["fig_number"] = fig_number
        texts.append(Document(
            page_content=chunk.text,
            metadata=meta_data_dict,
        ))
    return texts
def find_relevant_folder(folder_path: str) -> dict:
    """Map each markdown file to its artifacts folder.

    Entries are first renamed to their sanitized form; a markdown file and a
    folder are paired when the file name minus its last '-' segment equals
    the folder name minus its last two '-' segments.

    Args:
        folder_path (str): folder path where all the converted documents are
            stored.

    Returns:
        dict: markdown file name -> artifacts folder name.
    """
    # Normalize names once so the '-'-based matching below is reliable.
    rename_items(folder_path)
    entries = os.listdir(folder_path)
    markdown_files = [e for e in entries if e.endswith(".md")]
    artifact_dirs = [e for e in entries if not e.endswith(".md")]
    # Key each folder by its name without the trailing two '-' segments.
    split_to_folder = {tuple(d.split("-")[:-2]): d for d in artifact_dirs}
    mapping = {}
    for md_file in markdown_files:
        key = tuple(md_file.split("-")[:-1])
        if key in split_to_folder:
            mapping[md_file] = split_to_folder[key]
    return mapping
def extract_ref_text_ids(meta_data: dict) -> list[int]:
    """Collect the numeric IDs of every '/texts/' reference in chunk metadata.

    Args:
        meta_data (dict): chunk metadata that may contain 'self_ref',
            'parent_ref' and 'child_ref' entries, each a string holding one
            or more refs like '#/texts/5'.

    Returns:
        list[int]: de-duplicated text IDs (iteration order of the underlying
        set — not guaranteed).
    """
    all_refs = []
    for key in ["self_ref", "parent_ref", "child_ref"]:
        ref_str = meta_data.get(key)
        if ref_str:
            # extract_texts joins multiple refs with spaces, while the
            # original code here split on commas only; accept either
            # separator (backward compatible with comma-joined input).
            all_refs.extend(re.split(r"[,\s]+", ref_str.strip()))
    # Remove duplicates
    unique_refs = set(all_refs)
    # Keep only '/texts/' refs and extract their numeric ID.
    text_refs = [int(ref.split("/")[2]) for ref in unique_refs if "/texts/" in ref]
    return text_refs