Spaces:
Running
Running
File size: 3,150 Bytes
5e433de |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
"""
In the current version images were not considered in the vector database due to some computing power. so there are the utils to extract references from the retrieved documents and search for pictures. if there is any ref match with picture ref the the images are extracted.
"""
import os
import re
import json
from langchain_core.documents import Document
from docling.document_converter import DocumentConverter
def extract_metadata(documents: list[Document])-> list[str]:
references = []
for doc in documents:
meta_data = doc.metadata
self_ref = meta_data["self_ref"]
parent_ref = meta_data["parent_ref"]
if self_ref:
references.append(self_ref)
if parent_ref:
references.append(parent_ref)
unique_ref = list(set(references))
return unique_ref
def images_data(docling_document: DocumentConverter)-> dict:
images_data = {}
for image in docling_document.pictures:
self_ref = image.self_ref
parent_ref = image.parent.cref
images_data[self_ref] = parent_ref
return images_data
def find_image_by_number(folder_path: str, img_number:int)-> str|None:
"""Search for an image with the specified number in the folder.
Args:
folder_path (str): artifacts path where all the images were stored.
img_number (int): image id
Returns:
str|None: image path
"""
pattern = re.compile(rf"image-0*{img_number}-[a-fA-F0-9]+\.png") # Regex pattern
for filename in os.listdir(folder_path):
if pattern.match(filename): # Check if the filename matches the pattern
return os.path.join(folder_path, filename) # Return full path
return None # Return None if no match found
def extract_matching_pictures(ref_list: list, images_dict:dict) -> list[int]:
def extract_image_numbers(picture_refs):
image_numbers = [int(ref.split('/')[-1]) for ref in picture_refs]
return image_numbers
all_refs = set()
for ref_string in ref_list:
refs = ref_string.split(',')
all_refs.update(refs)
# Find matching picture keys where the image's value (text ref) is in all_refs
matching_pictures = [pic for pic, text_ref in images_dict.items() if text_ref in all_refs]
image_numbers = extract_image_numbers(matching_pictures)
return image_numbers
def extract_ref_paths(images_num_list: list[int])-> list[str]:
folder_path = "/home/kap2403/Desktop/Medico-AI-Bot/converted/ROBBINS-&-COTRAN-PATHOLOGIC-BASIS-OF-DISEASE-10TH-ED-with-image-refs-artifacts"
paths = []
for img_num in images_num_list:
path = find_image_by_number(folder_path = folder_path,
img_number= img_num)
paths.append(path)
return paths
def images_ref_pipeline(retriever):
with open(r"/home/kap2403/Desktop/Medico-AI-Bot/dataset/pictures.json", "r") as file:
images_data = json.load(file)
meta_data = extract_metadata(retriever)
image_numbers = extract_matching_pictures(meta_data, images_data)
paths_list = extract_ref_paths(image_numbers)
return paths_list
|