"""
create chunks and create clusters usign raptor architecture.
"""
from typing import Iterator

from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types.doc import TableItem
from docling_core.types.doc.labels import DocItemLabel
from langchain_core.documents import Document
from transformers import AutoTokenizer


def adding_metadata_chunks(chunks: Iterator, file_name: str, speciality: str) -> list[Document]:
    """Add provenance metadata to chunks.

    Wraps each chunk produced by a HybridChunker in a langchain Document
    carrying reference metadata.

    Args:
        chunks (Iterator): The chunks produced by HybridChunker.chunk().
        file_name (str): The name of the file from which the chunks were created.
        speciality (str): Specialization (category) of the book.

    Returns:
        list[Document]: A list of Document objects with added metadata.
    """
    documents = []
    for idx, chunk in enumerate(chunks):
        items = chunk.meta.doc_items
        if len(items) == 1 and isinstance(items[0], TableItem):
            # Skip table-only chunks; tables are extracted separately with
            # richer metadata (see DocumentIndexing.extract_tables).
            continue
        main_ref = " ".join([item.get_ref().cref for item in items])
        parent_ref = " ".join([item.parent.get_ref().cref for item in items])
        child_ref = " ".join([str(child) for item in items for child in item.children])
        text = chunk.text  # The text of the chunk
        metadata = {
            "source": file_name,
            "medical_specialty": speciality,
            "chunk_index": idx,
            "self_ref": main_ref,
            "parent_ref": parent_ref,
            "child_ref": child_ref,
            "chunk_type": "text",
        }
        document = Document(page_content=text, metadata=metadata)
        documents.append(document)
    return documents
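
# Usage sketch (illustrative; nothing here runs on import). The model name,
# file name, and speciality are placeholder values, not repository settings:
#
#     tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
#     chunks = HybridChunker(tokenizer=tokenizer).chunk(result.document)
#     docs = adding_metadata_chunks(chunks=chunks, file_name="book.pdf",
#                                   speciality="cardiology")

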
class DocumentIndexing:
    """Index a docling-converted document into langchain Documents."""

    def __init__(self,
                 docling_converted_document: ConversionResult,
                 embeddings_model: str,
                 speciality: str,
                 file_name: str):
        # The converted DoclingDocument lives on the conversion result.
        self.converted_document = docling_converted_document.document
        # Tokenizer used by the hybrid chunker to respect the embedding
        # model's sequence-length budget.
        self.embeddings_tokenizer = AutoTokenizer.from_pretrained(embeddings_model)
        self.speciality = speciality
        self.file_name = file_name

    def create_chunks(self) -> list[Document]:
        """Chunk the converted document and attach provenance metadata."""
        chunks = HybridChunker(tokenizer=self.embeddings_tokenizer).chunk(self.converted_document)
        updated_chunks = adding_metadata_chunks(chunks=chunks,
                                                file_name=self.file_name,
                                                speciality=self.speciality)
        return updated_chunks
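    # Each returned Document pairs the chunk text with provenance metadata,
    # e.g. (illustrative values):
    #   Document(page_content="...",
    #            metadata={"source": "book.pdf", "chunk_index": 0,
    #                      "self_ref": "#/texts/12", "chunk_type": "text", ...})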

    def extract_all_text(self) -> list[Document]:
        """Extract every text item from the docling document and convert it to
        a langchain Document. Useful for building a vector store from the raw
        text.

        Returns:
            list[Document]: A list of langchain documents.
        """
        documents_list = []
        for text in self.converted_document.texts:
            content = text.text
            main_ref = text.get_ref().cref
            parent_ref = text.parent.get_ref().cref
            child_ref = ",".join([ref.get_ref().cref for ref in text.children])
            document = Document(page_content=content, metadata={
                "source": self.file_name,
                "chunk_index": None,
                "self_ref": main_ref,
                "parent_ref": parent_ref,
                "child_ref": child_ref,
                "chunk_type": "text",
                "medical_specialty": self.speciality,
                "reference": None,
            })
            documents_list.append(document)
        return documents_list
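    # Note: unlike create_chunks, this walks the raw text items one by one, so
    # each Document maps to a single docling ref rather than a merged,
    # tokenizer-sized chunk.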

    def extract_tables(self) -> list[Document]:
        """Extract the tables from the converted document and add metadata.

        Returns:
            list[Document]: A list of documents containing table data (as
                markdown) with reference IDs in the metadata.
        """
        tables: list[Document] = []
        for table in self.converted_document.tables:
            if table.label == DocItemLabel.TABLE:
                main_ref = table.get_ref().cref
                parent_ref = table.parent.get_ref().cref
                child_ref = ",".join([ref.get_ref().cref for ref in table.children])
                # Serialize the table to markdown so it can be embedded as text.
                text = table.export_to_markdown()
                metadata = {
                    "source": self.file_name,
                    "chunk_index": None,
                    "self_ref": main_ref,
                    "parent_ref": parent_ref,
                    "child_ref": child_ref,
                    "chunk_type": "table",
                    "medical_specialty": self.speciality,
                }
                document = Document(page_content=text, metadata=metadata)
                tables.append(document)
        return tables
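    # The self_ref metadata holds a JSON-pointer-style cref (e.g. "#/tables/3"),
    # so a retrieved table Document can be traced back to its TableItem in the
    # original DoclingDocument when richer rendering is needed.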

    def extract_images(self) -> list[Document]:
        """Extract the pictures from the converted document and add metadata.

        Returns:
            list[Document]: A list of documents referencing picture items,
                with reference IDs in the metadata.
        """
        images: list[Document] = []
        for picture in self.converted_document.pictures:
            if picture.label == DocItemLabel.PICTURE:
                main_ref = picture.get_ref().cref
                parent_ref = picture.parent.get_ref().cref
                child_ref = ",".join([ref.get_ref().cref for ref in picture.children])
                metadata = {
                    "source": self.file_name,
                    "chunk_index": None,
                    "self_ref": main_ref,
                    "parent_ref": parent_ref,
                    "child_ref": child_ref,
                    "chunk_type": "image",
                    "medical_specialty": self.speciality,
                }
                # Pictures carry no extractable text here, so the cref itself
                # stands in as the page content.
                document = Document(page_content=main_ref, metadata=metadata)
                images.append(document)
        return images
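

if __name__ == "__main__":
    # Minimal end-to-end sketch. The PDF path, embedding model, and speciality
    # below are illustrative placeholders, not values shipped with this module.
    source = "example_book.pdf"  # hypothetical input file
    conversion_result = DocumentConverter().convert(source)

    indexer = DocumentIndexing(
        docling_converted_document=conversion_result,
        embeddings_model="sentence-transformers/all-MiniLM-L6-v2",  # assumed model
        speciality="cardiology",  # illustrative book category
        file_name=source,
    )

    chunk_docs = indexer.create_chunks()
    table_docs = indexer.extract_tables()
    image_docs = indexer.extract_images()
    print(f"{len(chunk_docs)} chunks, {len(table_docs)} tables, {len(image_docs)} images")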