""" | |
create chunks and create clusters usign raptor architecture. | |
""" | |
import itertools
from collections.abc import Iterable

from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types.doc import TableItem
from docling_core.types.doc.labels import DocItemLabel
from langchain_core.documents import Document
from transformers import AutoTokenizer
def adding_metadata_chunks(chunks: Iterable, file_name: str, speciality: str) -> list[Document]:
    """Add metadata to each chunk.

    Args:
        chunks (Iterable): The chunks produced by HybridChunker.chunk().
        file_name (str): The name of the file from which the chunks were created.
        speciality (str): Specialization of the book.

    Returns:
        list[Document]: A list of Document objects with added metadata.
    """
    documents = []
    for idx, chunk in enumerate(chunks):
        items = chunk.meta.doc_items
        if len(items) == 1 and isinstance(items[0], TableItem):
            # Skip chunks that consist of a single table; tables are
            # extracted separately with their own metadata.
            continue
        main_ref = " ".join(item.get_ref().cref for item in items)
        parent_ref = " ".join(item.parent.cref for item in items if item.parent is not None)
        child_ref = " ".join(child.cref for child in itertools.chain.from_iterable(item.children for item in items))
        metadata = {
            "source": file_name,
            "medical_specialty": speciality,
            "chunk_index": idx,
            "self_ref": main_ref,
            "parent_ref": parent_ref,
            "child_ref": child_ref,
            "chunk_type": "text",
        }
        documents.append(Document(page_content=chunk.text, metadata=metadata))
    return documents
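
# Illustrative shape of one chunk's metadata as built above
# (all values below are placeholders, not from a real run):
# {
#     "source": "example_book.pdf",
#     "medical_specialty": "cardiology",
#     "chunk_index": 0,
#     "self_ref": "#/texts/12 #/texts/13",
#     "parent_ref": "#/body #/body",
#     "child_ref": "",
#     "chunk_type": "text",
# }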
class DocumentIndexing:
    """Index a docling-converted document into LangChain documents."""

    def __init__(self,
                 docling_converted_document: ConversionResult,
                 embeddings_model: str,
                 speciality: str,
                 file_name: str):
        # The DoclingDocument produced by DocumentConverter.convert()
        self.converted_document = docling_converted_document.document
        # Tokenizer used by the hybrid chunker to respect the embedding
        # model's token limits
        self.embeddings_tokenizer = AutoTokenizer.from_pretrained(embeddings_model)
        self.speciality = speciality
        self.file_name = file_name
    def create_chunks(self) -> list[Document]:
        """Chunk the converted document and attach metadata to every chunk."""
        chunks = HybridChunker(tokenizer=self.embeddings_tokenizer).chunk(self.converted_document)
        return adding_metadata_chunks(chunks=chunks,
                                      file_name=self.file_name,
                                      speciality=self.speciality)
    def extract_all_text(self) -> list[Document]:
        """Extract all text items from the docling document and convert them
        to LangChain documents. This is useful for creating a vector store
        from the text.

        Returns:
            list[Document]: A list of LangChain documents.
        """
        documents_list = []
        for text in self.converted_document.texts:
            main_ref = text.get_ref().cref
            parent_ref = text.parent.cref if text.parent is not None else ""
            child_ref = ",".join(ref.cref for ref in text.children)
            document = Document(page_content=text.text, metadata={
                "source": self.file_name,
                "chunk_index": None,
                "self_ref": main_ref,
                "parent_ref": parent_ref,
                "child_ref": child_ref,
                "chunk_type": "text",
                "medical_specialty": self.speciality,
                "reference": None,
            })
            documents_list.append(document)
        return documents_list
    def extract_tables(self) -> list[Document]:
        """Extract the tables from the converted document and add metadata.

        Returns:
            list[Document]: A list of documents containing table data
            (exported as Markdown) with reference IDs in the metadata.
        """
        tables: list[Document] = []
        for table in self.converted_document.tables:
            if table.label == DocItemLabel.TABLE:
                main_ref = table.get_ref().cref
                parent_ref = table.parent.cref if table.parent is not None else ""
                child_ref = ",".join(ref.cref for ref in table.children)
                # Pass the document explicitly; the argument-less form of
                # export_to_markdown() is deprecated in docling-core.
                text = table.export_to_markdown(self.converted_document)
                metadata = {
                    "source": self.file_name,
                    "chunk_index": None,
                    "self_ref": main_ref,
                    "parent_ref": parent_ref,
                    "child_ref": child_ref,
                    "chunk_type": "table",
                    "medical_specialty": self.speciality,
                }
                tables.append(Document(page_content=text, metadata=metadata))
        return tables
    def extract_images(self) -> list[Document]:
        """Extract the pictures from the converted document and add metadata.

        The page content is the picture's self reference, so the image can
        be resolved from the DoclingDocument later.

        Returns:
            list[Document]: A list of documents referencing pictures, with
            reference IDs in the metadata.
        """
        images: list[Document] = []
        for picture in self.converted_document.pictures:
            if picture.label == DocItemLabel.PICTURE:
                main_ref = picture.get_ref().cref
                parent_ref = picture.parent.cref if picture.parent is not None else ""
                child_ref = ",".join(ref.cref for ref in picture.children)
                metadata = {
                    "source": self.file_name,
                    "chunk_index": None,
                    "self_ref": main_ref,
                    "parent_ref": parent_ref,
                    "child_ref": child_ref,
                    "chunk_type": "image",
                    "medical_specialty": self.speciality,
                }
                images.append(Document(page_content=main_ref, metadata=metadata))
        return images
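

# A minimal end-to-end usage sketch. The file path, embedding model name,
# and speciality below are illustrative placeholders, not part of the
# original module.
if __name__ == "__main__":
    converter = DocumentConverter()
    result = converter.convert("example_book.pdf")  # hypothetical input file
    indexer = DocumentIndexing(
        docling_converted_document=result,
        embeddings_model="sentence-transformers/all-MiniLM-L6-v2",  # assumed model
        speciality="cardiology",  # assumed book category
        file_name="example_book.pdf",
    )
    text_chunks = indexer.create_chunks()
    tables = indexer.extract_tables()
    images = indexer.extract_images()
    print(f"chunks={len(text_chunks)} tables={len(tables)} images={len(images)}")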