# Source: medico-bot/src/data_preprocessing/docling/document_conversion.py
# (GitHub page header residue converted to comments — author: kap2403,
#  commit: "added files", 5e433de)
"""
Script to convert all the pdf documents to markdown format in azure.
"""
import logging
import time
from pathlib import Path
import os
import yaml
from azureml.fsspec import AzureMachineLearningFileSystem
import shutil
from concurrent.futures import ThreadPoolExecutor
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode
from huggingface_hub import snapshot_download
from docling.datamodel.settings import settings
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TableFormerMode,
)
from indexing import document_indexing
from docling_utils import save_json
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
class Docling_Coversion:
    """Convert PDF documents with Docling and persist/upload the results.

    Wraps a configured ``DocumentConverter`` (CUDA-accelerated, full-page
    Tesseract OCR, accurate TableFormer mode) and provides helpers to
    convert a single PDF and to extract/save its tables, images, text and
    chunks before uploading them to an Azure ML datastore.
    """

    def __init__(self, image_scale=1.0):
        """Build the Docling PDF converter.

        Args:
            image_scale: Scale factor applied when rendering page and
                picture images (1.0 == 72 dpi baseline).
        """
        logging.info("Initializing Docling_Coversion with image_scale=%s", image_scale)
        accelerator_options = AcceleratorOptions(
            num_threads=8, device=AcceleratorDevice.CUDA
        )
        # Turn on inline debug visualizations:
        settings.debug.visualize_layout = True
        settings.debug.visualize_ocr = True
        settings.debug.visualize_tables = True
        settings.debug.visualize_cells = True
        pipeline_options = PdfPipelineOptions(
            do_ocr=True,
            do_table_structure=True,
            images_scale=image_scale,
            generate_page_images=True,
            generate_picture_images=True,
            accelerator_options=accelerator_options,
            ocr_options=TesseractCliOcrOptions(force_full_page_ocr=True),
        )
        pipeline_options.table_structure_options.do_cell_matching = True
        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
        self.converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                    backend=DoclingParseV4DocumentBackend,
                )
            }
        )
        logging.info("Docling_Coversion initialized successfully.")

    def document_conversion(self, file_path):
        """Convert a file and return the Docling document object."""
        logging.info("Starting document conversion for file: %s", file_path)
        return self.converter.convert(Path(file_path)).document

    def save_document(self, file_path, output_dir, azure_fs):
        """Convert a PDF, extract its content, and upload everything to Azure.

        Extracts tables/images/text/chunks as JSON, saves a markdown
        rendering with referenced images, uploads ``output_dir`` to
        ``converted_docs_json/<doc_name>`` on ``azure_fs``, then removes
        the local output directory. Errors are logged (with traceback)
        rather than raised so a batch run continues with the next file.

        Args:
            file_path: Path to the local PDF file.
            output_dir: Local directory (str or Path) receiving the
                intermediate outputs; deleted after a successful upload.
            azure_fs: ``AzureMachineLearningFileSystem`` used for upload.
        """
        input_path = Path(file_path)
        # Accept str or Path — the cleanup below needs Path methods.
        output_dir = Path(output_dir)
        logging.info("Processing file: %s", file_path)
        try:
            result = self.converter.convert(input_path)
            doc_name = input_path.stem
            temp_md_file_path = output_dir / f"{doc_name}-with-images.md"
            # The parent directory name encodes the medical speciality.
            docling_document_class = document_indexing(
                result,
                "ibm-granite/granite-embedding-125m-english",
                speciality=input_path.parent.name,
                file_name=input_path.stem,
            )
            tables_doc = docling_document_class.extract_tables()
            images_doc = docling_document_class.extract_images()
            text_doc = docling_document_class.extract_all_text()
            chunks_doc = docling_document_class.create_chunks()
            # Save the extracted data as JSON
            save_json(file_path=output_dir, category="tables", data=tables_doc)
            save_json(file_path=output_dir, category="images", data=images_doc)
            save_json(file_path=output_dir, category="text", data=text_doc)
            save_json(file_path=output_dir, category="chunks", data=chunks_doc)
            logging.info("Saved extracted data as JSON files.")
            # Save locally first
            result.document.save_as_markdown(temp_md_file_path, image_mode=ImageRefMode.REFERENCED)
            logging.info("Saved locally: %s", temp_md_file_path)
            # Upload to Azure
            azure_output_path = f"converted_docs_json/{doc_name}"
            azure_fs.upload(lpath=str(output_dir), rpath=azure_output_path, recursive=True)
            logging.info("Uploaded to Azure: %s", azure_output_path)
            # Delete the local outputs only after a successful upload.
            if output_dir.exists() and output_dir.is_dir():
                shutil.rmtree(output_dir)
                logging.info("Deleted local directory: %s", output_dir)
        except Exception:
            # logging.exception keeps the traceback, unlike logging.error(e).
            logging.exception("Error processing file %s", file_path)
def main(source_dir: str):
    """Convert every PDF under *source_dir* (an Azure ML datastore URI).

    For each remote PDF not already present under ``converted_docs_json/``:
    download it to ``./local_pdfs``, convert/extract it into ``./temp``,
    upload the results, and delete the local copies.

    Args:
        source_dir: ``azureml://`` URI of the datastore folder to scan.
    """
    logging.info("Starting main function with source_dir: %s", source_dir)
    # Set the local directory to save PDFs
    local_pdf_dir = Path("./local_pdfs")
    local_pdf_dir.mkdir(parents=True, exist_ok=True)
    logging.info("Local PDF directory created: %s", local_pdf_dir)
    fs = AzureMachineLearningFileSystem(source_dir)
    all_pdf_files = fs.glob('**/*.pdf')
    logging.info("Found %d PDF files in source directory.", len(all_pdf_files))
    converter = Docling_Coversion(image_scale=2)
    for file_path in all_pdf_files:
        output_dir = Path("./temp")
        # save_document only removes ./temp on success; wipe it here so a
        # previous failure's stale artifacts are not uploaded with this doc.
        if output_dir.exists():
            shutil.rmtree(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        logging.info("Temporary output directory created: %s", output_dir)
        file_path_ = Path(file_path)
        file_name = file_path_.name
        local_pdf_path = local_pdf_dir / file_name
        azure_output_path = f"converted_docs_json/{file_path_.stem}"
        # Check if the file already exists in Azure
        if fs.exists(azure_output_path):
            logging.info("Skipping %s, already processed.", file_name)
            continue
        # Stream the PDF to disk in chunks instead of reading it all
        # into memory at once.
        logging.info("Downloading file: %s", file_name)
        with fs.open(file_path, "rb") as remote_file:
            with open(local_pdf_path, "wb") as local_file:
                shutil.copyfileobj(remote_file, local_file)
        logging.info("File saved locally: %s", local_pdf_path)
        # Process the local PDF file
        logging.info("Processing: %s", file_name)
        converter.save_document(local_pdf_path, output_dir, fs)
        # Delete the local PDF after processing; tolerate it already
        # being gone rather than crashing the whole batch.
        local_pdf_path.unlink(missing_ok=True)
        logging.info("Deleted local PDF: %s", local_pdf_path)
    logging.info("Processing completed for all files.")
if __name__ == "__main__":
    logging.info("Script started.")
    # Datastore folder containing the PDFs to convert.
    _PDF_SOURCE_DIR = (
        'azureml://subscriptions/485363cd-687d-4adb-a30b-35108c11d682/resourcegroups/medbot/workspaces/karthik/datastores/workspaceartifactstore/paths/UI/2025-04-11_075006_UTC/PdfFiles/'
    )
    main(source_dir=_PDF_SOURCE_DIR)
    logging.info("Script finished.")