""" Script to convert all the pdf documents to markdown format in azure. """ import logging import time from pathlib import Path import os import yaml from azureml.fsspec import AzureMachineLearningFileSystem import shutil from concurrent.futures import ThreadPoolExecutor from docling_core.types.doc import ImageRefMode, PictureItem, TableItem from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.settings import settings from docling.document_converter import DocumentConverter, PdfFormatOption from docling_core.types.doc import ImageRefMode from huggingface_hub import snapshot_download from docling.datamodel.settings import settings from docling.datamodel.pipeline_options import ( AcceleratorDevice, AcceleratorOptions, PdfPipelineOptions, TesseractCliOcrOptions, TableFormerMode, ) from indexing import document_indexing from docling_utils import save_json logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") class Docling_Coversion: def __init__(self, image_scale=1.0): logging.info("Initializing Docling_Coversion with image_scale=%s", image_scale) accelerator_options = AcceleratorOptions( num_threads=8, device=AcceleratorDevice.CUDA ) # Turn on inline debug visualizations: settings.debug.visualize_layout = True settings.debug.visualize_ocr = True settings.debug.visualize_tables = True settings.debug.visualize_cells = True pipeline_options = PdfPipelineOptions( do_ocr=True, do_table_structure=True, images_scale=image_scale, generate_page_images=True, generate_picture_images=True, accelerator_options=accelerator_options, ocr_options=TesseractCliOcrOptions(force_full_page_ocr=True) ) pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE self.converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( pipeline_options=pipeline_options, backend=DoclingParseV4DocumentBackend, ) } ) logging.info("Docling_Coversion initialized successfully.") def document_conversion(self, file_path): """Convert a file and return the document object.""" logging.info("Starting document conversion for file: %s", file_path) return self.converter.convert(Path(file_path)).document def save_document(self, file_path, output_dir, azure_fs): """Convert a file, save the output as markdown with embedded images, and upload to Azure.""" input_path = Path(file_path) logging.info("Processing file: %s", file_path) try: result = self.converter.convert(input_path) doc_name = input_path.stem temp_md_file_path = Path(output_dir) / f"{doc_name}-with-images.md" docling_document_class = document_indexing(result, "ibm-granite/granite-embedding-125m-english", speciality= input_path.parent.name, file_name=input_path.stem ) tables_doc = docling_document_class.extract_tables() images_doc = docling_document_class.extract_images() text_doc = docling_document_class.extract_all_text() chunks_doc = docling_document_class.create_chunks() # Save the extracted data as JSON save_json(file_path=output_dir, category="tables", data=tables_doc) save_json(file_path=output_dir, category="images", data=images_doc) save_json(file_path=output_dir, category="text", data=text_doc) save_json(file_path=output_dir, category="chunks", data=chunks_doc) logging.info("Saved extracted data as JSON files.") # Save locally first result.document.save_as_markdown(temp_md_file_path, image_mode=ImageRefMode.REFERENCED) logging.info("Saved locally: %s", temp_md_file_path) # Upload to Azure azure_output_path = f"converted_docs_json/{doc_name}" azure_fs.upload(lpath=str(output_dir), rpath=azure_output_path, recursive=True) logging.info("Uploaded to Azure: %s", azure_output_path) # Optionally, delete the local file after upload if output_dir.exists() and output_dir.is_dir(): shutil.rmtree(output_dir) logging.info("Deleted local directory: %s", output_dir) except Exception as e: logging.error("Error processing file %s: %s", file_path, e) def main(source_dir: str): logging.info("Starting main function with source_dir: %s", source_dir) # Set the temporary output directory # Set the local directory to save PDFs local_pdf_dir = Path("./local_pdfs") local_pdf_dir.mkdir(parents=True, exist_ok=True) # Create the directory if it doesn't exist logging.info("Local PDF directory created: %s", local_pdf_dir) fs = AzureMachineLearningFileSystem(source_dir) all_pdf_files = fs.glob('**/*.pdf') logging.info("Found %d PDF files in source directory.", len(all_pdf_files)) converter = Docling_Coversion(image_scale=2) for file_path in all_pdf_files: # file_path = Path(file_path) output_dir = Path("./temp") output_dir.mkdir(parents=True, exist_ok=True) # Create the directory if it doesn't exist logging.info("Temporary output directory created: %s", output_dir) file_path_ = Path(file_path) file_name = file_path_.name local_pdf_path = local_pdf_dir / file_name azure_output_path = f"converted_docs_json/{file_path_.stem}" # Check if the file already exists in Azure if fs.exists(azure_output_path): logging.info("Skipping %s, already processed.", file_name) continue # Save the PDF locally logging.info("Downloading file: %s", file_name) with fs.open(file_path, "rb") as remote_file: with open(local_pdf_path, "wb") as local_file: local_file.write(remote_file.read()) logging.info("File saved locally: %s", local_pdf_path) # Process the local PDF file logging.info("Processing: %s", file_name) converter.save_document(local_pdf_path, output_dir, fs) # Optionally, delete the local PDF after processing local_pdf_path.unlink() logging.info("Deleted local PDF: %s", local_pdf_path) logging.info("Processing completed for all files.") if __name__ == "__main__": logging.info("Script started.") main(source_dir=( 'azureml://subscriptions/485363cd-687d-4adb-a30b-35108c11d682/resourcegroups/medbot/workspaces/karthik/datastores/workspaceartifactstore/paths/UI/2025-04-11_075006_UTC/PdfFiles/' )) logging.info("Script finished.")