"""
Convert PDF documents stored in an Azure ML datastore to Markdown with Docling,
extract tables, images, text, and chunks as JSON, and upload the results back to Azure.
"""

import logging
import shutil
from pathlib import Path

from azureml.fsspec import AzureMachineLearningFileSystem
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
    TableFormerMode,
    TesseractCliOcrOptions,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode


from indexing import document_indexing
from docling_utils import save_json


logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

class DoclingConversion:
    def __init__(self, image_scale=1.0):
        logging.info("Initializing DoclingConversion with image_scale=%s", image_scale)
        accelerator_options = AcceleratorOptions(
            num_threads=8, device=AcceleratorDevice.CUDA
        )

        # Turn on inline debug visualizations:
        settings.debug.visualize_layout = True
        settings.debug.visualize_ocr = True
        settings.debug.visualize_tables = True
        settings.debug.visualize_cells = True


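        # Pipeline: force full-page OCR via the Tesseract CLI (requires the
        # `tesseract` binary on PATH) and keep rendered page/picture images so
        # the markdown export can reference them.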
        pipeline_options = PdfPipelineOptions(
            do_ocr=True,
            do_table_structure=True,
            images_scale=image_scale,
            generate_page_images=True,
            generate_picture_images=True,
            accelerator_options=accelerator_options,
            ocr_options=TesseractCliOcrOptions(force_full_page_ocr=True)
        )

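        # ACCURATE TableFormer mode with cell matching trades speed for better
        # table structure recognition.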
        pipeline_options.table_structure_options.do_cell_matching = True
        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

        self.converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                    backend=DoclingParseV4DocumentBackend,
                )
            }
        )
        logging.info("Docling_Coversion initialized successfully.")

    def document_conversion(self, file_path):
        """Convert a file and return the document object."""
        logging.info("Starting document conversion for file: %s", file_path)
        return self.converter.convert(Path(file_path)).document

    def save_document(self, file_path, output_dir, azure_fs):
        """Convert a file, save the output as markdown with embedded images, 
           and upload to Azure."""
        input_path = Path(file_path)
        logging.info("Processing file: %s", file_path)

        try:
            result = self.converter.convert(input_path)
            doc_name = input_path.stem
            temp_md_file_path = Path(output_dir) / f"{doc_name}-with-images.md"

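            # Extract tables, images, text, and chunks through the local
            # document_indexing helper (the Granite embedding model name is
            # forwarded to it, presumably for chunking/embedding).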
            docling_document_class = document_indexing(
                result,
                "ibm-granite/granite-embedding-125m-english",
                speciality=input_path.parent.name,
                file_name=input_path.stem,
            )
            tables_doc = docling_document_class.extract_tables()
            images_doc = docling_document_class.extract_images()
            text_doc = docling_document_class.extract_all_text()
            chunks_doc = docling_document_class.create_chunks()

            # Save the extracted data as JSON
            save_json(file_path=output_dir, category="tables", data=tables_doc)
            save_json(file_path=output_dir, category="images", data=images_doc)
            save_json(file_path=output_dir, category="text", data=text_doc)
            save_json(file_path=output_dir, category="chunks", data=chunks_doc)
            logging.info("Saved extracted data as JSON files.")
            
            
            # Save locally first
            result.document.save_as_markdown(temp_md_file_path, image_mode=ImageRefMode.REFERENCED)
            logging.info("Saved locally: %s", temp_md_file_path)

            # Upload to Azure
            azure_output_path = f"converted_docs_json/{doc_name}"
            azure_fs.upload(lpath=str(output_dir), rpath=azure_output_path, recursive=True)
            logging.info("Uploaded to Azure: %s", azure_output_path)

            # Clean up the local output directory after a successful upload
            if output_dir.exists() and output_dir.is_dir():
                shutil.rmtree(output_dir)
                logging.info("Deleted local directory: %s", output_dir)

        except Exception as e:
            logging.error("Error processing file %s: %s", file_path, e)




def main(source_dir: str):
    logging.info("Starting main function with source_dir: %s", source_dir)

    # Set the local directory to save downloaded PDFs
    local_pdf_dir = Path("./local_pdfs")
    local_pdf_dir.mkdir(parents=True, exist_ok=True)  # Create the directory if it doesn't exist
    logging.info("Local PDF directory created: %s", local_pdf_dir)

    fs = AzureMachineLearningFileSystem(source_dir)
    all_pdf_files = fs.glob("**/*.pdf")
    logging.info("Found %d PDF files in source directory.", len(all_pdf_files))

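    # image_scale=2 renders page and picture images at twice the default
    # resolution, giving sharper figures at the cost of time and memory.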
    converter = DoclingConversion(image_scale=2)

    for file_path in all_pdf_files:
        output_dir = Path("./temp")
        output_dir.mkdir(parents=True, exist_ok=True)  # Create the directory if it doesn't exist
        logging.info("Temporary output directory created: %s", output_dir)

        file_path_ = Path(file_path)
        file_name = file_path_.name
        local_pdf_path = local_pdf_dir / file_name
        azure_output_path = f"converted_docs_json/{file_path_.stem}"

        # Check if the file already exists in Azure
        if fs.exists(azure_output_path):
            logging.info("Skipping %s, already processed.", file_name)
            continue

        # Save the PDF locally
        logging.info("Downloading file: %s", file_name)
        with fs.open(file_path, "rb") as remote_file:
            with open(local_pdf_path, "wb") as local_file:
                local_file.write(remote_file.read())
        logging.info("File saved locally: %s", local_pdf_path)

        # Process the local PDF file
        logging.info("Processing: %s", file_name)
        converter.save_document(local_pdf_path, output_dir, fs)

        # Delete the local PDF copy after processing
        local_pdf_path.unlink()
        logging.info("Deleted local PDF: %s", local_pdf_path)

    logging.info("Processing completed for all files.")


if __name__ == "__main__":
    logging.info("Script started.")
    main(source_dir=(
        'azureml://subscriptions/485363cd-687d-4adb-a30b-35108c11d682/resourcegroups/medbot/workspaces/karthik/datastores/workspaceartifactstore/paths/UI/2025-04-11_075006_UTC/PdfFiles/'
    ))
    logging.info("Script finished.")