Spaces:
Running
Running
File size: 7,481 Bytes
5e433de |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
"""
Script to convert all the pdf documents to markdown format in azure.
"""
import logging
import time
from pathlib import Path
import os
import yaml
from azureml.fsspec import AzureMachineLearningFileSystem
import shutil
from concurrent.futures import ThreadPoolExecutor
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode
from huggingface_hub import snapshot_download
from docling.datamodel.settings import settings
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TableFormerMode,
)
from indexing import document_indexing
from docling_utils import save_json
# Configure root logging once at import time; every module-level logging.*
# call below inherits this level and format.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
class Docling_Coversion:
    """Convert PDF files to markdown (with referenced images) via Docling.

    NOTE(review): the class name retains the original (misspelled)
    identifier so existing callers keep working.
    """

    def __init__(self, image_scale=1.0, num_threads=8):
        """Build a PDF-only ``DocumentConverter`` with OCR and table extraction.

        Args:
            image_scale: Scale factor for generated page/picture images.
            num_threads: Accelerator worker threads (default 8, matching the
                previously hard-coded value).
        """
        logging.info("Initializing Docling_Coversion with image_scale=%s", image_scale)
        accelerator_options = AcceleratorOptions(
            num_threads=num_threads, device=AcceleratorDevice.CUDA
        )
        # Turn on inline debug visualizations:
        settings.debug.visualize_layout = True
        settings.debug.visualize_ocr = True
        settings.debug.visualize_tables = True
        settings.debug.visualize_cells = True
        pipeline_options = PdfPipelineOptions(
            do_ocr=True,
            do_table_structure=True,
            images_scale=image_scale,
            generate_page_images=True,
            generate_picture_images=True,
            accelerator_options=accelerator_options,
            # OCR every full page, even where the PDF has embedded text.
            ocr_options=TesseractCliOcrOptions(force_full_page_ocr=True),
        )
        pipeline_options.table_structure_options.do_cell_matching = True
        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
        self.converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                    backend=DoclingParseV4DocumentBackend,
                )
            }
        )
        logging.info("Docling_Coversion initialized successfully.")

    def document_conversion(self, file_path):
        """Convert *file_path* and return the resulting Docling document object."""
        logging.info("Starting document conversion for file: %s", file_path)
        return self.converter.convert(Path(file_path)).document

    def save_document(self, file_path, output_dir, azure_fs):
        """Convert a PDF, save markdown + extracted JSON locally, upload to Azure.

        Args:
            file_path: Local path (str or Path) of the PDF to convert.
            output_dir: Local directory (str or Path) for intermediate output;
                deleted after a successful upload.
            azure_fs: Filesystem object used for the recursive upload
                (presumably AzureMachineLearningFileSystem — see main()).

        Errors are logged (with traceback) and swallowed so one bad PDF does
        not abort a batch run.
        """
        input_path = Path(file_path)
        # Fix: coerce to Path up front so .exists()/.is_dir() below also work
        # when a plain string is passed in.
        output_dir = Path(output_dir)
        logging.info("Processing file: %s", file_path)
        try:
            result = self.converter.convert(input_path)
            doc_name = input_path.stem
            temp_md_file_path = output_dir / f"{doc_name}-with-images.md"
            # Parent directory name is used as the document's speciality tag.
            docling_document_class = document_indexing(
                result,
                "ibm-granite/granite-embedding-125m-english",
                speciality=input_path.parent.name,
                file_name=input_path.stem,
            )
            tables_doc = docling_document_class.extract_tables()
            images_doc = docling_document_class.extract_images()
            text_doc = docling_document_class.extract_all_text()
            chunks_doc = docling_document_class.create_chunks()
            # Save the extracted data as JSON
            save_json(file_path=output_dir, category="tables", data=tables_doc)
            save_json(file_path=output_dir, category="images", data=images_doc)
            save_json(file_path=output_dir, category="text", data=text_doc)
            save_json(file_path=output_dir, category="chunks", data=chunks_doc)
            logging.info("Saved extracted data as JSON files.")
            # Save locally first
            result.document.save_as_markdown(temp_md_file_path, image_mode=ImageRefMode.REFERENCED)
            logging.info("Saved locally: %s", temp_md_file_path)
            # Upload to Azure
            azure_output_path = f"converted_docs_json/{doc_name}"
            azure_fs.upload(lpath=str(output_dir), rpath=azure_output_path, recursive=True)
            logging.info("Uploaded to Azure: %s", azure_output_path)
            # Optionally, delete the local file after upload
            if output_dir.exists() and output_dir.is_dir():
                shutil.rmtree(output_dir)
                logging.info("Deleted local directory: %s", output_dir)
        except Exception as e:
            # logging.exception preserves the traceback, unlike plain .error.
            logging.exception("Error processing file %s: %s", file_path, e)
def main(source_dir: str):
    """Download every PDF under *source_dir*, convert it, and upload results.

    Files whose output folder already exists under ``converted_docs_json/``
    in the Azure filesystem are skipped, making the run resumable.

    Args:
        source_dir: ``azureml://`` datastore URI to enumerate for PDFs.
    """
    logging.info("Starting main function with source_dir: %s", source_dir)
    # Set the local directory to save PDFs
    local_pdf_dir = Path("./local_pdfs")
    local_pdf_dir.mkdir(parents=True, exist_ok=True)  # Create the directory if it doesn't exist
    logging.info("Local PDF directory created: %s", local_pdf_dir)
    fs = AzureMachineLearningFileSystem(source_dir)
    all_pdf_files = fs.glob('**/*.pdf')
    logging.info("Found %d PDF files in source directory.", len(all_pdf_files))
    # One converter instance reused for all files (model load is expensive).
    converter = Docling_Coversion(image_scale=2)
    for file_path in all_pdf_files:
        output_dir = Path("./temp")
        output_dir.mkdir(parents=True, exist_ok=True)  # Create the directory if it doesn't exist
        logging.info("Temporary output directory created: %s", output_dir)
        file_path_ = Path(file_path)
        file_name = file_path_.name
        local_pdf_path = local_pdf_dir / file_name
        azure_output_path = f"converted_docs_json/{file_path_.stem}"
        # Check if the file already exists in Azure
        if fs.exists(azure_output_path):
            logging.info("Skipping %s, already processed.", file_name)
            continue
        # Save the PDF locally
        logging.info("Downloading file: %s", file_name)
        # Fix: stream the download in chunks instead of reading the whole
        # PDF into memory with .read().
        with fs.open(file_path, "rb") as remote_file:
            with open(local_pdf_path, "wb") as local_file:
                shutil.copyfileobj(remote_file, local_file)
        logging.info("File saved locally: %s", local_pdf_path)
        # Process the local PDF file
        logging.info("Processing: %s", file_name)
        try:
            converter.save_document(local_pdf_path, output_dir, fs)
        finally:
            # Always remove the downloaded PDF so failed conversions do not
            # leave stale files behind.
            local_pdf_path.unlink(missing_ok=True)
            logging.info("Deleted local PDF: %s", local_pdf_path)
    logging.info("Processing completed for all files.")
if __name__ == "__main__":
    logging.info("Script started.")
    # NOTE(review): the datastore URI (including subscription ID) is
    # hard-coded; consider moving it to configuration or an env variable.
    pdf_source_uri = (
        'azureml://subscriptions/485363cd-687d-4adb-a30b-35108c11d682/resourcegroups/medbot/workspaces/karthik/datastores/workspaceartifactstore/paths/UI/2025-04-11_075006_UTC/PdfFiles/'
    )
    main(source_dir=pdf_source_uri)
    logging.info("Script finished.")