Spaces:
Running
on
Zero
Running
on
Zero
from langchain_docling import DoclingLoader | |
from langchain_docling.loader import ExportType | |
# Import required classes for building a custom converter | |
from docling.document_converter import DocumentConverter, PdfFormatOption, InputFormat | |
from docling.datamodel.pipeline_options import PdfPipelineOptions | |
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend | |
import spaces | |
def convert_to_markdown(
    file_objs, url, do_ocr: bool, do_table_structure: bool
) -> str:
    """Convert PDF input to Markdown with a custom Docling converter.

    Args:
        file_objs: Uploaded input handed to ``DoclingLoader`` as ``file_path``.
            Presumably a path or list of paths from a UI file widget --
            TODO confirm against the caller.
        url: Fallback source used when ``file_objs`` is missing or empty.
        do_ocr: Value assigned to ``PdfPipelineOptions.do_ocr``.
        do_table_structure: Value assigned to
            ``PdfPipelineOptions.do_table_structure``.

    Returns:
        The Markdown ``page_content`` of every loaded document, separated by
        blank lines. With a single input this is just that document's text.

    Raises:
        ValueError: If neither ``file_objs`` nor ``url`` carries a value.
    """
    # Prefer the upload; an empty upload (None, "", []) must not shadow a
    # valid URL, so fall back on truthiness rather than only on None.
    file_path = file_objs or url
    if not file_path:
        # Fail early with a clear message instead of an obscure loader error
        # or an IndexError on an empty result list.
        raise ValueError("Provide either an uploaded file or a URL to convert.")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = do_ocr
    pipeline_options.do_table_structure = do_table_structure

    pdf_format_options = PdfFormatOption(
        pipeline_options=pipeline_options,
        backend=PyPdfiumDocumentBackend,
    )
    doc_converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF],
        format_options={
            InputFormat.PDF: pdf_format_options,
        },
    )

    # Pass the custom converter to the DoclingLoader.
    loader = DoclingLoader(
        file_path=file_path,
        export_type=ExportType.MARKDOWN,
        converter=doc_converter,
    )
    docs = loader.load()

    # Keep the content of every loaded document instead of silently dropping
    # everything after the first one.
    return "\n\n".join(doc.page_content for doc in docs)