Spaces:

feras-vbrl
/

pdf-to-markdown-converter

Running

App Files Files Community

feras-vbrl committed 23 days ago

Commit

6a7a825

verified ·

1 Parent(s): b4e5504

Create app.py

Browse files

Files changed (1) hide show

app.py +393 -0

app.py ADDED Viewed

	@@ -0,0 +1,393 @@

+import streamlit as st
+import tempfile
+import os
+import time
+import logging
+from io import BytesIO
+from pathlib import Path
+from urllib.parse import urlparse
+import requests
+from PIL import Image
+import fitz  # PyMuPDF for PDF processing
+from vllm import LLM, SamplingParams
+from docling_core.types.doc import DoclingDocument
+from docling_core.types.doc.document import DocTagsDocument
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+logger.info("SmolDocling OCR App starting up...")
+# Set up cache directory
+CACHE_DIR = os.environ.get("CACHE_DIR", "/tmp/smoldocling_cache")
+os.makedirs(CACHE_DIR, exist_ok=True)
+logger.info(f"Cache directory set to: {CACHE_DIR}")
+# Custom DocumentConverter class that uses vLLM for fast inference
+class VLLMDocumentConverter:
+    def __init__(self, model_name="ds4sd/SmolDocling-256M-preview"):
+        """
+        Initialize the converter with vLLM for fast inference
+        Args:
+            model_name: The name of the model to use
+        """
+        logger.info("Loading SmolDocling model with vLLM...")
+        try:
+            # Initialize vLLM
+            self.model_path = model_name
+            self.llm = LLM(model=self.model_path, limit_mm_per_prompt={"image": 1})
+            self.sampling_params = SamplingParams(
+                temperature=0.0,
+                max_tokens=8192
+            )
+            logger.info("Model loaded successfully with vLLM")
+        except Exception as e:
+            logger.error(f"Error loading model: {str(e)}")
+            raise
+    def load_image_from_path(self, file_path):
+        """Load image from a path, handling both images and PDFs"""
+        logger.debug(f"Loading from path: {file_path}")
+        try:
+            # Check if it's a PDF
+            if file_path.lower().endswith('.pdf'):
+                return self.convert_pdf_to_images(file_path)
+            else:
+                # It's an image
+                pil_image = Image.open(file_path).convert("RGB")
+                logger.debug(f"Image loaded successfully: {pil_image.size}")
+                return [pil_image]  # Return as a list for consistency
+        except Exception as e:
+            logger.error(f"Error loading file: {str(e)}")
+            raise
+    def convert_pdf_to_images(self, pdf_path):
+        """Convert PDF to a list of images"""
+        logger.debug(f"Converting PDF to images: {pdf_path}")
+        try:
+            images = []
+            with fitz.open(pdf_path) as doc:
+                logger.debug(f"PDF has {len(doc)} pages")
+                for page_num, page in enumerate(doc):
+                    logger.debug(f"Processing page {page_num+1}")
+                    # Render page to an image with higher resolution
+                    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
+                    img_data = pix.tobytes("png")
+                    img = Image.open(BytesIO(img_data)).convert("RGB")
+                    images.append(img)
+            logger.debug(f"Converted {len(images)} pages to images")
+            return images
+        except Exception as e:
+            logger.error(f"Error converting PDF to images: {str(e)}")
+            raise
+    def load_image_from_url(self, url):
+        """Load image from a URL, handling both images and PDFs"""
+        logger.debug(f"Loading from URL: {url}")
+        try:
+            response = requests.get(url, stream=True, timeout=10)
+            response.raise_for_status()
+            # Check if it's a PDF
+            content_type = response.headers.get('Content-Type', '').lower()
+            if content_type == 'application/pdf' or url.lower().endswith('.pdf'):
+                # Save PDF to a temporary file
+                with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+                    tmp_file.write(response.content)
+                    tmp_path = tmp_file.name
+                try:
+                    # Convert PDF to images
+                    images = self.convert_pdf_to_images(tmp_path)
+                    return images
+                finally:
+                    # Clean up temporary file
+                    if os.path.exists(tmp_path):
+                        os.unlink(tmp_path)
+            else:
+                # It's an image
+                pil_image = Image.open(BytesIO(response.content)).convert("RGB")
+                logger.debug(f"Image loaded successfully: {pil_image.size}")
+                return [pil_image]  # Return as a list for consistency
+        except Exception as e:
+            logger.error(f"Error loading from URL: {str(e)}")
+            raise
+    def process_images(self, images, prompt="Convert page to Docling."):
+        """Process images using vLLM and return doctags outputs"""
+        logger.debug(f"Processing {len(images)} images with prompt: {prompt}")
+        start_time = time.time()
+        all_outputs = []
+        # Create chat template
+        chat_template = f"<|im_start|>User:<image>{prompt}<end_of_utterance>\nAssistant:"
+        # Process each image
+        for i, image in enumerate(images):
+            logger.debug(f"Processing image {i+1} of {len(images)}")
+            # Prepare input for vLLM
+            llm_input = {"prompt": chat_template, "multi_modal_data": {"image": image}}
+            # Generate output
+            output = self.llm.generate([llm_input], sampling_params=self.sampling_params)[0]
+            doctags = output.outputs[0].text
+            all_outputs.append(doctags)
+            logger.debug(f"Generated doctags for image {i+1} (length: {len(doctags)})")
+        logger.debug(f"Total processing time: {time.time() - start_time:.2f} seconds")
+        return all_outputs
+    def convert_to_markdown(self, images, prompt="Convert page to Docling."):
+        """Convert images to markdown using vLLM"""
+        logger.debug(f"Converting {len(images)} images to markdown with prompt: {prompt}")
+        try:
+            # Process images
+            all_outputs = self.process_images(images, prompt)
+            # Populate document with all pages
+            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(all_outputs, images)
+            # Create a docling document
+            doc = DoclingDocument(name="ConvertedDocument")
+            doc.load_from_doctags(doctags_doc)
+            # Export as markdown
+            markdown_text = doc.export_to_markdown()
+            logger.debug(f"Combined markdown text length: {len(markdown_text)}")
+            return doc
+        except Exception as e:
+            logger.error(f"Error converting to markdown: {str(e)}")
+            raise
+    def convert(self, source, prompt="Convert page to Docling.", max_pages=None):
+        """
+        Convert a PDF/image to markdown
+        Args:
+            source: Either a path to a file or a URL
+            prompt: The prompt to use for conversion
+            max_pages: Maximum number of pages to process
+        Returns:
+            An object with a document attribute that has an export_to_markdown method
+        """
+        logger.debug(f"Converting source: {source}")
+        try:
+            # Check if source is a URL
+            if urlparse(source).scheme != "":
+                images = self.load_image_from_url(source)
+            else:
+                # Check if it's a PDF or image
+                images = self.load_image_from_path(source)
+            # Limit the number of pages if specified
+            if max_pages and max_pages < len(images):
+                logger.debug(f"Limiting processing to {max_pages} pages out of {len(images)}")
+                images = images[:max_pages]
+            # Convert to markdown
+            doc = self.convert_to_markdown(images, prompt)
+            # Return the document
+            return ConversionResult(doc)
+        except Exception as e:
+            logger.error(f"Error in convert method: {str(e)}")
+            raise
+class ConversionResult:
+    """A simple class to mimic the interface of the original DocumentConverter result
+    Instances are returned by ``VLLMDocumentConverter.convert``; ``main`` reads
+    the wrapped document through ``result.document``.
+    """
+    def __init__(self, document) -> None:
+        # Stored as-is, without validation or copying.
+        self.document = document
+# Custom CSS for better layout
+st.markdown("""
+    <style>
+        .stFileUploader {
+            padding: 1rem;
+        }
+        button[data-testid="stFileUploaderButtonPrimary"] {
+            background-color: #000660 !important;
+            border: none !important;
+            color: white !important;
+        }
+        .stButton button {
+            background-color: #006666;
+            border: none !important;
+            color: white;
+            padding: 0.5rem 2rem !important;
+        }
+        .stButton button:hover {
+            background-color: #008080 !important;
+            color: white !important;
+            border-color: #008080 !important;
+        }
+        .upload-text {
+            font-size: 1.2rem;
+            margin-bottom: 1rem;
+        }
+        div[data-testid="stFileUploadDropzone"]:hover {
+            border-color: #006666 !important;
+            background-color: rgba(0, 102, 102, 0.05) !important;
+        }
+    </style>
+""", unsafe_allow_html=True)
+def main():
+    logger.info("Starting SmolDocling OCR App main function")
+    st.title("PDF to Markdown Converter")
+    st.subheader("Using SmolDocling OCR with vLLM")
+    # Add a sidebar for model and processing settings
+    st.sidebar.title("Settings")
+    # Model settings
+    st.sidebar.subheader("Model Settings")
+    model_name = st.sidebar.text_input(
+        "Model Name",
+        value="ds4sd/SmolDocling-256M-preview",
+        help="Enter the name of the model to use for PDF to Markdown conversion"
+    )
+    # Processing settings
+    st.sidebar.subheader("Processing Settings")
+    max_pages = st.sidebar.slider(
+        "Maximum Pages to Process",
+        1, 50, 10,
+        help="Limit the number of pages to process for large PDFs"
+    )
+    st.sidebar.markdown("""
+    ### About This App
+    This app uses the SmolDocling model with vLLM for fast inference to convert PDFs and images to Markdown.
+    vLLM is a high-performance library for LLM inference that can significantly speed up processing.
+    """)
+    # Create a button to reload the model if the model name changes
+    reload_model = st.sidebar.button("Reload Model")
+    # Initialize or reload the converter when needed
+    if 'converter' not in st.session_state or reload_model:
+        try:
+            with st.spinner(f"Loading model {model_name}... This may take a while for the first time."):
+                logger.debug(f"Creating VLLMDocumentConverter instance with model: {model_name}")
+                st.session_state.converter = VLLMDocumentConverter(model_name=model_name)
+                logger.debug("Converter successfully created")
+                st.sidebar.success(f"Model {model_name} loaded successfully!")
+        except Exception as e:
+            error_msg = str(e)
+            logger.error(f"Error creating converter: {error_msg}")
+            st.error(f"Error creating converter: {error_msg}")
+            if 'converter' not in st.session_state:
+                st.stop()
+    # Main upload area
+    uploaded_file = st.file_uploader(
+        "Upload your PDF or image file",
+        type=['pdf', 'png', 'jpg', 'jpeg'],
+        key='file_uploader',
+        help="Drag and drop or click to select a file (max 200MB)"
+    )
+    # URL input area with spacing
+    st.markdown("<br>", unsafe_allow_html=True)
+    url = st.text_input("Or enter a PDF/image URL")
+    # Prompt input
+    prompt = st.text_input("Conversion prompt (optional)", value="Convert page to Docling.")
+    # Unified convert button
+    convert_clicked = st.button("Convert to Markdown", type="primary")
+    # Process either uploaded file or URL
+    if convert_clicked:
+        if uploaded_file is not None:
+            try:
+                with st.spinner('Converting file...'):
+                    with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{uploaded_file.name.split(".")[-1]}') as tmp_file:
+                        tmp_file.write(uploaded_file.getvalue())
+                        tmp_path = tmp_file.name
+                        logger.debug(f"Temporary file created at: {tmp_path}")
+                        try:
+                            logger.debug(f"Converting file: {uploaded_file.name}")
+                            # Convert the file
+                            result = st.session_state.converter.convert(
+                                tmp_path,
+                                prompt=prompt,
+                                max_pages=max_pages
+                            )
+                            markdown_text = result.document.export_to_markdown()
+                            logger.debug(f"Markdown text length: {len(markdown_text)}")
+                            output_filename = os.path.splitext(uploaded_file.name)[0] + '.md'
+                            st.success("Conversion completed!")
+                            st.download_button(
+                                label="Download Markdown file",
+                                data=markdown_text,
+                                file_name=output_filename,
+                                mime="text/markdown"
+                            )
+                            # Display the markdown
+                            st.subheader("Preview:")
+                            st.markdown(markdown_text)
+                        except Exception as e:
+                            logger.error(f"Error converting file: {str(e)}")
+                            st.error(f"Error converting file: {str(e)}")
+                        finally:
+                            if os.path.exists(tmp_path):
+                                os.unlink(tmp_path)
+                                logger.debug("Temporary file deleted")
+            except Exception as e:
+                logger.error(f"Error processing file: {str(e)}")
+                st.error(f"Error processing file: {str(e)}")
+        elif url:
+            try:
+                with st.spinner('Converting from URL...'):
+                    logger.debug(f"Converting from URL: {url}")
+                    # Convert from URL
+                    result = st.session_state.converter.convert(
+                        url,
+                        prompt=prompt,
+                        max_pages=max_pages
+                    )
+                    markdown_text = result.document.export_to_markdown()
+                    logger.debug(f"Markdown text length: {len(markdown_text)}")
+                    output_filename = url.split('/')[-1].split('.')[0] + '.md'
+                    st.success("Conversion completed!")
+                    st.download_button(
+                        label="Download Markdown file",
+                        data=markdown_text,
+                        file_name=output_filename,
+                        mime="text/markdown"
+                    )
+                    # Display the markdown
+                    st.subheader("Preview:")
+                    st.markdown(markdown_text)
+            except Exception as e:
+                logger.error(f"Error converting from URL: {str(e)}")
+                st.error(f"Error converting from URL: {str(e)}")
+        else:
+            st.warning("Please upload a file or enter a URL first")
+# Build the UI only when this file is run as the main module, not on import.
+if __name__ == "__main__":
+    main()