Spaces:

feras-vbrl
/

pdf-to-markdown-converter

Running

App Files Community

feras-vbrl committed 23 days ago

Commit

0f462f7

verified ·

1 Parent(s): 0bb7f78

Upload 5 files

Browse files

Files changed (5) hide show

README.md +26 -13
Spacefile +10 -0
app.py +81 -446
requirements.txt +3 -8
runtime.txt +1 -0

README.md CHANGED Viewed

@@ -1,13 +1,26 @@
----
-title: Pdf To Markdown Converter
-emoji: 🌍
-colorFrom: yellow
-colorTo: yellow
-sdk: streamlit
-sdk_version: 1.44.1
-app_file: app.py
-pinned: false
-short_description: streamlit SmolDoc
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# PDF to Markdown Converter
+This application converts PDF documents to Markdown format. It uses the `docling` library for document conversion and provides a simple Streamlit interface.
+## Features
+- Upload PDF files directly
+- Convert PDFs from URLs
+- Download the resulting Markdown file
+- Clean, user-friendly interface
+## How to Use
+1. Upload a PDF file using the file uploader or enter a URL to a PDF document
+2. Click the "Convert to Markdown" button
+3. Once conversion is complete, download the Markdown file
+## Technical Details
+Built with:
+- Streamlit 1.29.0
+- Docling 2.7.0
+## Deployment
+This application is deployed on Hugging Face Spaces.

Spacefile ADDED Viewed

@@ -0,0 +1,10 @@

+# Spacefile Docs: https://huggingface.co/docs/hub/spaces-config-reference
+title: PDF to Markdown Converter
+emoji: 📄
+colorFrom: blue
+colorTo: green
+sdk: streamlit
+sdk_version: 1.29.0
+app_file: app.py
+pinned: false
+python_version: 3.12

app.py CHANGED Viewed

@@ -1,302 +1,12 @@
 import streamlit as st
 import tempfile
 import os
-import time
 import logging
-import sys
-from io import BytesIO
-from pathlib import Path
-from urllib.parse import urlparse
-import requests
-from PIL import Image
-import fitz  # PyMuPDF for PDF processing
-# Set environment variables for vLLM
-os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
-os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
-os.environ["VLLM_USE_CUDA"] = os.environ.get("VLLM_USE_CUDA", "1")
-try:
-    from vllm import LLM, SamplingParams
-    from docling_core.types.doc import DoclingDocument
-    from docling_core.types.doc.document import DocTagsDocument
-    VLLM_AVAILABLE = True
-except ImportError as e:
-    VLLM_AVAILABLE = False
-    st.error(f"Error importing vLLM or docling_core: {str(e)}")
-    st.info("Falling back to standard Transformers if available.")
-    try:
-        import torch
-        from transformers import AutoProcessor, AutoModelForVision2Seq
-    except ImportError:
-        st.error("Neither vLLM nor Transformers are available. Please check your installation.")
 # Configure logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-logger.info("SmolDocling OCR App starting up...")
-# Set up cache directory
-CACHE_DIR = os.environ.get("CACHE_DIR", "/tmp/smoldocling_cache")
-os.makedirs(CACHE_DIR, exist_ok=True)
-logger.info(f"Cache directory set to: {CACHE_DIR}")
-# Custom DocumentConverter class that uses vLLM for fast inference
-class VLLMDocumentConverter:
-    def __init__(self, model_name="ds4sd/SmolDocling-256M-preview"):
-        """
-        Initialize the converter with vLLM for fast inference
-        Args:
-            model_name: The name of the model to use
-        """
-        logger.info("Loading SmolDocling model with vLLM...")
-        try:
-            # Initialize vLLM with explicit device configuration
-            self.model_path = model_name
-            # Check if CUDA is available through torch
-            cuda_available = False
-            try:
-                import torch
-                cuda_available = torch.cuda.is_available()
-                if cuda_available:
-                    logger.info(f"CUDA is available. Found {torch.cuda.device_count()} device(s).")
-                    for i in range(torch.cuda.device_count()):
-                        logger.info(f"Device {i}: {torch.cuda.get_device_name(i)}")
-                else:
-                    logger.info("CUDA is not available. Using CPU.")
-            except:
-                logger.info("Could not check CUDA availability through torch.")
-            # Print CUDA environment variables for debugging
-            for env_var in os.environ:
-                if "CUDA" in env_var or "VLLM" in env_var:
-                    logger.info(f"Environment variable: {env_var}={os.environ[env_var]}")
-            # Try multiple initialization approaches
-            initialization_methods = [
-                # Method 1: Standard GPU initialization
-                lambda: LLM(
-                    model=self.model_path,
-                    limit_mm_per_prompt={"image": 1},
-                    tensor_parallel_size=1,
-                    dtype="float16",
-                    gpu_memory_utilization=0.7
-                ),
-                # Method 2: With trust_remote_code and enforce_eager
-                lambda: LLM(
-                    model=self.model_path,
-                    limit_mm_per_prompt={"image": 1},
-                    trust_remote_code=True,
-                    enforce_eager=True,
-                    dtype="float16"
-                ),
-                # Method 3: With explicit device specification
-                lambda: LLM(
-                    model=self.model_path,
-                    limit_mm_per_prompt={"image": 1},
-                    dtype="float16",
-                    max_model_len=8192,
-                    device="cuda:0" if cuda_available else "cpu"
-                ),
-                # Method 4: CPU only as last resort
-                lambda: LLM(
-                    model=self.model_path,
-                    limit_mm_per_prompt={"image": 1},
-                    trust_remote_code=True,
-                    enforce_eager=True,
-                    cpu_only=True
-                )
-            ]
-            # Try each initialization method until one works
-            last_error = None
-            for i, init_method in enumerate(initialization_methods):
-                try:
-                    logger.info(f"Trying vLLM initialization method {i+1}/{len(initialization_methods)}")
-                    self.llm = init_method()
-                    logger.info(f"Successfully initialized vLLM with method {i+1}")
-                    break
-                except Exception as e:
-                    last_error = e
-                    logger.warning(f"Method {i+1} failed: {str(e)}")
-                    continue
-            # If all methods failed, raise the last error
-            if not hasattr(self, 'llm'):
-                logger.error("All vLLM initialization methods failed")
-                raise last_error
-            self.sampling_params = SamplingParams(
-                temperature=0.0,
-                max_tokens=8192
-            )
-            logger.info("Model loaded successfully with vLLM")
-        except Exception as e:
-            logger.error(f"Error loading model: {str(e)}")
-            # Print detailed error information
-            import traceback
-            logger.error(traceback.format_exc())
-            raise
-    def load_image_from_path(self, file_path):
-        """Load image from a path, handling both images and PDFs"""
-        logger.debug(f"Loading from path: {file_path}")
-        try:
-            # Check if it's a PDF
-            if file_path.lower().endswith('.pdf'):
-                return self.convert_pdf_to_images(file_path)
-            else:
-                # It's an image
-                pil_image = Image.open(file_path).convert("RGB")
-                logger.debug(f"Image loaded successfully: {pil_image.size}")
-                return [pil_image]  # Return as a list for consistency
-        except Exception as e:
-            logger.error(f"Error loading file: {str(e)}")
-            raise
-    def convert_pdf_to_images(self, pdf_path):
-        """Convert PDF to a list of images"""
-        logger.debug(f"Converting PDF to images: {pdf_path}")
-        try:
-            images = []
-            with fitz.open(pdf_path) as doc:
-                logger.debug(f"PDF has {len(doc)} pages")
-                for page_num, page in enumerate(doc):
-                    logger.debug(f"Processing page {page_num+1}")
-                    # Render page to an image with higher resolution
-                    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
-                    img_data = pix.tobytes("png")
-                    img = Image.open(BytesIO(img_data)).convert("RGB")
-                    images.append(img)
-            logger.debug(f"Converted {len(images)} pages to images")
-            return images
-        except Exception as e:
-            logger.error(f"Error converting PDF to images: {str(e)}")
-            raise
-    def load_image_from_url(self, url):
-        """Load image from a URL, handling both images and PDFs"""
-        logger.debug(f"Loading from URL: {url}")
-        try:
-            response = requests.get(url, stream=True, timeout=10)
-            response.raise_for_status()
-            # Check if it's a PDF
-            content_type = response.headers.get('Content-Type', '').lower()
-            if content_type == 'application/pdf' or url.lower().endswith('.pdf'):
-                # Save PDF to a temporary file
-                with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
-                    tmp_file.write(response.content)
-                    tmp_path = tmp_file.name
-                try:
-                    # Convert PDF to images
-                    images = self.convert_pdf_to_images(tmp_path)
-                    return images
-                finally:
-                    # Clean up temporary file
-                    if os.path.exists(tmp_path):
-                        os.unlink(tmp_path)
-            else:
-                # It's an image
-                pil_image = Image.open(BytesIO(response.content)).convert("RGB")
-                logger.debug(f"Image loaded successfully: {pil_image.size}")
-                return [pil_image]  # Return as a list for consistency
-        except Exception as e:
-            logger.error(f"Error loading from URL: {str(e)}")
-            raise
-    def process_images(self, images, prompt="Convert page to Docling."):
-        """Process images using vLLM and return doctags outputs"""
-        logger.debug(f"Processing {len(images)} images with prompt: {prompt}")
-        start_time = time.time()
-        all_outputs = []
-        # Create chat template
-        chat_template = f"<|im_start|>User:<image>{prompt}<end_of_utterance>\nAssistant:"
-        # Process each image
-        for i, image in enumerate(images):
-            logger.debug(f"Processing image {i+1} of {len(images)}")
-            # Prepare input for vLLM
-            llm_input = {"prompt": chat_template, "multi_modal_data": {"image": image}}
-            # Generate output
-            output = self.llm.generate([llm_input], sampling_params=self.sampling_params)[0]
-            doctags = output.outputs[0].text
-            all_outputs.append(doctags)
-            logger.debug(f"Generated doctags for image {i+1} (length: {len(doctags)})")
-        logger.debug(f"Total processing time: {time.time() - start_time:.2f} seconds")
-        return all_outputs
-    def convert_to_markdown(self, images, prompt="Convert page to Docling."):
-        """Convert images to markdown using vLLM"""
-        logger.debug(f"Converting {len(images)} images to markdown with prompt: {prompt}")
-        try:
-            # Process images
-            all_outputs = self.process_images(images, prompt)
-            # Populate document with all pages
-            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(all_outputs, images)
-            # Create a docling document
-            doc = DoclingDocument(name="ConvertedDocument")
-            doc.load_from_doctags(doctags_doc)
-            # Export as markdown
-            markdown_text = doc.export_to_markdown()
-            logger.debug(f"Combined markdown text length: {len(markdown_text)}")
-            return doc
-        except Exception as e:
-            logger.error(f"Error converting to markdown: {str(e)}")
-            raise
-    def convert(self, source, prompt="Convert page to Docling.", max_pages=None):
-        """
-        Convert a PDF/image to markdown
-        Args:
-            source: Either a path to a file or a URL
-            prompt: The prompt to use for conversion
-            max_pages: Maximum number of pages to process
-        Returns:
-            An object with a document attribute that has an export_to_markdown method
-        """
-        logger.debug(f"Converting source: {source}")
-        try:
-            # Check if source is a URL
-            if urlparse(source).scheme != "":
-                images = self.load_image_from_url(source)
-            else:
-                # Check if it's a PDF or image
-                images = self.load_image_from_path(source)
-            # Limit the number of pages if specified
-            if max_pages and max_pages < len(images):
-                logger.debug(f"Limiting processing to {max_pages} pages out of {len(images)}")
-                images = images[:max_pages]
-            # Convert to markdown
-            doc = self.convert_to_markdown(images, prompt)
-            # Return the document
-            return ConversionResult(doc)
-        except Exception as e:
-            logger.error(f"Error in convert method: {str(e)}")
-            raise
-class ConversionResult:
-    """A simple class to mimic the interface of the original DocumentConverter result"""
-    def __init__(self, document):
-        self.document = document
 # Custom CSS for better layout
 st.markdown("""
@@ -333,164 +43,89 @@ st.markdown("""
     </style>
 """, unsafe_allow_html=True)
-def main():
-    logger.info("Starting SmolDocling OCR App main function")
-    st.title("PDF to Markdown Converter")
-    st.subheader("Using SmolDocling OCR with vLLM")
-    # Add a sidebar for model and processing settings
-    st.sidebar.title("Settings")
-    # Model settings
-    st.sidebar.subheader("Model Settings")
-    model_name = st.sidebar.text_input(
-        "Model Name",
-        value="ds4sd/SmolDocling-256M-preview",
-        help="Enter the name of the model to use for PDF to Markdown conversion"
-    )
-    # Processing settings
-    st.sidebar.subheader("Processing Settings")
-    max_pages = st.sidebar.slider(
-        "Maximum Pages to Process",
-        1, 50, 10,
-        help="Limit the number of pages to process for large PDFs"
-    )
-    st.sidebar.markdown("""
-    ### About This App
-    This app uses the SmolDocling model with vLLM for fast inference to convert PDFs and images to Markdown.
-    vLLM is a high-performance library for LLM inference that can significantly speed up processing.
-    """)
-    # Create a button to reload the model if the model name changes
-    reload_model = st.sidebar.button("Reload Model")
-    # Check if vLLM is available
-    if not VLLM_AVAILABLE:
-        st.warning("vLLM is not available. The app will use standard Transformers if available, which may be slower.")
-    # Initialize or reload the converter when needed
-    if 'converter' not in st.session_state or reload_model:
         try:
-            with st.spinner(f"Loading model {model_name}... This may take a while for the first time."):
-                # Display system information for debugging
-                st.sidebar.subheader("System Information")
-                st.sidebar.info(f"Python version: {sys.version}")
-                st.sidebar.info(f"Operating system: {os.name} - {sys.platform}")
-                logger.debug(f"Creating VLLMDocumentConverter instance with model: {model_name}")
-                st.session_state.converter = VLLMDocumentConverter(model_name=model_name)
-                logger.debug("Converter successfully created")
-                st.sidebar.success(f"Model {model_name} loaded successfully!")
-        except Exception as e:
-            error_msg = str(e)
-            logger.error(f"Error creating converter: {error_msg}")
-            st.error(f"Error creating converter: {error_msg}")
-            if 'converter' not in st.session_state:
-                st.stop()
-    # Main upload area
-    uploaded_file = st.file_uploader(
-        "Upload your PDF or image file",
-        type=['pdf', 'png', 'jpg', 'jpeg'],
-        key='file_uploader',
-        help="Drag and drop or click to select a file (max 200MB)"
-    )
-    # URL input area with spacing
-    st.markdown("<br>", unsafe_allow_html=True)
-    url = st.text_input("Or enter a PDF/image URL")
-    # Prompt input
-    prompt = st.text_input("Conversion prompt (optional)", value="Convert page to Docling.")
-    # Unified convert button
-    convert_clicked = st.button("Convert to Markdown", type="primary")
-    # Process either uploaded file or URL
-    if convert_clicked:
-        if uploaded_file is not None:
-            try:
-                with st.spinner('Converting file...'):
-                    with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{uploaded_file.name.split(".")[-1]}') as tmp_file:
-                        tmp_file.write(uploaded_file.getvalue())
-                        tmp_path = tmp_file.name
-                        logger.debug(f"Temporary file created at: {tmp_path}")
-                        try:
-                            logger.debug(f"Converting file: {uploaded_file.name}")
-                            # Convert the file
-                            result = st.session_state.converter.convert(
-                                tmp_path,
-                                prompt=prompt,
-                                max_pages=max_pages
-                            )
-                            markdown_text = result.document.export_to_markdown()
-                            logger.debug(f"Markdown text length: {len(markdown_text)}")
-                            output_filename = os.path.splitext(uploaded_file.name)[0] + '.md'
-                            st.success("Conversion completed!")
-                            st.download_button(
-                                label="Download Markdown file",
-                                data=markdown_text,
-                                file_name=output_filename,
-                                mime="text/markdown"
-                            )
-                            # Display the markdown
-                            st.subheader("Preview:")
-                            st.markdown(markdown_text)
-                        except Exception as e:
-                            logger.error(f"Error converting file: {str(e)}")
-                            st.error(f"Error converting file: {str(e)}")
-                        finally:
-                            if os.path.exists(tmp_path):
-                                os.unlink(tmp_path)
-                                logger.debug("Temporary file deleted")
-            except Exception as e:
-                logger.error(f"Error processing file: {str(e)}")
-                st.error(f"Error processing file: {str(e)}")
-        elif url:
-            try:
-                with st.spinner('Converting from URL...'):
-                    logger.debug(f"Converting from URL: {url}")
-                    # Convert from URL
-                    result = st.session_state.converter.convert(
-                        url,
-                        prompt=prompt,
-                        max_pages=max_pages
-                    )
-                    markdown_text = result.document.export_to_markdown()
-                    logger.debug(f"Markdown text length: {len(markdown_text)}")
-                    output_filename = url.split('/')[-1].split('.')[0] + '.md'
-                    st.success("Conversion completed!")
-                    st.download_button(
-                        label="Download Markdown file",
-                        data=markdown_text,
-                        file_name=output_filename,
-                        mime="text/markdown"
-                    )
-                    # Display the markdown
-                    st.subheader("Preview:")
-                    st.markdown(markdown_text)
-            except Exception as e:
-                logger.error(f"Error converting from URL: {str(e)}")
-                st.error(f"Error converting from URL: {str(e)}")
-        else:
-            st.warning("Please upload a file or enter a URL first")
-if __name__ == "__main__":
-    main()

 import streamlit as st
+from docling.document_converter import DocumentConverter
 import tempfile
 import os
 import logging
 # Configure logging
+logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 # Custom CSS for better layout
 st.markdown("""
     </style>
 """, unsafe_allow_html=True)
+st.title("PDF to Markdown Converter")
+# Initialize session state if it doesn't exist
+if 'converter' not in st.session_state:
+    try:
+        st.session_state.converter = DocumentConverter()
+        logger.debug("Converter successfully created")
+    except Exception as e:
+        logger.error(f"Error creating converter: {str(e)}")
+        st.error(f"Error creating converter: {str(e)}")
+        st.stop()
+# Main upload area
+uploaded_file = st.file_uploader(
+    "Upload your PDF file",
+    type=['pdf'],
+    key='pdf_uploader',
+    help="Drag and drop or click to select a PDF file (max 200MB)"
+)
+# URL input area with spacing
+st.markdown("<br>", unsafe_allow_html=True)
+url = st.text_input("Or enter a PDF URL")
+# Unified convert button
+convert_clicked = st.button("Convert to Markdown", type="primary")
+# Process either uploaded file or URL
+if convert_clicked:
+    if uploaded_file is not None:
         try:
+            with st.spinner('Converting file...'):
+                with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+                    tmp_file.write(uploaded_file.getvalue())
+                    tmp_path = tmp_file.name
+                    logger.debug(f"Temporary file created at: {tmp_path}")
+                    try:
+                        result = st.session_state.converter.convert(tmp_path)
+                        markdown_text = result.document.export_to_markdown()
+                        output_filename = os.path.splitext(uploaded_file.name)[0] + '.md'
+                        st.success("Conversion completed!")
+                        st.download_button(
+                            label="Download Markdown file",
+                            data=markdown_text,
+                            file_name=output_filename,
+                            mime="text/markdown"
+                        )
+                    except Exception as e:
+                        logger.error(f"Error converting file: {str(e)}")
+                        st.error(f"Error converting file: {str(e)}")
+                    finally:
+                        if os.path.exists(tmp_path):
+                            os.unlink(tmp_path)
+                            logger.debug("Temporary file deleted")
+        except Exception as e:
+            logger.error(f"Error processing file: {str(e)}")
+            st.error(f"Error processing file: {str(e)}")
+    elif url:
+        try:
+            with st.spinner('Converting from URL...'):
+                logger.debug(f"Converting from URL: {url}")
+                result = st.session_state.converter.convert(url)
+                markdown_text = result.document.export_to_markdown()
+                output_filename = url.split('/')[-1].split('.')[0] + '.md'
+                st.success("Conversion completed!")
+                st.download_button(
+                    label="Download Markdown file",
+                    data=markdown_text,
+                    file_name=output_filename,
+                    mime="text/markdown"
+                )
+        except Exception as e:
+            logger.error(f"Error converting from URL: {str(e)}")
+            st.error(f"Error converting from URL: {str(e)}")
+    else:
+        st.warning("Please upload a file or enter a URL first")

requirements.txt CHANGED Viewed

@@ -1,8 +1,3 @@
-streamlit==1.32.0
-torch==2.1.2
-transformers==4.36.2
-Pillow==10.1.0
-PyMuPDF==1.23.8
-requests==2.31.0
-vllm==0.3.0
-docling_core

+streamlit==1.29.0
+docling==2.7.0
+watchdog==2.3.1

runtime.txt ADDED Viewed

@@ -0,0 +1 @@


+python-3.12