Spaces:

marcosremar2
/

docker_mineru

Sleeping

App Files Community

marcosremar2 committed on 14 days ago

Commit

a49c5dc

1 Parent(s): 5f5a1d2

Update PDF to Markdown converter API with NVIDIA L4 support

Browse files

Files changed (3) hide show

Dockerfile +35 -22
app/main.py +73 -41
pdf_converter/convert_pdf_to_md.py +47 -25

Dockerfile CHANGED Viewed

@@ -20,7 +20,9 @@ RUN apt-get update && \
         libxrender1 \
         libsm6 \
         libxext6 \
-        poppler-utils && \
     rm -rf /var/lib/apt/lists/*
 RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
@@ -29,33 +31,44 @@ RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
 RUN useradd -m -u 1000 user
 # Create necessary directories and set permissions
-RUN mkdir -p /app /app/docker_mineru /app/docker_mineru/output /app/docker_mineru/output/images && \
-    chown -R user:user /app
-# Switch to user
-USER user
-# Set home directory
-ENV HOME=/home/user \
-    PATH=/home/user/.local/bin:$PATH
-# Set working directory
-WORKDIR /app
-# Copy requirements first (with correct ownership)
-COPY --chown=user requirements.txt .
-# Install PyTorch dependencies with explicit compatible versions for NVIDIA L4
-RUN pip3 install --no-cache-dir --upgrade pip && \
-    pip3 install --no-cache-dir torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121 && \
-    pip3 install --no-cache-dir transformers==4.36.2 && \
-    pip3 install --no-cache-dir -r requirements.txt
-# Copy the rest of the application with correct ownership
-COPY --chown=user . .
-# Expose port
 EXPOSE 7860
-# Command to run the application
-CMD ["python3", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]

         libxrender1 \
         libsm6 \
         libxext6 \
+        poppler-utils \
+        libjpeg-dev \
+        libpng-dev && \
     rm -rf /var/lib/apt/lists/*
 RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
 RUN useradd -m -u 1000 user
 # Create necessary directories and set permissions
+RUN mkdir -p /app /app/docker_mineru/output/images /home/user/.cache/huggingface /home/user/.cache/torch && \
+    chown -R user:user /app /home/user
+WORKDIR /app
+# Copy requirements first
+COPY --chown=user:user requirements.txt .
+# Upgrade pip and install PyTorch dependencies first
+# Use versions compatible with CUDA 12.1 and L40S
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir \
+        torch==2.1.2 \
+        torchvision==0.16.2 \
+        torchaudio==2.1.2 \
+        --extra-index-url https://download.pytorch.org/whl/cu121
+# Install other requirements including gunicorn
+RUN pip install --no-cache-dir -r requirements.txt && \
+    pip install --no-cache-dir gunicorn
+# Copy the rest of the application code
+COPY --chown=user:user . .
+# Ensure output directory exists and has correct permissions (redundant but safe)
+RUN mkdir -p /app/docker_mineru/output/images && \
+    chown -R user:user /app/docker_mineru/output
+# Set the user
+USER user
+# Environment variables for caching (optional, might help with model downloads)
+ENV HF_HOME=/home/user/.cache/huggingface
+ENV TORCH_HOME=/home/user/.cache/torch
+# Expose the port
 EXPOSE 7860
+# Command to run the application with Gunicorn and Uvicorn workers
+# Start with 4 workers. Adjust based on monitoring L40S resources.
+CMD ["gunicorn", "-w", "4", "-k", "uvicorn.workers.UvicornWorker", "app.main:app", "--bind", "0.0.0.0:7860"]

app/main.py CHANGED Viewed

@@ -10,49 +10,76 @@ from datetime import datetime
 from typing import Dict, Any
 import shutil
 import torch
 # Add the parent directory to sys.path to import convert_pdf_to_md
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from pdf_converter import convert_pdf_to_md
 # --- Configuration for output directory ---
 # In Docker container, use /app prefix
-output_dir = "/app/docker_mineru/output"
 images_dir = os.path.join(output_dir, "images")
 # Create output directory if it doesn't exist
 os.makedirs(output_dir, exist_ok=True)
 os.makedirs(images_dir, exist_ok=True)
 # --- End Configuration ---
 # Application metadata
 app_description = """
-# PDF to Markdown Converter API
 This API provides PDF processing capabilities to convert PDF documents to Markdown format using marker.
 ## Features:
 - PDF to Markdown conversion using marker
 - Simple API interface
 """
 app = FastAPI(
     title="PDF to Markdown API",
     description=app_description,
-    version="1.0.0",
 )
-# Add CORS middleware to allow cross-origin requests
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # Allow all origins
     allow_credentials=True,
-    allow_methods=["*"],  # Allow all methods
-    allow_headers=["*"],  # Allow all headers
 )
-# Mount the output directory as static files
 app.mount("/output", StaticFiles(directory=output_dir), name="output")
 # Health check endpoint
@@ -73,75 +100,80 @@ async def health_check() -> Dict[str, Any]:
         "status": "healthy",
         "timestamp": datetime.now().isoformat(),
         "service": "pdf-to-markdown-converter",
-        "gpu": gpu_info
     }
 @app.post("/convert", tags=["PDF Processing"])
 async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
     """
-    Convert a PDF file to markdown using marker.
     Parameters:
         file: The PDF file to process
     Returns:
-        A JSON object containing the conversion result and markdown content
     """
     if not file.filename or not file.filename.lower().endswith('.pdf'):
-        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")
     content = await file.read()
     temp_pdf_path = None
     try:
-        # Save the uploaded PDF to a temporary file
-        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
             temp_pdf.write(content)
             temp_pdf_path = temp_pdf.name
-        # Get the base name of the file
         filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
-        # Use the configured output_dir
         output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")
-        # Process the PDF using marker
-        md_content = convert_pdf_to_md.convert_pdf(temp_pdf_path, output_md_file)
-        # Construct the relative path for the response
         relative_output_path = os.path.join("/output", f"{filename_without_ext}.md")
         return {
             "filename": file.filename,
             "status": "success",
-            "markdown_content": md_content,
-            "output_file": relative_output_path
         }
     except Exception as e:
         error_detail = str(e)
         error_trace = traceback.format_exc()
-        # Log the error
-        print(f"Error processing PDF: {error_detail}")
         print(error_trace)
         return JSONResponse(
-            status_code=500,
             content={
                 "error": "Error processing PDF",
                 "detail": error_detail,
                 "filename": file.filename if file and hasattr(file, 'filename') else None
             }
         )
     finally:
         # Clean up the temporary file
         if temp_pdf_path and os.path.exists(temp_pdf_path):
             try:
                 os.unlink(temp_pdf_path)
-            except Exception:
-                pass
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=False)

 from typing import Dict, Any
 import shutil
 import torch
+import asyncio
+from contextlib import asynccontextmanager
 # Add the parent directory to sys.path to import convert_pdf_to_md
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+# Import the initialization function as well
+from pdf_converter.convert_pdf_to_md import convert_pdf, initialize_converter
 # --- Configuration for output directory ---
 # In Docker container, use /app prefix
+# Adjusted path assuming the app runs from /app in Docker
+base_dir = "/app" # Use /app for Docker environment
+if not os.path.exists(base_dir):
+    # Fallback for local testing (assuming run from project root)
+    base_dir = "."
+out_sub_dir = "docker_mineru/output"
+output_dir = os.path.join(base_dir, out_sub_dir)
 images_dir = os.path.join(output_dir, "images")
 # Create output directory if it doesn't exist
 os.makedirs(output_dir, exist_ok=True)
 os.makedirs(images_dir, exist_ok=True)
+print(f"Using output directory: {output_dir}") # Add log for debugging
 # --- End Configuration ---
+# --- Lifespan management for model loading ---
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # Load the ML model during startup
+    print("Application startup: Initializing marker converter...")
+    loop = asyncio.get_event_loop()
+    # Run in executor to avoid blocking the event loop
+    await loop.run_in_executor(None, initialize_converter)
+    print("Marker converter initialization process finished.")
+    yield
+    # Clean up resources if needed during shutdown
+    print("Application shutdown.")
 # Application metadata
 app_description = """
+# PDF to Markdown Converter API (Optimized)
 This API provides PDF processing capabilities to convert PDF documents to Markdown format using marker.
+It pre-loads models for faster processing.
 ## Features:
 - PDF to Markdown conversion using marker
+- Optimized for faster startup and processing
 - Simple API interface
 """
 app = FastAPI(
     title="PDF to Markdown API",
     description=app_description,
+    version="1.1.0", # Version bump
+    lifespan=lifespan # Add the lifespan manager
 )
+# Add CORS middleware
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],
     allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
 )
+# Mount the output directory - Adjust mount path to be relative to API URL
+# We use output_dir for the actual file path, but /output for the URL path
 app.mount("/output", StaticFiles(directory=output_dir), name="output")
 # Health check endpoint
         "status": "healthy",
         "timestamp": datetime.now().isoformat(),
         "service": "pdf-to-markdown-converter",
+        "gpu": gpu_info,
+        "output_directory_used": output_dir # Add info for debugging
     }
 @app.post("/convert", tags=["PDF Processing"])
 async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
     """
+    Convert a PDF file to markdown using the pre-loaded marker converter.
     Parameters:
         file: The PDF file to process
     Returns:
+        A JSON object containing the conversion result
     """
     if not file.filename or not file.filename.lower().endswith('.pdf'):
+        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")
     content = await file.read()
     temp_pdf_path = None
     try:
+        # Use a secure temporary directory within the app's writable space
+        # In Docker, /tmp should be writable by the 'user'
+        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False, dir="/tmp") as temp_pdf:
             temp_pdf.write(content)
             temp_pdf_path = temp_pdf.name
+        print(f"Temporary PDF saved to: {temp_pdf_path}")
+        # Get the base name of the file for the output
         filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
+        # Use the configured output_dir for saving the markdown file
         output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")
+        print(f"Output markdown path: {output_md_file}")
+        # Process the PDF using the pre-loaded converter
+        md_content = convert_pdf(temp_pdf_path, output_md_file)
+        # Construct the relative path for the URL response
+        # This path should correspond to the StaticFiles mount point
         relative_output_path = os.path.join("/output", f"{filename_without_ext}.md")
         return {
             "filename": file.filename,
             "status": "success",
+            # Consider omitting full content in response for performance/size
+            "markdown_preview": md_content[:1000] + "..." if md_content else "",
+            "output_file_url": relative_output_path
         }
     except Exception as e:
         error_detail = str(e)
         error_trace = traceback.format_exc()
+        print(f"Error processing PDF '{file.filename if file else 'N/A'}': {error_detail}")
         print(error_trace)
         return JSONResponse(
+            status_code=500,
             content={
                 "error": "Error processing PDF",
                 "detail": error_detail,
                 "filename": file.filename if file and hasattr(file, 'filename') else None
             }
         )
     finally:
         # Clean up the temporary file
         if temp_pdf_path and os.path.exists(temp_pdf_path):
             try:
                 os.unlink(temp_pdf_path)
+                print(f"Temporary file {temp_pdf_path} deleted.")
+            except Exception as unlink_err:
+                print(f"Error deleting temporary file {temp_pdf_path}: {unlink_err}")
+# Remove the old __main__ block if it exists, as CMD in Dockerfile handles startup
+# if __name__ == "__main__":
+#     import uvicorn
+#     uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=False)

pdf_converter/convert_pdf_to_md.py CHANGED Viewed

@@ -4,14 +4,48 @@ import sys
 from marker.config.parser import ConfigParser
 from marker.models import create_model_dict
 def convert_pdf(pdf_input_path, output_md_path=None):
     """
-    Convert PDF file to Markdown using marker.
     Args:
         pdf_input_path (str): Path to the input PDF file
         output_md_path (str, optional): Path where to save the output Markdown file. If None, markdown is only returned.
     Returns:
         str: The markdown text
     """
@@ -19,37 +53,25 @@ def convert_pdf(pdf_input_path, output_md_path=None):
     if not os.path.exists(pdf_input_path):
         raise FileNotFoundError(f"Input PDF file not found at '{pdf_input_path}'")
-    print(f"Starting conversion of '{pdf_input_path}'...")
     try:
-        # Create configuration, explicitly setting output format
-        config_parser = ConfigParser({'output_format': 'markdown'})
-        # Load models
-        models = create_model_dict()
-        # Get converter class and create converter
-        converter_cls = config_parser.get_converter_cls()
-        converter = converter_cls(
-            config=config_parser.generate_config_dict(),
-            artifact_dict=models,
-            processor_list=config_parser.get_processors(),
-            renderer=config_parser.get_renderer(),
-            llm_service=config_parser.get_llm_service()
-        )
-        # Convert the PDF to markdown using marker
-        result = converter(pdf_input_path)
         # Access the markdown content directly from the result object
         markdown_text = result.markdown
         # If output path is provided, save the markdown
         if output_md_path:
             output_dir = os.path.dirname(output_md_path)
             if output_dir and not os.path.exists(output_dir):
                 os.makedirs(output_dir, exist_ok=True)
             with open(output_md_path, "w", encoding="utf-8") as f:
                 f.write(markdown_text)
             print(f"Successfully saved markdown to '{output_md_path}'")

 from marker.config.parser import ConfigParser
 from marker.models import create_model_dict
+# Global variable to hold the pre-loaded converter
+_converter = None
+def initialize_converter():
+    """Initializes the marker converter models and stores it globally."""
+    global _converter
+    if _converter is None:
+        print("Initializing marker models...")
+        try:
+            # Create configuration, explicitly setting output format
+            # Potential optimization: Check if batch_multiplier or similar exists
+            config_parser = ConfigParser({'output_format': 'markdown'}) # Add batch_multiplier here if applicable
+            # Load models
+            # Potential optimization: Check if device mapping/multi-GPU is possible
+            models = create_model_dict() # Add device mapping here if applicable
+            # Get converter class and create converter
+            converter_cls = config_parser.get_converter_cls()
+            _converter = converter_cls(
+                config=config_parser.generate_config_dict(),
+                artifact_dict=models,
+                processor_list=config_parser.get_processors(),
+                renderer=config_parser.get_renderer(),
+                llm_service=config_parser.get_llm_service()
+            )
+            print("Marker models initialized successfully.")
+        except Exception as e:
+            print(f"Failed to initialize marker models: {e}", file=sys.stderr)
+            _converter = None # Ensure it's None if init fails
+            raise
+    else:
+        print("Marker models already initialized.")
 def convert_pdf(pdf_input_path, output_md_path=None):
     """
+    Convert PDF file to Markdown using the pre-loaded marker converter.
     Args:
         pdf_input_path (str): Path to the input PDF file
         output_md_path (str, optional): Path where to save the output Markdown file. If None, markdown is only returned.
     Returns:
         str: The markdown text
     """
     if not os.path.exists(pdf_input_path):
         raise FileNotFoundError(f"Input PDF file not found at '{pdf_input_path}'")
+    # Check if converter is initialized
+    if _converter is None:
+         raise RuntimeError("Marker converter has not been initialized. Call initialize_converter() during application startup.")
+    print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")
     try:
+        # Convert the PDF to markdown using the pre-loaded converter
+        result = _converter(pdf_input_path)
         # Access the markdown content directly from the result object
         markdown_text = result.markdown
         # If output path is provided, save the markdown
         if output_md_path:
             output_dir = os.path.dirname(output_md_path)
             if output_dir and not os.path.exists(output_dir):
                 os.makedirs(output_dir, exist_ok=True)
             with open(output_md_path, "w", encoding="utf-8") as f:
                 f.write(markdown_text)
             print(f"Successfully saved markdown to '{output_md_path}'")