Spaces:

marcosremar2
/

docker_mineru

Sleeping

App Files Files Community

marcosremar2 committed 10 days ago

Commit

3d9ca9a

1 Parent(s): 706cfdf

Update PDF to Markdown converter API

Browse files

Files changed (7) hide show

.gitattributes +1 -2
Dockerfile +21 -72
README.md +54 -55
app/main.py +131 -0
pdf_converter/__init__.py +1 -0
pdf_converter/convert_pdf_to_md.py +40 -0
requirements.txt +5 -3

.gitattributes CHANGED Viewed

@@ -25,11 +25,10 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

Dockerfile CHANGED Viewed

@@ -15,11 +15,6 @@ RUN apt-get update && \
         wget \
         git \
         libgl1 \
-        libreoffice \
-        fonts-noto-cjk \
-        fonts-wqy-zenhei \
-        fonts-wqy-microhei \
-        ttf-mscorefonts-installer \
         fontconfig \
         libglib2.0-0 \
         libxrender1 \
@@ -30,78 +25,32 @@ RUN apt-get update && \
 RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
-# Set up a non-root user
-RUN useradd -m -u 1000 user
-# Set home directory and update PATH
-ENV HOME=/home/user \
-	PATH=/home/user/.local/bin:$PATH
-# Set the working directory
-WORKDIR $HOME/app
-# Copy requirements first (as user)
-COPY --chown=user requirements.txt .
-# Install Python dependencies (as root to manage venv properly)
-# Note: Ensure PyTorch installed picks up CUDA from the base image
-RUN python3 -m venv /opt/mineru_venv && \
-    . /opt/mineru_venv/bin/activate && \
-    pip install --upgrade pip && \
-    pip install -r requirements.txt
-# Download model script and config template, set permissions (as root)
-RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O $HOME/app/download_models.py && \
-    chmod +x $HOME/app/download_models.py && \
-    wget https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json -O $HOME/app/magic-pdf.json && \
-    chown user:user $HOME/app/download_models.py $HOME/app/magic-pdf.json
-# Create output directory and set ownership (as root)
-RUN mkdir -p $HOME/app/output/images && \
-    chown -R user:user $HOME/app/output
-# Switch to non-root user
-USER user
-# Run model download script and configure magic-pdf for CUDA
-# This should update $HOME/magic-pdf.json with correct cache paths
-RUN . /opt/mineru_venv/bin/activate && \
-    python3 $HOME/app/download_models.py && \
-    cp $HOME/app/magic-pdf.json $HOME/magic-pdf.json && \
-    sed -i 's|"device": "cpu"|"device": "cuda"|g' $HOME/magic-pdf.json
-# Debug: Print the content of the user's config file
-RUN echo "--- Contents of $HOME/magic-pdf.json --- " && cat $HOME/magic-pdf.json && echo "--- End of $HOME/magic-pdf.json --- "
-# Switch back to root temporarily to manage /root and /tmp
-USER root
-# Copy the final config to /root
-RUN cp $HOME/magic-pdf.json /root/magic-pdf.json && \
-    chown root:root /root/magic-pdf.json
-# Workaround: Copy YOLO model to the hardcoded path /tmp/models/...
-# Find the actual downloaded model path in the cache (using wildcard for snapshot hash)
-# Note: This assumes the download script places the model predictably within the user's cache.
-RUN mkdir -p /tmp/models/MFD/YOLO && \
-    find $HOME/.cache/huggingface/hub -name yolo_v8_ft.pt -exec cp {} /tmp/models/MFD/YOLO/yolo_v8_ft.pt \; && \
-    chown -R user:user /tmp/models && \
-    chmod -R 755 /tmp/models
-# Debug: Print the content of the root's config file
-RUN echo "--- Contents of /root/magic-pdf.json --- " && cat /root/magic-pdf.json && echo "--- End of /root/magic-pdf.json --- "
-# Switch back to the non-root user for running the app and copying files
-USER user
-# Copy the rest of the application code as the user
-COPY --chown=user . .
-# Ensure the output dir still has correct permissions
-RUN chmod -R 755 $HOME/app/output
-# Expose the port (optional but good practice)
 EXPOSE 7860
-# Run the application
-CMD ["/opt/mineru_venv/bin/uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

         wget \
         git \
         libgl1 \
         fontconfig \
         libglib2.0-0 \
         libxrender1 \
 RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
+# Create app directory
+WORKDIR /app
+# Copy requirements first
+COPY requirements.txt .
+# Install Python dependencies
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt && \
+    pip install --no-cache-dir torch==2.0.1+cu118 torchvision==0.15.2+cu118 --extra-index-url https://download.pytorch.org/whl/cu118
+# Create output directories
+RUN mkdir -p /app/output/images && \
+    chmod -R 777 /app/output
+# Copy application code
+COPY app/ /app/app/
+COPY pdf_converter/ /app/pdf_converter/
+COPY app.py .
+# Set environment variables for GPU
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+# Expose port 7860 for Hugging Face Spaces
 EXPOSE 7860
+# Command to run the application on port 7860
+CMD ["python3", "app.py"]

README.md CHANGED Viewed

@@ -1,96 +1,95 @@
 ---
-title: MinerU PDF Processor
 emoji: 📄
 colorFrom: blue
 colorTo: indigo
 sdk: docker
 pinned: false
-license: apache-2.0
 app_port: 7860
 ---
-# MinerU PDF API
-A simple API for extracting text and tables from PDF documents using MinerU's magic-pdf library.
 ## Features
-- Extract text from PDF documents
-- Identify and extract tables from PDFs
-- Works with both regular and scanned PDFs
-- Simple JSON response format
-## API Endpoints
-### Health Check
-```
-GET /health
-```
-Returns the current status of the service.
-### Extract PDF Content
-```
-POST /extract
 ```
-Upload a PDF file to extract its text and tables.
-#### Request
-- `file`: The PDF file to process (multipart/form-data)
-#### Response
-JSON object containing:
-- `filename`: Original filename
-- `pages`: Array of pages with text and tables
-## Deployment
-This application is deployed as a Hugging Face Space using Docker.
-## Local Development
-To run this application locally:
-1. Install the requirements:
-   ```
-   pip install -r requirements.txt
-   ```
-2. Run the application:
-   ```
-   python app.py
-   ```
-3. Access the API at `http://localhost:7860`
-## Docker
-You can also build and run with Docker:
 ```bash
-docker build -t mineru-pdf-api .
-docker run -p 7860:7860 mineru-pdf-api
 ```
-## About
-This API is built on top of MinerU and magic-pdf, a powerful PDF extraction tool.
-## API Documentation
-Once deployed, you can access the auto-generated Swagger documentation at:
-```
-https://marcosremar2-docker-mineru.hf.space/docs
-```
-For ReDoc documentation:
-```
-https://marcosremar2-docker-mineru.hf.space/redoc
-```

 ---
+title: PDF to Markdown Converter
 emoji: 📄
 colorFrom: blue
 colorTo: indigo
 sdk: docker
 pinned: false
 app_port: 7860
 ---
+# PDF to Markdown Converter API
+A FastAPI-based service that converts PDF documents to Markdown format using the [marker](https://github.com/VikParuchuri/marker) library.
 ## Features
+- Convert PDF files to Markdown format
+- GPU-accelerated processing with CUDA support
+- Simple RESTful API
+- Docker containerization
+## Setup and Installation
+### Prerequisites
+- Docker
+- Docker Compose
+- NVIDIA Container Toolkit (for GPU support)
+### Building and Running the Container
+1. Clone this repository:
+```bash
+git clone <repository-url>
+cd docker_mineru
 ```
+2. Build and start the container:
+```bash
+docker-compose up -d
+```
+3. The API will be available at: `http://localhost:7860`
+## API Usage
+### Health Check
+```
+GET /health
+```
+Returns the current status of the service and whether CUDA is available.
+### Convert PDF to Markdown
+```
+POST /convert
+```
+Upload a PDF file to convert it to Markdown.
+#### Example cURL request:
 ```bash
+curl -X POST "http://localhost:7860/convert" \
+  -H "accept: application/json" \
+  -H "Content-Type: multipart/form-data" \
+  -F "file=@your_file.pdf"
 ```
+#### Response:
+```json
+{
+  "filename": "your_file.pdf",
+  "status": "success",
+  "markdown_content": "# Your PDF content in Markdown...",
+  "output_file": "/output/your_file.md"
+}
+```
+## Accessing the API Documentation
+Once the API is running, you can access the following:
+- Swagger UI: `http://localhost:7860/docs`
+- ReDoc: `http://localhost:7860/redoc`
+## Hugging Face Spaces Deployment
+This application is also deployed on Hugging Face Spaces. You can access it at:
+[https://huggingface.co/spaces/marcosremar2/docker_mineru](https://huggingface.co/spaces/marcosremar2/docker_mineru)

app/main.py ADDED Viewed

	@@ -0,0 +1,131 @@

+from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi.responses import JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+import tempfile
+import os
+import sys
+import traceback
+from datetime import datetime
+from typing import Dict, Any
+import shutil
+import torch
+# Make the repository root importable so the sibling `pdf_converter` package resolves
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from pdf_converter import convert_pdf_to_md
+# Output locations; the /convert endpoint writes Markdown files into output_dir
+output_dir = "/app/output"
+images_dir = os.path.join(output_dir, "images")
+# Creating images_dir also creates output_dir, its parent
+os.makedirs(images_dir, exist_ok=True)
+# Application metadata
+app_description = """
+# PDF to Markdown Converter API
+This API provides PDF processing capabilities to convert PDF documents to Markdown format using marker.
+## Features:
+- PDF to Markdown conversion using marker
+- Simple API interface
+"""
+app = FastAPI(
+    title="PDF to Markdown API",
+    description=app_description,
+    version="1.0.0",
+)
+# Allow cross-origin requests from any origin. Credentials stay disabled because
+# the CORS spec does not permit a wildcard origin on credentialed requests, and
+# the endpoints in this file use no cookies or auth headers.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Allow all origins
+    allow_credentials=False,
+    allow_methods=["*"],  # Allow all methods
+    allow_headers=["*"],  # Allow all headers
+)
+# Serve everything under the output directory as static files at /output
+app.mount("/output", StaticFiles(directory=output_dir), name="output")
+# Liveness probe
+@app.get("/health", tags=["Health"])
+async def health_check() -> Dict[str, Any]:
+    """
+    Report that the service is up.
+    Returns a dict with a fixed "healthy" status, the current local time in
+    ISO 8601 form, the service name, and whether torch can see a CUDA device.
+    """
+    report: Dict[str, Any] = {"status": "healthy"}
+    report["timestamp"] = datetime.now().isoformat()
+    report["service"] = "pdf-to-markdown-converter"
+    if torch.cuda.is_available():
+        report["gpu"] = "CUDA enabled"
+    else:
+        report["gpu"] = "CPU only"
+    return report
+@app.post("/convert", tags=["PDF Processing"])
+async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
+    """
+    Convert an uploaded PDF file to Markdown using marker.
+    Parameters:
+        file: The uploaded PDF; its filename must end in ".pdf" (case-insensitive)
+    Returns:
+        On success, a JSON object with the original filename, a "success" status,
+        the Markdown text, and the URL path of the saved ".md" file under /output.
+        If the conversion fails, a JSON response with status 500 describing the error.
+    Raises:
+        HTTPException: status 400 when the filename is missing or is not a ".pdf" name
+    """
+    # Imported here so this handler does not rely on the module's import list
+    from fastapi.concurrency import run_in_threadpool
+    if not file.filename or not file.filename.lower().endswith('.pdf'):
+        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")
+    content = await file.read()
+    temp_pdf_path = None
+    try:
+        # The converter takes a file path, so persist the upload to a temporary file
+        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
+            temp_pdf.write(content)
+            temp_pdf_path = temp_pdf.name
+        # basename() drops any directory components the client put in the filename
+        filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
+        output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")
+        # convert_pdf is synchronous; run it in a worker thread so the event loop
+        # keeps serving other requests (e.g. /health) during a conversion
+        md_content = await run_in_threadpool(
+            convert_pdf_to_md.convert_pdf, temp_pdf_path, output_md_file
+        )
+        return {
+            "filename": file.filename,
+            "status": "success",
+            "markdown_content": md_content,
+            "output_file": f"/output/{filename_without_ext}.md"
+        }
+    except Exception as e:
+        error_detail = str(e)
+        error_trace = traceback.format_exc()
+        # Log the error
+        print(f"Error processing PDF: {error_detail}")
+        print(error_trace)
+        return JSONResponse(
+            status_code=500,
+            content={
+                "error": "Error processing PDF",
+                "detail": error_detail,
+                "filename": file.filename
+            }
+        )
+    finally:
+        # Best-effort removal of the temporary file; a failure here must not mask the response
+        if temp_pdf_path and os.path.exists(temp_pdf_path):
+            try:
+                os.unlink(temp_pdf_path)
+            except OSError as cleanup_error:
+                print(f"Could not remove temporary file '{temp_pdf_path}': {cleanup_error}")
+if __name__ == "__main__":
+    import uvicorn
+    # Default to 7860, the port the Dockerfile exposes and the README documents;
+    # set the PORT environment variable to override it.
+    uvicorn.run("main:app", host="0.0.0.0", port=int(os.environ.get("PORT", "7860")), reload=False)

pdf_converter/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

pdf_converter/convert_pdf_to_md.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import marker
+import os
+import sys
+def convert_pdf(pdf_input_path, output_md_path=None):
+    """
+    Convert PDF file to Markdown using marker.
+    Args:
+        pdf_input_path (str): Path to the input PDF file
+        output_md_path (str, optional): Path where to save the output Markdown file. If None, markdown is only returned.
+    Returns:
+        str: The markdown text
+    Raises:
+        FileNotFoundError: If pdf_input_path does not exist
+        Exception: Any error raised by marker or while writing the output is logged to stderr and re-raised
+    """
+    # Check if the input PDF exists
+    if not os.path.exists(pdf_input_path):
+        raise FileNotFoundError(f"Input PDF file not found at '{pdf_input_path}'")
+    print(f"Starting conversion of '{pdf_input_path}'...")
+    try:
+        # marker-pdf 1.x has no top-level `marker.convert`; rendering goes through PdfConverter
+        from marker.output import text_from_rendered
+        rendered = _get_converter()(pdf_input_path)
+        # text_from_rendered returns a tuple whose first item is the text; extracted images are not saved here
+        markdown_text = text_from_rendered(rendered)[0]
+        # If output path is provided, save the markdown
+        if output_md_path:
+            output_dir = os.path.dirname(output_md_path)
+            if output_dir:
+                os.makedirs(output_dir, exist_ok=True)
+            with open(output_md_path, "w", encoding="utf-8") as f:
+                f.write(markdown_text)
+            print(f"Successfully saved markdown to '{output_md_path}'")
+        return markdown_text
+    except Exception as e:
+        print(f"An error occurred during conversion: {e}", file=sys.stderr)
+        raise
+def _get_converter():
+    """
+    Return a marker PdfConverter, building it on first use.
+    The instance is stored on the function object so later conversions reuse it
+    instead of constructing a new converter on every call.
+    """
+    converter = getattr(_get_converter, "_instance", None)
+    if converter is None:
+        from marker.converters.pdf import PdfConverter
+        from marker.models import create_model_dict
+        # NOTE(review): confirm a single PdfConverter may be shared by concurrent callers
+        converter = PdfConverter(artifact_dict=create_model_dict())
+        _get_converter._instance = converter
+    return converter

requirements.txt CHANGED Viewed

@@ -1,5 +1,7 @@
-fastapi==0.100.0
 uvicorn==0.23.2
-magic-pdf[full]==1.3.10
 python-multipart==0.0.6
-requests>=2.32.3

+fastapi==0.104.1
 uvicorn==0.23.2
 python-multipart==0.0.6
+marker-pdf==1.2.4
+torch==2.0.1
+torchvision==0.15.2
+torchaudio==2.0.2