marcosremar2 committed on
Commit
3d9ca9a
·
1 Parent(s): 706cfdf

Update PDF to Markdown converter API

Browse files
.gitattributes CHANGED
@@ -25,11 +25,10 @@
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
  *.wasm filter=lfs diff=lfs merge=lfs -text
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
29
  *.tgz filter=lfs diff=lfs merge=lfs -text
30
  *.wasm filter=lfs diff=lfs merge=lfs -text
31
  *.xz filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -15,11 +15,6 @@ RUN apt-get update && \
15
  wget \
16
  git \
17
  libgl1 \
18
- libreoffice \
19
- fonts-noto-cjk \
20
- fonts-wqy-zenhei \
21
- fonts-wqy-microhei \
22
- ttf-mscorefonts-installer \
23
  fontconfig \
24
  libglib2.0-0 \
25
  libxrender1 \
@@ -30,78 +25,32 @@ RUN apt-get update && \
30
 
31
  RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
32
 
33
- # Set up a non-root user
34
- RUN useradd -m -u 1000 user
35
 
36
- # Set home directory and update PATH
37
- ENV HOME=/home/user \
38
- PATH=/home/user/.local/bin:$PATH
39
 
40
- # Set the working directory
41
- WORKDIR $HOME/app
 
 
42
 
43
- # Copy requirements first (as user)
44
- COPY --chown=user requirements.txt .
 
45
 
46
- # Install Python dependencies (as root to manage venv properly)
47
- # Note: Ensure PyTorch installed picks up CUDA from the base image
48
- RUN python3 -m venv /opt/mineru_venv && \
49
- . /opt/mineru_venv/bin/activate && \
50
- pip install --upgrade pip && \
51
- pip install -r requirements.txt
52
 
53
- # Download model script and config template, set permissions (as root)
54
- RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O $HOME/app/download_models.py && \
55
- chmod +x $HOME/app/download_models.py && \
56
- wget https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json -O $HOME/app/magic-pdf.json && \
57
- chown user:user $HOME/app/download_models.py $HOME/app/magic-pdf.json
58
 
59
- # Create output directory and set ownership (as root)
60
- RUN mkdir -p $HOME/app/output/images && \
61
- chown -R user:user $HOME/app/output
62
-
63
- # Switch to non-root user
64
- USER user
65
-
66
- # Run model download script and configure magic-pdf for CUDA
67
- # This should update $HOME/magic-pdf.json with correct cache paths
68
- RUN . /opt/mineru_venv/bin/activate && \
69
- python3 $HOME/app/download_models.py && \
70
- cp $HOME/app/magic-pdf.json $HOME/magic-pdf.json && \
71
- sed -i 's|"device": "cpu"|"device": "cuda"|g' $HOME/magic-pdf.json
72
-
73
- # Debug: Print the content of the user's config file
74
- RUN echo "--- Contents of $HOME/magic-pdf.json --- " && cat $HOME/magic-pdf.json && echo "--- End of $HOME/magic-pdf.json --- "
75
-
76
- # Switch back to root temporarily to manage /root and /tmp
77
- USER root
78
-
79
- # Copy the final config to /root
80
- RUN cp $HOME/magic-pdf.json /root/magic-pdf.json && \
81
- chown root:root /root/magic-pdf.json
82
-
83
- # Workaround: Copy YOLO model to the hardcoded path /tmp/models/...
84
- # Find the actual downloaded model path in the cache (using wildcard for snapshot hash)
85
- # Note: This assumes the download script places the model predictably within the user's cache.
86
- RUN mkdir -p /tmp/models/MFD/YOLO && \
87
- find $HOME/.cache/huggingface/hub -name yolo_v8_ft.pt -exec cp {} /tmp/models/MFD/YOLO/yolo_v8_ft.pt \; && \
88
- chown -R user:user /tmp/models && \
89
- chmod -R 755 /tmp/models
90
-
91
- # Debug: Print the content of the root's config file
92
- RUN echo "--- Contents of /root/magic-pdf.json --- " && cat /root/magic-pdf.json && echo "--- End of /root/magic-pdf.json --- "
93
-
94
- # Switch back to the non-root user for running the app and copying files
95
- USER user
96
-
97
- # Copy the rest of the application code as the user
98
- COPY --chown=user . .
99
-
100
- # Ensure the output dir still has correct permissions
101
- RUN chmod -R 755 $HOME/app/output
102
-
103
- # Expose the port (optional but good practice)
104
  EXPOSE 7860
105
 
106
- # Run the application
107
- CMD ["/opt/mineru_venv/bin/uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
15
  wget \
16
  git \
17
  libgl1 \
 
 
 
 
 
18
  fontconfig \
19
  libglib2.0-0 \
20
  libxrender1 \
 
25
 
26
  RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
27
 
28
+ # Create app directory
29
+ WORKDIR /app
30
 
31
+ # Copy requirements first
32
+ COPY requirements.txt .
 
33
 
34
+ # Install Python dependencies
35
+ RUN pip install --no-cache-dir --upgrade pip && \
36
+ pip install --no-cache-dir -r requirements.txt && \
37
+ pip install --no-cache-dir torch==2.0.1+cu118 torchvision==0.15.2+cu118 --extra-index-url https://download.pytorch.org/whl/cu118
38
 
39
+ # Create output directories
40
+ RUN mkdir -p /app/output/images && \
41
+ chmod -R 777 /app/output
42
 
43
+ # Copy application code
44
+ COPY app/ /app/app/
45
+ COPY pdf_converter/ /app/pdf_converter/
46
+ COPY app.py .
 
 
47
 
48
+ # Set environment variables for GPU
49
+ ENV NVIDIA_VISIBLE_DEVICES=all
50
+ ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 
 
51
 
52
+ # Expose port 7860 for Hugging Face Spaces
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  EXPOSE 7860
54
 
55
+ # Command to run the application on port 7860
56
+ CMD ["python3", "app.py"]
README.md CHANGED
@@ -1,96 +1,95 @@
1
  ---
2
- title: MinerU PDF Processor
3
  emoji: 📄
4
  colorFrom: blue
5
  colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
- license: apache-2.0
9
  app_port: 7860
10
  ---
11
 
12
- # MinerU PDF API
13
 
14
- A simple API for extracting text and tables from PDF documents using MinerU's magic-pdf library.
15
 
16
  ## Features
17
 
18
- - Extract text from PDF documents
19
- - Identify and extract tables from PDFs
20
- - Works with both regular and scanned PDFs
21
- - Simple JSON response format
22
 
23
- ## API Endpoints
24
 
25
- ### Health Check
26
 
27
- ```
28
- GET /health
29
- ```
30
 
31
- Returns the current status of the service.
32
 
33
- ### Extract PDF Content
34
 
35
- ```
36
- POST /extract
 
37
  ```
38
 
39
- Upload a PDF file to extract its text and tables.
40
 
41
- #### Request
42
-
43
- - `file`: The PDF file to process (multipart/form-data)
44
-
45
- #### Response
46
-
47
- JSON object containing:
48
- - `filename`: Original filename
49
- - `pages`: Array of pages with text and tables
50
 
51
- ## Deployment
52
 
53
- This application is deployed as a Hugging Face Space using Docker.
54
 
55
- ## Local Development
56
 
57
- To run this application locally:
 
 
58
 
59
- 1. Install the requirements:
60
- ```
61
- pip install -r requirements.txt
62
- ```
63
 
64
- 2. Run the application:
65
- ```
66
- python app.py
67
- ```
68
 
69
- 3. Access the API at `http://localhost:7860`
 
 
70
 
71
- ## Docker
72
 
73
- You can also build and run with Docker:
74
 
75
  ```bash
76
- docker build -t mineru-pdf-api .
77
- docker run -p 7860:7860 mineru-pdf-api
 
 
78
  ```
79
 
80
- ## About
81
 
82
- This API is built on top of MinerU and magic-pdf, a powerful PDF extraction tool.
 
 
 
 
 
 
 
83
 
84
- ## API Documentation
85
 
86
- Once deployed, you can access the auto-generated Swagger documentation at:
87
 
88
- ```
89
- https://marcosremar2-docker-mineru.hf.space/docs
90
- ```
91
 
92
- For ReDoc documentation:
93
 
94
- ```
95
- https://marcosremar2-docker-mineru.hf.space/redoc
96
- ```
 
1
  ---
2
+ title: PDF to Markdown Converter
3
  emoji: 📄
4
  colorFrom: blue
5
  colorTo: indigo
6
  sdk: docker
7
  pinned: false
 
8
  app_port: 7860
9
  ---
10
 
11
+ # PDF to Markdown Converter API
12
 
13
+ A FastAPI-based service that converts PDF documents to Markdown format using the [marker](https://github.com/VikParuchuri/marker) library.
14
 
15
  ## Features
16
 
17
+ - Convert PDF files to Markdown format
18
+ - GPU-accelerated processing with CUDA support
19
+ - Simple RESTful API
20
+ - Docker containerization
21
 
22
+ ## Setup and Installation
23
 
24
+ ### Prerequisites
25
 
26
+ - Docker
27
+ - Docker Compose
28
+ - NVIDIA Container Toolkit (for GPU support)
29
 
30
+ ### Building and Running the Container
31
 
32
+ 1. Clone this repository:
33
 
34
+ ```bash
35
+ git clone <repository-url>
36
+ cd docker_mineru
37
  ```
38
 
39
+ 2. Build and start the container:
40
 
41
+ ```bash
42
+ docker-compose up -d
43
+ ```
 
 
 
 
 
 
44
 
45
+ 3. The API will be available at: `http://localhost:7860`
46
 
47
+ ## API Usage
48
 
49
+ ### Health Check
50
 
51
+ ```
52
+ GET /health
53
+ ```
54
 
55
+ Returns the current status of the service and whether CUDA is available.
 
 
 
56
 
57
+ ### Convert PDF to Markdown
 
 
 
58
 
59
+ ```
60
+ POST /convert
61
+ ```
62
 
63
+ Upload a PDF file to convert it to Markdown.
64
 
65
+ #### Example cURL request:
66
 
67
  ```bash
68
+ curl -X POST "http://localhost:7860/convert" \
69
+ -H "accept: application/json" \
70
+ -H "Content-Type: multipart/form-data" \
71
+ -F "file=@your_file.pdf"
72
  ```
73
 
74
+ #### Response:
75
 
76
+ ```json
77
+ {
78
+ "filename": "your_file.pdf",
79
+ "status": "success",
80
+ "markdown_content": "# Your PDF content in Markdown...",
81
+ "output_file": "/output/your_file.md"
82
+ }
83
+ ```
84
 
85
+ ## Accessing the API Documentation
86
 
87
+ Once the API is running, you can access the following:
88
 
89
+ - Swagger UI: `http://localhost:7860/docs`
90
+ - ReDoc: `http://localhost:7860/redoc`
 
91
 
92
+ ## Hugging Face Spaces Deployment
93
 
94
+ This application is also deployed on Hugging Face Spaces. You can access it at:
95
+ [https://huggingface.co/spaces/marcosremar2/docker_mineru](https://huggingface.co/spaces/marcosremar2/docker_mineru)
 
app/main.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, File, HTTPException
2
+ from fastapi.responses import JSONResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from fastapi.staticfiles import StaticFiles
5
+ import tempfile
6
+ import os
7
+ import sys
8
+ import traceback
9
+ from datetime import datetime
10
+ from typing import Dict, Any
11
+ import shutil
12
+ import torch
13
+
14
+ # Add the parent directory to sys.path to import convert_pdf_to_md
15
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
+ from pdf_converter import convert_pdf_to_md
17
+
18
+ # Create output directory if it doesn't exist
19
# Filesystem locations for generated artifacts. They are created at import
# time so the static mount below always has an existing directory to serve.
output_dir = "/app/output"
images_dir = os.path.join(output_dir, "images")
os.makedirs(output_dir, exist_ok=True)
os.makedirs(images_dir, exist_ok=True)

# Application metadata (rendered as Markdown on the auto-generated docs pages)
app_description = """
# PDF to Markdown Converter API

This API provides PDF processing capabilities to convert PDF documents to Markdown format using marker.

## Features:
- PDF to Markdown conversion using marker
- Simple API interface
"""

app = FastAPI(
    title="PDF to Markdown API",
    description=app_description,
    version="1.0.0",
)

# Allow cross-origin requests from any origin.
# Credentials are deliberately disabled: a wildcard origin combined with
# allow_credentials=True is not a valid CORS configuration and would make the
# middleware reflect arbitrary origins with credentialed access. The endpoints
# in this module do not read cookies or auth headers, so nothing is lost.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=False,
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)

# Expose generated Markdown files (and images) under the /output URL prefix
app.mount("/output", StaticFiles(directory=output_dir), name="output")
52
+
53
+ # Health check endpoint
54
@app.get("/health", tags=["Health"])
async def health_check() -> Dict[str, Any]:
    """
    Report that the service is up.

    Returns:
        A dict with four keys: ``status`` (always ``"healthy"``),
        ``timestamp`` (``datetime.now()`` in ISO 8601 form), ``service``
        (the fixed service identifier) and ``gpu`` (``"CUDA enabled"`` when
        ``torch.cuda.is_available()`` is true, otherwise ``"CPU only"``).
    """
    report: Dict[str, Any] = {"status": "healthy"}
    report["timestamp"] = datetime.now().isoformat()
    report["service"] = "pdf-to-markdown-converter"

    if torch.cuda.is_available():
        report["gpu"] = "CUDA enabled"
    else:
        report["gpu"] = "CPU only"

    return report
66
+
67
@app.post("/convert", tags=["PDF Processing"])
async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
    """
    Convert an uploaded PDF file to Markdown using marker.

    The upload is written to a temporary ``.pdf`` file, converted, and the
    resulting Markdown is both returned in the response and saved under
    ``/app/output/<name>.md`` (served at ``/output/<name>.md``).

    Parameters:
        file: The PDF file to process (multipart/form-data upload).

    Returns:
        On success, a dict with ``filename``, ``status``, ``markdown_content``
        and ``output_file``. On a conversion failure, a ``JSONResponse`` with
        status 500 carrying ``error``, ``detail`` and ``filename``.

    Raises:
        HTTPException: 400 when no filename is supplied or it does not end
            with ``.pdf`` (case-insensitive).
    """
    # Imported here so this handler stays self-contained; fastapi is already
    # a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")

    content = await file.read()
    temp_pdf_path = None

    try:
        # Save the uploaded PDF to a temporary file; delete=False because the
        # converter reopens it by path after this handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            temp_pdf.write(content)
            temp_pdf_path = temp_pdf.name

        # basename() drops any directory components sent by the client so the
        # output always lands directly inside /app/output.
        filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
        output_md_file = f"/app/output/{filename_without_ext}.md"

        # The conversion is synchronous and long-running. Running it in the
        # worker thread pool keeps the event loop free, so other requests
        # (e.g. /health) are still answered while a PDF is being processed.
        md_content = await run_in_threadpool(
            convert_pdf_to_md.convert_pdf, temp_pdf_path, output_md_file
        )

        return {
            "filename": file.filename,
            "status": "success",
            "markdown_content": md_content,
            "output_file": f"/output/{filename_without_ext}.md"
        }

    except Exception as e:
        error_detail = str(e)
        error_trace = traceback.format_exc()

        # Log the error
        print(f"Error processing PDF: {error_detail}")
        print(error_trace)

        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": error_detail,
                "filename": file.filename
            }
        )

    finally:
        # Best-effort removal of the temporary file: never fail the request
        # because of cleanup, but do not hide unexpected problems either.
        if temp_pdf_path:
            try:
                os.unlink(temp_pdf_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                print(f"Could not remove temporary file '{temp_pdf_path}': {cleanup_error}")
128
+
129
if __name__ == "__main__":
    import uvicorn

    # Default to 7860, the port the Dockerfile exposes and the README
    # documents; the PORT environment variable overrides it for local runs.
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=int(os.environ.get("PORT", "7860")),
        reload=False,
    )
pdf_converter/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
pdf_converter/convert_pdf_to_md.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import marker
2
+ import os
3
+ import sys
4
+
5
# Lazily created marker converter, shared by all calls in this process.
_MARKER_CONVERTER = None


def _get_marker_converter():
    """Return the shared marker PdfConverter, building it on first use."""
    global _MARKER_CONVERTER
    if _MARKER_CONVERTER is None:
        # Imported lazily: building the model dictionary is expensive, so it
        # is deferred until a conversion is actually requested and then
        # reused instead of being repeated for every PDF.
        from marker.converters.pdf import PdfConverter
        from marker.models import create_model_dict

        _MARKER_CONVERTER = PdfConverter(artifact_dict=create_model_dict())
    return _MARKER_CONVERTER


def convert_pdf(pdf_input_path, output_md_path=None):
    """
    Convert PDF file to Markdown using marker.

    Args:
        pdf_input_path (str): Path to the input PDF file
        output_md_path (str, optional): Path where to save the output Markdown file. If None, markdown is only returned.

    Returns:
        str: The markdown text

    Raises:
        FileNotFoundError: If ``pdf_input_path`` does not exist.
        Exception: Any error raised during conversion or while writing the
            output file is reported on stderr and re-raised unchanged.
    """
    # Check if the input PDF exists
    if not os.path.exists(pdf_input_path):
        raise FileNotFoundError(f"Input PDF file not found at '{pdf_input_path}'")

    print(f"Starting conversion of '{pdf_input_path}'...")

    try:
        from marker.output import text_from_rendered

        # marker exposes no top-level ``convert`` function; the supported
        # flow is to call a PdfConverter and extract text from its result.
        rendered = _get_marker_converter()(pdf_input_path)
        # The first element of the returned tuple is the rendered text.
        markdown_text = text_from_rendered(rendered)[0]

        # If output path is provided, save the markdown
        if output_md_path:
            output_dir = os.path.dirname(output_md_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            with open(output_md_path, "w", encoding="utf-8") as f:
                f.write(markdown_text)
            print(f"Successfully saved markdown to '{output_md_path}'")

        return markdown_text

    except Exception as e:
        print(f"An error occurred during conversion: {e}", file=sys.stderr)
        raise
requirements.txt CHANGED
@@ -1,5 +1,7 @@
1
- fastapi==0.100.0
2
  uvicorn==0.23.2
3
- magic-pdf[full]==1.3.10
4
  python-multipart==0.0.6
5
- requests>=2.32.3
 
 
 
 
1
+ fastapi==0.104.1
2
  uvicorn==0.23.2
 
3
  python-multipart==0.0.6
4
+ marker-pdf==1.2.4
5
+ torch==2.0.1
6
+ torchvision==0.15.2
7
+ torchaudio==2.0.2