Spaces:
Sleeping
Sleeping
Commit
·
a49c5dc
1
Parent(s):
5f5a1d2
Update PDF to Markdown converter API with NVIDIA L4 support
Browse files
- Dockerfile +35 -22
- app/main.py +73 -41
- pdf_converter/convert_pdf_to_md.py +47 -25
Dockerfile
CHANGED
@@ -20,7 +20,9 @@ RUN apt-get update && \
|
|
20 |
libxrender1 \
|
21 |
libsm6 \
|
22 |
libxext6 \
|
23 |
-
poppler-utils
|
|
|
|
|
24 |
rm -rf /var/lib/apt/lists/*
|
25 |
|
26 |
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
|
@@ -29,33 +31,44 @@ RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
|
|
29 |
RUN useradd -m -u 1000 user
|
30 |
|
31 |
# Create necessary directories and set permissions
|
32 |
-
RUN mkdir -p /app /app/docker_mineru /
|
33 |
-
chown -R user:user /app
|
34 |
|
35 |
-
|
36 |
-
USER user
|
37 |
|
38 |
-
#
|
39 |
-
|
40 |
-
PATH=/home/user/.local/bin:$PATH
|
41 |
|
42 |
-
#
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
-
#
|
46 |
-
|
|
|
47 |
|
48 |
-
#
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
|
|
53 |
|
54 |
-
#
|
55 |
-
|
|
|
56 |
|
57 |
-
# Expose port
|
58 |
EXPOSE 7860
|
59 |
|
60 |
-
# Command to run the application
|
61 |
-
|
|
|
|
20 |
libxrender1 \
|
21 |
libsm6 \
|
22 |
libxext6 \
|
23 |
+
poppler-utils \
|
24 |
+
libjpeg-dev \
|
25 |
+
libpng-dev && \
|
26 |
rm -rf /var/lib/apt/lists/*
|
27 |
|
28 |
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
|
|
|
31 |
RUN useradd -m -u 1000 user
|
32 |
|
33 |
# Create necessary directories and set permissions
|
34 |
+
RUN mkdir -p /app /app/docker_mineru/output/images /home/user/.cache/huggingface /home/user/.cache/torch && \
|
35 |
+
chown -R user:user /app /home/user
|
36 |
|
37 |
+
WORKDIR /app
|
|
|
38 |
|
39 |
+
# Copy requirements first
|
40 |
+
COPY --chown=user:user requirements.txt .
|
|
|
41 |
|
42 |
+
# Upgrade pip and install PyTorch dependencies first
|
43 |
+
# Use versions compatible with CUDA 12.1 and L40S
|
44 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
45 |
+
pip install --no-cache-dir \
|
46 |
+
torch==2.1.2 \
|
47 |
+
torchvision==0.16.2 \
|
48 |
+
torchaudio==2.1.2 \
|
49 |
+
--extra-index-url https://download.pytorch.org/whl/cu121
|
50 |
|
51 |
+
# Install other requirements including gunicorn
|
52 |
+
RUN pip install --no-cache-dir -r requirements.txt && \
|
53 |
+
pip install --no-cache-dir gunicorn
|
54 |
|
55 |
+
# Copy the rest of the application code
|
56 |
+
COPY --chown=user:user . .
|
57 |
+
|
58 |
+
# Ensure output directory exists and has correct permissions (redundant but safe)
|
59 |
+
RUN mkdir -p /app/docker_mineru/output/images && \
|
60 |
+
chown -R user:user /app/docker_mineru/output
|
61 |
+
|
62 |
+
# Set the user
|
63 |
+
USER user
|
64 |
|
65 |
+
# Environment variables for caching (optional, might help with model downloads)
|
66 |
+
ENV HF_HOME=/home/user/.cache/huggingface
|
67 |
+
ENV TORCH_HOME=/home/user/.cache/torch
|
68 |
|
69 |
+
# Expose the port
|
70 |
EXPOSE 7860
|
71 |
|
72 |
+
# Command to run the application with Gunicorn and Uvicorn workers
|
73 |
+
# Start with 4 workers. Adjust based on monitoring L40S resources.
|
74 |
+
CMD ["gunicorn", "-w", "4", "-k", "uvicorn.workers.UvicornWorker", "app.main:app", "--bind", "0.0.0.0:7860"]
|
app/main.py
CHANGED
@@ -10,49 +10,76 @@ from datetime import datetime
|
|
10 |
from typing import Dict, Any
|
11 |
import shutil
|
12 |
import torch
|
|
|
|
|
13 |
|
14 |
# Add the parent directory to sys.path to import convert_pdf_to_md
|
15 |
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
16 |
-
|
|
|
17 |
|
18 |
# --- Configuration for output directory ---
|
19 |
# In Docker container, use /app prefix
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
images_dir = os.path.join(output_dir, "images")
|
23 |
|
24 |
# Create output directory if it doesn't exist
|
25 |
os.makedirs(output_dir, exist_ok=True)
|
26 |
os.makedirs(images_dir, exist_ok=True)
|
|
|
27 |
# --- End Configuration ---
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
# Application metadata
|
30 |
app_description = """
|
31 |
-
# PDF to Markdown Converter API
|
32 |
|
33 |
This API provides PDF processing capabilities to convert PDF documents to Markdown format using marker.
|
|
|
34 |
|
35 |
## Features:
|
36 |
- PDF to Markdown conversion using marker
|
|
|
37 |
- Simple API interface
|
38 |
"""
|
39 |
|
40 |
app = FastAPI(
|
41 |
title="PDF to Markdown API",
|
42 |
description=app_description,
|
43 |
-
version="1.
|
|
|
44 |
)
|
45 |
|
46 |
-
# Add CORS middleware
|
47 |
app.add_middleware(
|
48 |
CORSMiddleware,
|
49 |
-
allow_origins=["*"],
|
50 |
allow_credentials=True,
|
51 |
-
allow_methods=["*"],
|
52 |
-
allow_headers=["*"],
|
53 |
)
|
54 |
|
55 |
-
# Mount the output directory
|
|
|
56 |
app.mount("/output", StaticFiles(directory=output_dir), name="output")
|
57 |
|
58 |
# Health check endpoint
|
@@ -73,75 +100,80 @@ async def health_check() -> Dict[str, Any]:
|
|
73 |
"status": "healthy",
|
74 |
"timestamp": datetime.now().isoformat(),
|
75 |
"service": "pdf-to-markdown-converter",
|
76 |
-
"gpu": gpu_info
|
|
|
77 |
}
|
78 |
|
79 |
@app.post("/convert", tags=["PDF Processing"])
|
80 |
async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
|
81 |
"""
|
82 |
-
Convert a PDF file to markdown using marker.
|
83 |
-
|
84 |
Parameters:
|
85 |
file: The PDF file to process
|
86 |
-
|
87 |
Returns:
|
88 |
-
A JSON object containing the conversion result
|
89 |
"""
|
90 |
if not file.filename or not file.filename.lower().endswith('.pdf'):
|
91 |
-
raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF
|
92 |
-
|
93 |
content = await file.read()
|
94 |
temp_pdf_path = None
|
95 |
-
|
96 |
try:
|
97 |
-
#
|
98 |
-
|
|
|
99 |
temp_pdf.write(content)
|
100 |
temp_pdf_path = temp_pdf.name
|
101 |
-
|
102 |
-
|
|
|
103 |
filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
|
104 |
-
# Use the configured output_dir
|
105 |
output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
|
|
|
|
111 |
relative_output_path = os.path.join("/output", f"{filename_without_ext}.md")
|
112 |
|
113 |
return {
|
114 |
"filename": file.filename,
|
115 |
"status": "success",
|
116 |
-
|
117 |
-
"
|
|
|
118 |
}
|
119 |
-
|
120 |
except Exception as e:
|
121 |
error_detail = str(e)
|
122 |
error_trace = traceback.format_exc()
|
123 |
-
|
124 |
-
# Log the error
|
125 |
-
print(f"Error processing PDF: {error_detail}")
|
126 |
print(error_trace)
|
127 |
-
|
128 |
return JSONResponse(
|
129 |
-
status_code=500,
|
130 |
content={
|
131 |
"error": "Error processing PDF",
|
132 |
"detail": error_detail,
|
133 |
"filename": file.filename if file and hasattr(file, 'filename') else None
|
134 |
}
|
135 |
)
|
136 |
-
|
137 |
finally:
|
138 |
# Clean up the temporary file
|
139 |
if temp_pdf_path and os.path.exists(temp_pdf_path):
|
140 |
try:
|
141 |
os.unlink(temp_pdf_path)
|
142 |
-
|
143 |
-
|
|
|
144 |
|
145 |
-
if
|
146 |
-
|
147 |
-
|
|
|
|
10 |
from typing import Dict, Any
|
11 |
import shutil
|
12 |
import torch
|
13 |
+
import asyncio
|
14 |
+
from contextlib import asynccontextmanager
|
15 |
|
16 |
# Add the parent directory to sys.path to import convert_pdf_to_md
|
17 |
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
18 |
+
# Import the initialization function as well
|
19 |
+
from pdf_converter.convert_pdf_to_md import convert_pdf, initialize_converter
|
20 |
|
21 |
# --- Configuration for output directory ---
|
22 |
# In Docker container, use /app prefix
|
23 |
+
# Adjusted path assuming the app runs from /app in Docker
|
24 |
+
base_dir = "/app" # Use /app for Docker environment
|
25 |
+
if not os.path.exists(base_dir):
|
26 |
+
# Fallback for local testing (assuming run from project root)
|
27 |
+
base_dir = "."
|
28 |
+
out_sub_dir = "docker_mineru/output"
|
29 |
+
output_dir = os.path.join(base_dir, out_sub_dir)
|
30 |
|
31 |
images_dir = os.path.join(output_dir, "images")
|
32 |
|
33 |
# Create output directory if it doesn't exist
|
34 |
os.makedirs(output_dir, exist_ok=True)
|
35 |
os.makedirs(images_dir, exist_ok=True)
|
36 |
+
print(f"Using output directory: {output_dir}") # Add log for debugging
|
37 |
# --- End Configuration ---
|
38 |
|
39 |
+
# --- Lifespan management for model loading ---
|
40 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Initialize the marker converter at startup and log at shutdown.

    Used as the FastAPI ``lifespan`` handler: everything before ``yield`` runs
    before the application starts serving, everything after runs on shutdown.

    Parameters:
        app: The FastAPI application this lifespan is attached to (unused here).

    Raises:
        Exception: Whatever ``initialize_converter`` raises is propagated.
    """
    print("Application startup: Initializing marker converter...")
    # Inside a coroutine the running loop is the one we want;
    # get_running_loop() is the documented way to obtain it
    # (get_event_loop() is discouraged for this use).
    loop = asyncio.get_running_loop()
    # Run the blocking initialization in the default executor so the event
    # loop is not blocked while it runs.
    await loop.run_in_executor(None, initialize_converter)
    print("Marker converter initialization process finished.")
    yield
    # Nothing is released explicitly on shutdown; only a log line is emitted.
    print("Application shutdown.")
|
51 |
+
|
52 |
# Application metadata
|
53 |
app_description = """
|
54 |
+
# PDF to Markdown Converter API (Optimized)
|
55 |
|
56 |
This API provides PDF processing capabilities to convert PDF documents to Markdown format using marker.
|
57 |
+
It pre-loads models for faster processing.
|
58 |
|
59 |
## Features:
|
60 |
- PDF to Markdown conversion using marker
|
61 |
+
- Optimized for faster startup and processing
|
62 |
- Simple API interface
|
63 |
"""
|
64 |
|
65 |
app = FastAPI(
|
66 |
title="PDF to Markdown API",
|
67 |
description=app_description,
|
68 |
+
version="1.1.0", # Version bump
|
69 |
+
lifespan=lifespan # Add the lifespan manager
|
70 |
)
|
71 |
|
72 |
+
# Add CORS middleware
|
73 |
app.add_middleware(
|
74 |
CORSMiddleware,
|
75 |
+
allow_origins=["*"],
|
76 |
allow_credentials=True,
|
77 |
+
allow_methods=["*"],
|
78 |
+
allow_headers=["*"],
|
79 |
)
|
80 |
|
81 |
+
# Mount the output directory - Adjust mount path to be relative to API URL
|
82 |
+
# We use output_dir for the actual file path, but /output for the URL path
|
83 |
app.mount("/output", StaticFiles(directory=output_dir), name="output")
|
84 |
|
85 |
# Health check endpoint
|
|
|
100 |
"status": "healthy",
|
101 |
"timestamp": datetime.now().isoformat(),
|
102 |
"service": "pdf-to-markdown-converter",
|
103 |
+
"gpu": gpu_info,
|
104 |
+
"output_directory_used": output_dir # Add info for debugging
|
105 |
}
|
106 |
|
107 |
@app.post("/convert", tags=["PDF Processing"])
async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
    """
    Convert an uploaded PDF file to markdown using the pre-loaded marker converter.

    Parameters:
        file: The uploaded PDF. Its filename must end in ".pdf" (case-insensitive).

    Returns:
        On success, a JSON object with the uploaded filename, a "success"
        status, a preview of at most 1000 characters of the markdown
        (followed by "..." only when the markdown was actually truncated) and
        the URL path of the saved markdown file under the "/output" mount.
        On any processing error, a JSONResponse with status 500 describing
        the error.

    Raises:
        HTTPException: 400 when the upload has no filename or is not a ".pdf".
    """
    # Local import: only needed to percent-encode the file name in the URL.
    from urllib.parse import quote

    preview_limit = 1000

    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")

    content = await file.read()
    temp_pdf_path = None

    try:
        # delete=False keeps the file on disk after the `with` closes (and
        # flushes) it, so the converter can open it by path; it is removed in
        # the `finally` block below.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False, dir="/tmp") as temp_pdf:
            temp_pdf.write(content)
            temp_pdf_path = temp_pdf.name
        print(f"Temporary PDF saved to: {temp_pdf_path}")

        # basename() drops any directory components from the client-supplied
        # name before it is used to build the output path.
        filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
        output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")
        print(f"Output markdown path: {output_md_file}")

        # NOTE(review): convert_pdf is a plain synchronous function, so this
        # call occupies the event loop until the conversion finishes. Moving
        # it to an executor would require confirming the shared converter can
        # be used from several threads at once.
        md_content = convert_pdf(temp_pdf_path, output_md_file)

        # Build the URL with "/" explicitly (os.path.join is OS-dependent) and
        # percent-encode the name so spaces, "#", "?" etc. yield a valid URL
        # under the StaticFiles mount point.
        relative_output_path = f"/output/{quote(filename_without_ext)}.md"

        # Only add the ellipsis when the markdown was really cut short.
        markdown_preview = md_content[:preview_limit] if md_content else ""
        if md_content and len(md_content) > preview_limit:
            markdown_preview += "..."

        return {
            "filename": file.filename,
            "status": "success",
            "markdown_preview": markdown_preview,
            "output_file_url": relative_output_path
        }

    except Exception as e:
        error_detail = str(e)
        error_trace = traceback.format_exc()
        print(f"Error processing PDF '{file.filename if file else 'N/A'}': {error_detail}")
        print(error_trace)
        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": error_detail,
                "filename": file.filename if file and hasattr(file, 'filename') else None
            }
        )

    finally:
        # Clean up the temporary file; a failure here is logged, not raised,
        # so it cannot mask the response produced above.
        if temp_pdf_path and os.path.exists(temp_pdf_path):
            try:
                os.unlink(temp_pdf_path)
                print(f"Temporary file {temp_pdf_path} deleted.")
            except Exception as unlink_err:
                print(f"Error deleting temporary file {temp_pdf_path}: {unlink_err}")
|
175 |
|
176 |
+
# Remove the old __main__ block if it exists, as CMD in Dockerfile handles startup
|
177 |
+
# if __name__ == "__main__":
|
178 |
+
# import uvicorn
|
179 |
+
# uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=False)
|
pdf_converter/convert_pdf_to_md.py
CHANGED
@@ -4,14 +4,48 @@ import sys
|
|
4 |
from marker.config.parser import ConfigParser
|
5 |
from marker.models import create_model_dict
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
def convert_pdf(pdf_input_path, output_md_path=None):
|
8 |
"""
|
9 |
-
Convert PDF file to Markdown using marker.
|
10 |
-
|
11 |
Args:
|
12 |
pdf_input_path (str): Path to the input PDF file
|
13 |
output_md_path (str, optional): Path where to save the output Markdown file. If None, markdown is only returned.
|
14 |
-
|
15 |
Returns:
|
16 |
str: The markdown text
|
17 |
"""
|
@@ -19,37 +53,25 @@ def convert_pdf(pdf_input_path, output_md_path=None):
|
|
19 |
if not os.path.exists(pdf_input_path):
|
20 |
raise FileNotFoundError(f"Input PDF file not found at '{pdf_input_path}'")
|
21 |
|
22 |
-
|
|
|
|
|
|
|
|
|
23 |
|
24 |
try:
|
25 |
-
#
|
26 |
-
|
27 |
-
|
28 |
-
# Load models
|
29 |
-
models = create_model_dict()
|
30 |
-
|
31 |
-
# Get converter class and create converter
|
32 |
-
converter_cls = config_parser.get_converter_cls()
|
33 |
-
converter = converter_cls(
|
34 |
-
config=config_parser.generate_config_dict(),
|
35 |
-
artifact_dict=models,
|
36 |
-
processor_list=config_parser.get_processors(),
|
37 |
-
renderer=config_parser.get_renderer(),
|
38 |
-
llm_service=config_parser.get_llm_service()
|
39 |
-
)
|
40 |
-
|
41 |
-
# Convert the PDF to markdown using marker
|
42 |
-
result = converter(pdf_input_path)
|
43 |
-
|
44 |
# Access the markdown content directly from the result object
|
45 |
markdown_text = result.markdown
|
46 |
-
|
47 |
# If output path is provided, save the markdown
|
48 |
if output_md_path:
|
49 |
output_dir = os.path.dirname(output_md_path)
|
50 |
if output_dir and not os.path.exists(output_dir):
|
51 |
os.makedirs(output_dir, exist_ok=True)
|
52 |
-
|
53 |
with open(output_md_path, "w", encoding="utf-8") as f:
|
54 |
f.write(markdown_text)
|
55 |
print(f"Successfully saved markdown to '{output_md_path}'")
|
|
|
4 |
from marker.config.parser import ConfigParser
|
5 |
from marker.models import create_model_dict
|
6 |
|
7 |
+
# Global variable to hold the pre-loaded converter
|
8 |
+
_converter = None
|
9 |
+
|
10 |
+
def initialize_converter():
    """Build the marker converter once and keep it in the module-level ``_converter``.

    A second call is a no-op apart from a log line. If construction fails the
    error is reported on stderr, ``_converter`` is left as ``None`` and the
    exception is re-raised to the caller.
    """
    global _converter

    # Guard clause: an earlier call already built the converter.
    if _converter is not None:
        print("Marker models already initialized.")
        return

    print("Initializing marker models...")
    try:
        # Configuration with the output format set explicitly to markdown.
        parser = ConfigParser({'output_format': 'markdown'})

        # Load the model artifacts handed to the converter below.
        artifacts = create_model_dict()

        # The parser decides which converter class to use and supplies the
        # remaining constructor arguments.
        build_converter = parser.get_converter_cls()
        _converter = build_converter(
            config=parser.generate_config_dict(),
            artifact_dict=artifacts,
            processor_list=parser.get_processors(),
            renderer=parser.get_renderer(),
            llm_service=parser.get_llm_service(),
        )
        print("Marker models initialized successfully.")
    except Exception as e:
        print(f"Failed to initialize marker models: {e}", file=sys.stderr)
        # Make sure no half-built converter is left behind.
        _converter = None
        raise
|
40 |
+
|
41 |
def convert_pdf(pdf_input_path, output_md_path=None):
|
42 |
"""
|
43 |
+
Convert PDF file to Markdown using the pre-loaded marker converter.
|
44 |
+
|
45 |
Args:
|
46 |
pdf_input_path (str): Path to the input PDF file
|
47 |
output_md_path (str, optional): Path where to save the output Markdown file. If None, markdown is only returned.
|
48 |
+
|
49 |
Returns:
|
50 |
str: The markdown text
|
51 |
"""
|
|
|
53 |
if not os.path.exists(pdf_input_path):
|
54 |
raise FileNotFoundError(f"Input PDF file not found at '{pdf_input_path}'")
|
55 |
|
56 |
+
# Check if converter is initialized
|
57 |
+
if _converter is None:
|
58 |
+
raise RuntimeError("Marker converter has not been initialized. Call initialize_converter() during application startup.")
|
59 |
+
|
60 |
+
print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")
|
61 |
|
62 |
try:
|
63 |
+
# Convert the PDF to markdown using the pre-loaded converter
|
64 |
+
result = _converter(pdf_input_path)
|
65 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
# Access the markdown content directly from the result object
|
67 |
markdown_text = result.markdown
|
68 |
+
|
69 |
# If output path is provided, save the markdown
|
70 |
if output_md_path:
|
71 |
output_dir = os.path.dirname(output_md_path)
|
72 |
if output_dir and not os.path.exists(output_dir):
|
73 |
os.makedirs(output_dir, exist_ok=True)
|
74 |
+
|
75 |
with open(output_md_path, "w", encoding="utf-8") as f:
|
76 |
f.write(markdown_text)
|
77 |
print(f"Successfully saved markdown to '{output_md_path}'")
|