marcosremar2 committed on
Commit
a49c5dc
·
1 Parent(s): 5f5a1d2

Update PDF to Markdown converter API with NVIDIA L4 support

Browse files
Files changed (3) hide show
  1. Dockerfile +35 -22
  2. app/main.py +73 -41
  3. pdf_converter/convert_pdf_to_md.py +47 -25
Dockerfile CHANGED
@@ -20,7 +20,9 @@ RUN apt-get update && \
20
  libxrender1 \
21
  libsm6 \
22
  libxext6 \
23
- poppler-utils && \
 
 
24
  rm -rf /var/lib/apt/lists/*
25
 
26
  RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
@@ -29,33 +31,44 @@ RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
29
  RUN useradd -m -u 1000 user
30
 
31
  # Create necessary directories and set permissions
32
- RUN mkdir -p /app /app/docker_mineru /app/docker_mineru/output /app/docker_mineru/output/images && \
33
- chown -R user:user /app
34
 
35
- # Switch to user
36
- USER user
37
 
38
- # Set home directory
39
- ENV HOME=/home/user \
40
- PATH=/home/user/.local/bin:$PATH
41
 
42
- # Set working directory
43
- WORKDIR /app
 
 
 
 
 
 
44
 
45
- # Copy requirements first (with correct ownership)
46
- COPY --chown=user requirements.txt .
 
47
 
48
- # Install PyTorch dependencies with explicit compatible versions for NVIDIA L4
49
- RUN pip3 install --no-cache-dir --upgrade pip && \
50
- pip3 install --no-cache-dir torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121 && \
51
- pip3 install --no-cache-dir transformers==4.36.2 && \
52
- pip3 install --no-cache-dir -r requirements.txt
 
 
 
 
53
 
54
- # Copy the rest of the application with correct ownership
55
- COPY --chown=user . .
 
56
 
57
- # Expose port
58
  EXPOSE 7860
59
 
60
- # Command to run the application
61
- CMD ["python3", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
20
  libxrender1 \
21
  libsm6 \
22
  libxext6 \
23
+ poppler-utils \
24
+ libjpeg-dev \
25
+ libpng-dev && \
26
  rm -rf /var/lib/apt/lists/*
27
 
28
  RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
 
31
  RUN useradd -m -u 1000 user
32
 
33
  # Create necessary directories and set permissions
34
+ RUN mkdir -p /app /app/docker_mineru/output/images /home/user/.cache/huggingface /home/user/.cache/torch && \
35
+ chown -R user:user /app /home/user
36
 
37
+ WORKDIR /app
 
38
 
39
+ # Copy requirements first
40
+ COPY --chown=user:user requirements.txt .
 
41
 
42
+ # Upgrade pip and install PyTorch dependencies first
43
+ # Use versions compatible with CUDA 12.1 and L40S
44
+ RUN pip install --no-cache-dir --upgrade pip && \
45
+ pip install --no-cache-dir \
46
+ torch==2.1.2 \
47
+ torchvision==0.16.2 \
48
+ torchaudio==2.1.2 \
49
+ --extra-index-url https://download.pytorch.org/whl/cu121
50
 
51
+ # Install other requirements including gunicorn
52
+ RUN pip install --no-cache-dir -r requirements.txt && \
53
+ pip install --no-cache-dir gunicorn
54
 
55
+ # Copy the rest of the application code
56
+ COPY --chown=user:user . .
57
+
58
+ # Ensure output directory exists and has correct permissions (redundant but safe)
59
+ RUN mkdir -p /app/docker_mineru/output/images && \
60
+ chown -R user:user /app/docker_mineru/output
61
+
62
+ # Set the user
63
+ USER user
64
 
65
+ # Environment variables for caching (optional, might help with model downloads)
66
+ ENV HF_HOME=/home/user/.cache/huggingface
67
+ ENV TORCH_HOME=/home/user/.cache/torch
68
 
69
+ # Expose the port
70
  EXPOSE 7860
71
 
72
+ # Command to run the application with Gunicorn and Uvicorn workers
73
+ # Start with 4 workers. Adjust based on monitoring L40S resources.
74
+ CMD ["gunicorn", "-w", "4", "-k", "uvicorn.workers.UvicornWorker", "app.main:app", "--bind", "0.0.0.0:7860"]
app/main.py CHANGED
@@ -10,49 +10,76 @@ from datetime import datetime
10
  from typing import Dict, Any
11
  import shutil
12
  import torch
 
 
13
 
14
  # Add the parent directory to sys.path to import convert_pdf_to_md
15
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
- from pdf_converter import convert_pdf_to_md
 
17
 
18
  # --- Configuration for output directory ---
19
  # In Docker container, use /app prefix
20
- output_dir = "/app/docker_mineru/output"
 
 
 
 
 
 
21
 
22
  images_dir = os.path.join(output_dir, "images")
23
 
24
  # Create output directory if it doesn't exist
25
  os.makedirs(output_dir, exist_ok=True)
26
  os.makedirs(images_dir, exist_ok=True)
 
27
  # --- End Configuration ---
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  # Application metadata
30
  app_description = """
31
- # PDF to Markdown Converter API
32
 
33
  This API provides PDF processing capabilities to convert PDF documents to Markdown format using marker.
 
34
 
35
  ## Features:
36
  - PDF to Markdown conversion using marker
 
37
  - Simple API interface
38
  """
39
 
40
  app = FastAPI(
41
  title="PDF to Markdown API",
42
  description=app_description,
43
- version="1.0.0",
 
44
  )
45
 
46
- # Add CORS middleware to allow cross-origin requests
47
  app.add_middleware(
48
  CORSMiddleware,
49
- allow_origins=["*"], # Allow all origins
50
  allow_credentials=True,
51
- allow_methods=["*"], # Allow all methods
52
- allow_headers=["*"], # Allow all headers
53
  )
54
 
55
- # Mount the output directory as static files
 
56
  app.mount("/output", StaticFiles(directory=output_dir), name="output")
57
 
58
  # Health check endpoint
@@ -73,75 +100,80 @@ async def health_check() -> Dict[str, Any]:
73
  "status": "healthy",
74
  "timestamp": datetime.now().isoformat(),
75
  "service": "pdf-to-markdown-converter",
76
- "gpu": gpu_info
 
77
  }
78
 
79
  @app.post("/convert", tags=["PDF Processing"])
80
  async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
81
  """
82
- Convert a PDF file to markdown using marker.
83
-
84
  Parameters:
85
  file: The PDF file to process
86
-
87
  Returns:
88
- A JSON object containing the conversion result and markdown content
89
  """
90
  if not file.filename or not file.filename.lower().endswith('.pdf'):
91
- raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")
92
-
93
  content = await file.read()
94
  temp_pdf_path = None
95
-
96
  try:
97
- # Save the uploaded PDF to a temporary file
98
- with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
 
99
  temp_pdf.write(content)
100
  temp_pdf_path = temp_pdf.name
101
-
102
- # Get the base name of the file
 
103
  filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
104
- # Use the configured output_dir
105
  output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")
106
-
107
- # Process the PDF using marker
108
- md_content = convert_pdf_to_md.convert_pdf(temp_pdf_path, output_md_file)
109
-
110
- # Construct the relative path for the response
 
 
111
  relative_output_path = os.path.join("/output", f"{filename_without_ext}.md")
112
 
113
  return {
114
  "filename": file.filename,
115
  "status": "success",
116
- "markdown_content": md_content,
117
- "output_file": relative_output_path
 
118
  }
119
-
120
  except Exception as e:
121
  error_detail = str(e)
122
  error_trace = traceback.format_exc()
123
-
124
- # Log the error
125
- print(f"Error processing PDF: {error_detail}")
126
  print(error_trace)
127
-
128
  return JSONResponse(
129
- status_code=500,
130
  content={
131
  "error": "Error processing PDF",
132
  "detail": error_detail,
133
  "filename": file.filename if file and hasattr(file, 'filename') else None
134
  }
135
  )
136
-
137
  finally:
138
  # Clean up the temporary file
139
  if temp_pdf_path and os.path.exists(temp_pdf_path):
140
  try:
141
  os.unlink(temp_pdf_path)
142
- except Exception:
143
- pass
 
144
 
145
- if __name__ == "__main__":
146
- import uvicorn
147
- uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=False)
 
 
10
  from typing import Dict, Any
11
  import shutil
12
  import torch
13
+ import asyncio
14
+ from contextlib import asynccontextmanager
15
 
16
  # Add the parent directory to sys.path to import convert_pdf_to_md
17
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
18
+ # Import the initialization function as well
19
+ from pdf_converter.convert_pdf_to_md import convert_pdf, initialize_converter
20
 
21
  # --- Configuration for output directory ---
22
  # In Docker container, use /app prefix
23
+ # Adjusted path assuming the app runs from /app in Docker
24
+ base_dir = "/app" # Use /app for Docker environment
25
+ if not os.path.exists(base_dir):
26
+ # Fallback for local testing (assuming run from project root)
27
+ base_dir = "."
28
+ out_sub_dir = "docker_mineru/output"
29
+ output_dir = os.path.join(base_dir, out_sub_dir)
30
 
31
  images_dir = os.path.join(output_dir, "images")
32
 
33
  # Create output directory if it doesn't exist
34
  os.makedirs(output_dir, exist_ok=True)
35
  os.makedirs(images_dir, exist_ok=True)
36
+ print(f"Using output directory: {output_dir}") # Add log for debugging
37
  # --- End Configuration ---
38
 
39
+ # --- Lifespan management for model loading ---
40
+ @asynccontextmanager
41
+ async def lifespan(app: FastAPI):
42
+ # Load the ML model during startup
43
+ print("Application startup: Initializing marker converter...")
44
+ loop = asyncio.get_event_loop()
45
+ # Run in executor to avoid blocking the event loop
46
+ await loop.run_in_executor(None, initialize_converter)
47
+ print("Marker converter initialization process finished.")
48
+ yield
49
+ # Clean up resources if needed during shutdown
50
+ print("Application shutdown.")
51
+
52
  # Application metadata
53
  app_description = """
54
+ # PDF to Markdown Converter API (Optimized)
55
 
56
  This API provides PDF processing capabilities to convert PDF documents to Markdown format using marker.
57
+ It pre-loads models for faster processing.
58
 
59
  ## Features:
60
  - PDF to Markdown conversion using marker
61
+ - Optimized for faster startup and processing
62
  - Simple API interface
63
  """
64
 
65
  app = FastAPI(
66
  title="PDF to Markdown API",
67
  description=app_description,
68
+ version="1.1.0", # Version bump
69
+ lifespan=lifespan # Add the lifespan manager
70
  )
71
 
72
+ # Add CORS middleware
73
  app.add_middleware(
74
  CORSMiddleware,
75
+ allow_origins=["*"],
76
  allow_credentials=True,
77
+ allow_methods=["*"],
78
+ allow_headers=["*"],
79
  )
80
 
81
+ # Mount the output directory - Adjust mount path to be relative to API URL
82
+ # We use output_dir for the actual file path, but /output for the URL path
83
  app.mount("/output", StaticFiles(directory=output_dir), name="output")
84
 
85
  # Health check endpoint
 
100
  "status": "healthy",
101
  "timestamp": datetime.now().isoformat(),
102
  "service": "pdf-to-markdown-converter",
103
+ "gpu": gpu_info,
104
+ "output_directory_used": output_dir # Add info for debugging
105
  }
106
 
107
  @app.post("/convert", tags=["PDF Processing"])
108
  async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
109
  """
110
+ Convert a PDF file to markdown using the pre-loaded marker converter.
111
+
112
  Parameters:
113
  file: The PDF file to process
114
+
115
  Returns:
116
+ A JSON object containing the conversion result
117
  """
118
  if not file.filename or not file.filename.lower().endswith('.pdf'):
119
+ raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")
120
+
121
  content = await file.read()
122
  temp_pdf_path = None
123
+
124
  try:
125
+ # Use a secure temporary directory within the app's writable space
126
+ # In Docker, /tmp should be writable by the 'user'
127
+ with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False, dir="/tmp") as temp_pdf:
128
  temp_pdf.write(content)
129
  temp_pdf_path = temp_pdf.name
130
+ print(f"Temporary PDF saved to: {temp_pdf_path}")
131
+
132
+ # Get the base name of the file for the output
133
  filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
134
+ # Use the configured output_dir for saving the markdown file
135
  output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")
136
+ print(f"Output markdown path: {output_md_file}")
137
+
138
+ # Process the PDF using the pre-loaded converter
139
+ md_content = convert_pdf(temp_pdf_path, output_md_file)
140
+
141
+ # Construct the relative path for the URL response
142
+ # This path should correspond to the StaticFiles mount point
143
  relative_output_path = os.path.join("/output", f"{filename_without_ext}.md")
144
 
145
  return {
146
  "filename": file.filename,
147
  "status": "success",
148
+ # Consider omitting full content in response for performance/size
149
+ "markdown_preview": md_content[:1000] + "..." if md_content else "",
150
+ "output_file_url": relative_output_path
151
  }
152
+
153
  except Exception as e:
154
  error_detail = str(e)
155
  error_trace = traceback.format_exc()
156
+ print(f"Error processing PDF '{file.filename if file else 'N/A'}': {error_detail}")
 
 
157
  print(error_trace)
 
158
  return JSONResponse(
159
+ status_code=500,
160
  content={
161
  "error": "Error processing PDF",
162
  "detail": error_detail,
163
  "filename": file.filename if file and hasattr(file, 'filename') else None
164
  }
165
  )
166
+
167
  finally:
168
  # Clean up the temporary file
169
  if temp_pdf_path and os.path.exists(temp_pdf_path):
170
  try:
171
  os.unlink(temp_pdf_path)
172
+ print(f"Temporary file {temp_pdf_path} deleted.")
173
+ except Exception as unlink_err:
174
+ print(f"Error deleting temporary file {temp_pdf_path}: {unlink_err}")
175
 
176
+ # Remove the old __main__ block if it exists, as CMD in Dockerfile handles startup
177
+ # if __name__ == "__main__":
178
+ # import uvicorn
179
+ # uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=False)
pdf_converter/convert_pdf_to_md.py CHANGED
@@ -4,14 +4,48 @@ import sys
4
  from marker.config.parser import ConfigParser
5
  from marker.models import create_model_dict
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def convert_pdf(pdf_input_path, output_md_path=None):
8
  """
9
- Convert PDF file to Markdown using marker.
10
-
11
  Args:
12
  pdf_input_path (str): Path to the input PDF file
13
  output_md_path (str, optional): Path where to save the output Markdown file. If None, markdown is only returned.
14
-
15
  Returns:
16
  str: The markdown text
17
  """
@@ -19,37 +53,25 @@ def convert_pdf(pdf_input_path, output_md_path=None):
19
  if not os.path.exists(pdf_input_path):
20
  raise FileNotFoundError(f"Input PDF file not found at '{pdf_input_path}'")
21
 
22
- print(f"Starting conversion of '{pdf_input_path}'...")
 
 
 
 
23
 
24
  try:
25
- # Create configuration, explicitly setting output format
26
- config_parser = ConfigParser({'output_format': 'markdown'})
27
-
28
- # Load models
29
- models = create_model_dict()
30
-
31
- # Get converter class and create converter
32
- converter_cls = config_parser.get_converter_cls()
33
- converter = converter_cls(
34
- config=config_parser.generate_config_dict(),
35
- artifact_dict=models,
36
- processor_list=config_parser.get_processors(),
37
- renderer=config_parser.get_renderer(),
38
- llm_service=config_parser.get_llm_service()
39
- )
40
-
41
- # Convert the PDF to markdown using marker
42
- result = converter(pdf_input_path)
43
-
44
  # Access the markdown content directly from the result object
45
  markdown_text = result.markdown
46
-
47
  # If output path is provided, save the markdown
48
  if output_md_path:
49
  output_dir = os.path.dirname(output_md_path)
50
  if output_dir and not os.path.exists(output_dir):
51
  os.makedirs(output_dir, exist_ok=True)
52
-
53
  with open(output_md_path, "w", encoding="utf-8") as f:
54
  f.write(markdown_text)
55
  print(f"Successfully saved markdown to '{output_md_path}'")
 
4
  from marker.config.parser import ConfigParser
5
  from marker.models import create_model_dict
6
 
7
+ # Global variable to hold the pre-loaded converter
8
+ _converter = None
9
+
10
+ def initialize_converter():
11
+ """Initializes the marker converter models and stores it globally."""
12
+ global _converter
13
+ if _converter is None:
14
+ print("Initializing marker models...")
15
+ try:
16
+ # Create configuration, explicitly setting output format
17
+ # Potential optimization: Check if batch_multiplier or similar exists
18
+ config_parser = ConfigParser({'output_format': 'markdown'}) # Add batch_multiplier here if applicable
19
+
20
+ # Load models
21
+ # Potential optimization: Check if device mapping/multi-GPU is possible
22
+ models = create_model_dict() # Add device mapping here if applicable
23
+
24
+ # Get converter class and create converter
25
+ converter_cls = config_parser.get_converter_cls()
26
+ _converter = converter_cls(
27
+ config=config_parser.generate_config_dict(),
28
+ artifact_dict=models,
29
+ processor_list=config_parser.get_processors(),
30
+ renderer=config_parser.get_renderer(),
31
+ llm_service=config_parser.get_llm_service()
32
+ )
33
+ print("Marker models initialized successfully.")
34
+ except Exception as e:
35
+ print(f"Failed to initialize marker models: {e}", file=sys.stderr)
36
+ _converter = None # Ensure it's None if init fails
37
+ raise
38
+ else:
39
+ print("Marker models already initialized.")
40
+
41
  def convert_pdf(pdf_input_path, output_md_path=None):
42
  """
43
+ Convert PDF file to Markdown using the pre-loaded marker converter.
44
+
45
  Args:
46
  pdf_input_path (str): Path to the input PDF file
47
  output_md_path (str, optional): Path where to save the output Markdown file. If None, markdown is only returned.
48
+
49
  Returns:
50
  str: The markdown text
51
  """
 
53
  if not os.path.exists(pdf_input_path):
54
  raise FileNotFoundError(f"Input PDF file not found at '{pdf_input_path}'")
55
 
56
+ # Check if converter is initialized
57
+ if _converter is None:
58
+ raise RuntimeError("Marker converter has not been initialized. Call initialize_converter() during application startup.")
59
+
60
+ print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")
61
 
62
  try:
63
+ # Convert the PDF to markdown using the pre-loaded converter
64
+ result = _converter(pdf_input_path)
65
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  # Access the markdown content directly from the result object
67
  markdown_text = result.markdown
68
+
69
  # If output path is provided, save the markdown
70
  if output_md_path:
71
  output_dir = os.path.dirname(output_md_path)
72
  if output_dir and not os.path.exists(output_dir):
73
  os.makedirs(output_dir, exist_ok=True)
74
+
75
  with open(output_md_path, "w", encoding="utf-8") as f:
76
  f.write(markdown_text)
77
  print(f"Successfully saved markdown to '{output_md_path}'")