marcosremar2 committed on
Commit
d179ac1
·
1 Parent(s): 3d9ca9a

Update PDF to Markdown converter API with NVIDIA L4 support

Browse files
Files changed (2) hide show
  1. app.py +6 -167
  2. app/main.py +8 -1
app.py CHANGED
@@ -1,171 +1,10 @@
1
- from fastapi import FastAPI, UploadFile, File, HTTPException
2
- from fastapi.responses import JSONResponse
3
- from fastapi.middleware.cors import CORSMiddleware
4
- import tempfile
5
- import os
6
- import json
7
- import traceback
8
- from datetime import datetime
9
- from typing import Dict, List, Any, Optional
10
-
11
- # Import necessary components from magic_pdf based on convert_pdf.py
12
- from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
13
- from magic_pdf.data.dataset import PymuDocDataset
14
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
15
- from magic_pdf.config.enums import SupportedPdfParseMethod
16
-
17
- # Application metadata
18
- app_description = """
19
- # MinerU PDF Processor API
20
-
21
- This API provides PDF processing capabilities using MinerU's magic-pdf library.
22
- It extracts text content and generates markdown from PDF documents.
23
-
24
- ## Features:
25
- - PDF text extraction
26
- - Markdown conversion
27
- - Layout analysis (via output files)
28
  """
29
 
30
- app = FastAPI(
31
- title="MinerU PDF API",
32
- description=app_description,
33
- version="1.0.0",
34
- contact={
35
- "name": "PDF Converter Service",
36
- },
37
- )
38
-
39
- # Add CORS middleware to allow cross-origin requests
40
- app.add_middleware(
41
- CORSMiddleware,
42
- allow_origins=["*"], # Allow all origins
43
- allow_credentials=True,
44
- allow_methods=["*"], # Allow all methods
45
- allow_headers=["*"], # Allow all headers
46
- )
47
-
48
- # Define output directories (relative to the app's working directory in the container)
49
- local_image_dir, local_md_dir = "output/images", "output"
50
- os.makedirs(local_image_dir, exist_ok=True)
51
- os.makedirs(local_md_dir, exist_ok=True)
52
-
53
- # Health check endpoint
54
- @app.get("/health", tags=["Health"])
55
- async def health_check() -> Dict[str, Any]:
56
- """
57
- Health check endpoint to verify the service is running.
58
- Returns the service status and current time.
59
- """
60
- return {
61
- "status": "healthy",
62
- "timestamp": datetime.now().isoformat(),
63
- "service": "mineru-pdf-processor"
64
- }
65
-
66
- @app.post("/extract", tags=["PDF Processing"])
67
- async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
68
- """
69
- Process a PDF file using PymuDocDataset and return the extracted markdown content.
70
-
71
- Parameters:
72
- file: The PDF file to process
73
-
74
- Returns:
75
- A JSON object containing the extracted markdown and status.
76
- """
77
- if not file.filename or not file.filename.lower().endswith('.pdf'):
78
- raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")
79
-
80
- content = await file.read()
81
- temp_pdf_path = None
82
-
83
- try:
84
- # Save the uploaded PDF to a temporary file
85
- with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
86
- temp_pdf.write(content)
87
- temp_pdf_path = temp_pdf.name
88
-
89
- # Clear previous output files (optional, depending on desired behavior)
90
- # You might want to handle output naming differently in a multi-user API context
91
- # For simplicity, we'll clear the output dir here like in convert_pdf.py
92
- for item in os.listdir(local_image_dir):
93
- os.remove(os.path.join(local_image_dir, item))
94
- for item in os.listdir(local_md_dir):
95
- if os.path.isfile(os.path.join(local_md_dir, item)):
96
- os.remove(os.path.join(local_md_dir, item))
97
-
98
- # Get filename and prepare output paths for magic-pdf
99
- pdf_file_name = os.path.basename(temp_pdf_path)
100
- name_without_suff = os.path.splitext(pdf_file_name)[0]
101
- image_dir_rel_path = str(os.path.basename(local_image_dir)) # Relative path for markdown image links
102
-
103
- # Setup writers
104
- image_writer = FileBasedDataWriter(local_image_dir)
105
- md_writer = FileBasedDataWriter(local_md_dir)
106
-
107
- # Use PymuDocDataset for processing
108
- ds = PymuDocDataset(content) # Pass pdf bytes directly
109
-
110
- # Inference and pipeline based on PDF type
111
- if ds.classify() == SupportedPdfParseMethod.OCR:
112
- infer_result = ds.apply(doc_analyze, ocr=True)
113
- pipe_result = infer_result.pipe_ocr_mode(image_writer)
114
- else:
115
- infer_result = ds.apply(doc_analyze, ocr=False)
116
- pipe_result = infer_result.pipe_txt_mode(image_writer)
117
-
118
- # Optional: Generate intermediate output files (comment out if not needed for API)
119
- infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
120
- pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
121
- pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
122
- pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir_rel_path)
123
- pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
124
-
125
- # Get markdown content
126
- md_content = pipe_result.get_markdown(image_dir_rel_path)
127
-
128
- # Dump markdown to file (optional for API, but useful for debugging/access)
129
- md_file_path = f"{name_without_suff}.md"
130
- pipe_result.dump_md(md_writer, md_file_path, image_dir_rel_path)
131
- print(f"Markdown saved to: {os.path.join(local_md_dir, md_file_path)}")
132
-
133
-
134
- # Return the markdown content in the response
135
- return {
136
- "filename": file.filename,
137
- "status": "success",
138
- "markdown_content": md_content
139
- # You could potentially add links to the generated files here if needed
140
- # "output_files": { ... }
141
- }
142
-
143
- except Exception as e:
144
- error_detail = str(e)
145
- error_trace = traceback.format_exc()
146
-
147
- # Log the error
148
- print(f"Error processing PDF: {error_detail}")
149
- print(error_trace)
150
-
151
- return JSONResponse(
152
- status_code=500,
153
- content={
154
- "error": "Error processing PDF",
155
- "detail": error_detail,
156
- "filename": file.filename if file and hasattr(file, 'filename') else None
157
- }
158
- )
159
-
160
- finally:
161
- # Clean up the temporary file
162
- if temp_pdf_path and os.path.exists(temp_pdf_path):
163
- try:
164
- os.unlink(temp_pdf_path)
165
- except Exception:
166
- pass
167
 
168
  if __name__ == "__main__":
169
- # Keep uvicorn import here for local running
170
- import uvicorn
171
- uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
 
1
+ """
2
+ Simple entry point for Hugging Face Spaces.
3
+ This file redirects to the FastAPI app in the app directory.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  """
5
 
6
+ from app.main import app
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  if __name__ == "__main__":
9
+ import uvicorn
10
+ uvicorn.run("app:app", host="0.0.0.0", port=7860)
 
app/main.py CHANGED
@@ -57,11 +57,18 @@ async def health_check() -> Dict[str, Any]:
57
  Health check endpoint to verify the service is running.
58
  Returns the service status and current time.
59
  """
 
 
 
 
 
 
 
60
  return {
61
  "status": "healthy",
62
  "timestamp": datetime.now().isoformat(),
63
  "service": "pdf-to-markdown-converter",
64
- "gpu": "CUDA enabled" if torch.cuda.is_available() else "CPU only"
65
  }
66
 
67
  @app.post("/convert", tags=["PDF Processing"])
 
57
  Health check endpoint to verify the service is running.
58
  Returns the service status and current time.
59
  """
60
+ gpu_info = {
61
+ "cuda_available": torch.cuda.is_available(),
62
+ "device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
63
+ "device_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A",
64
+ "current_device": torch.cuda.current_device() if torch.cuda.is_available() else -1
65
+ }
66
+
67
  return {
68
  "status": "healthy",
69
  "timestamp": datetime.now().isoformat(),
70
  "service": "pdf-to-markdown-converter",
71
+ "gpu": gpu_info
72
  }
73
 
74
  @app.post("/convert", tags=["PDF Processing"])