File size: 6,154 Bytes
ab599b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse, FileResponse
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
import tempfile
import os
import json
import traceback
from datetime import datetime
from typing import Dict, List, Any, Optional
import shutil
from convert_pdf import convert_pdf

# Make sure the directories used for conversion results exist before the
# application (and its static mount) is set up.
for _required_dir in ("output", "output/images"):
    os.makedirs(_required_dir, exist_ok=True)

# Markdown text shown as the API description in the generated docs.
app_description = """
# MinerU PDF Processor API

This API provides PDF processing capabilities using MinerU's magic-pdf library.
It extracts text content, tables, and generates markdown from PDF documents.

## Features:
- PDF text extraction
- Markdown conversion
- Layout analysis
"""

# The FastAPI application object that every route below is registered on.
app = FastAPI(
    title="MinerU PDF API",
    version="1.0.0",
    description=app_description,
    contact=dict(name="PDF Converter Service"),
)

# Add CORS middleware to allow cross-origin requests from any origin.
#
# Credentials are deliberately NOT enabled: the FastAPI CORS documentation
# states that allow_origins / allow_methods / allow_headers must not be ["*"]
# when allow_credentials is True. Nothing in this file relies on cookies or
# other credentials, so the wildcard configuration is kept and credentials
# are switched off. If credentialed requests are ever needed, list the
# trusted origins explicitly instead of using "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=False,
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)

# Expose everything under the local "output" directory at the /output URL
# prefix so generated files can be downloaded directly.
app.mount(
    path="/output",
    app=StaticFiles(directory="output"),
    name="output",
)

# Health check endpoint
@app.get("/health", tags=["Health"])
async def health_check() -> Dict[str, Any]:
    """
    Report that the service is up.

    Returns:
        A dictionary with a fixed "healthy" status, the current local time
        in ISO 8601 format, and the service name.
    """
    current_time = datetime.now().isoformat()
    report: Dict[str, Any] = dict(
        status="healthy",
        timestamp=current_time,
        service="mineru-pdf-processor",
    )
    return report

# Maps each key of the response's "output_files" to the suffix that is
# appended to the processed PDF's base name to form the artifact's file name.
_OUTPUT_FILE_SUFFIXES = (
    ("markdown", ".md"),
    ("layout", "_layout.pdf"),
    ("spans", "_spans.pdf"),
    ("model", "_model.pdf"),
    ("content_list", "_content_list.json"),
    ("middle_json", "_middle.json"),
)


def _clear_previous_outputs() -> None:
    """Delete the regular files left in output/images and output by an earlier run."""
    for directory in ("output/images", "output"):
        for item in os.listdir(directory):
            item_path = os.path.join(directory, item)
            # Only regular files are removed; calling os.remove on a
            # sub-directory (such as output/images itself) would raise.
            if os.path.isfile(item_path):
                os.remove(item_path)


def _collect_output_files(filename_without_ext: str) -> Dict[str, str]:
    """Return the /output URLs of the known artifacts that exist for the given base name."""
    output_files = {}
    for key, suffix in _OUTPUT_FILE_SUFFIXES:
        artifact_name = f"{filename_without_ext}{suffix}"
        if os.path.exists(os.path.join("output", artifact_name)):
            output_files[key] = f"/output/{artifact_name}"
    return output_files


@app.post("/convert", tags=["PDF Processing"])
async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
    """
    Convert a PDF file to markdown using the magic-pdf library.

    The upload is written to a temporary ``.pdf`` file, the files left in
    ``output`` and ``output/images`` by a previous conversion are deleted,
    and ``convert_pdf`` is called on the temporary file. The ``output``
    directory is shared by all requests, so each conversion replaces the
    results of the previous one.

    Parameters:
        file: The PDF file to process

    Returns:
        A JSON object containing the conversion result and links to output files.
        If processing fails, a JSON response with status code 500 describing
        the error is returned instead.

    Raises:
        HTTPException: 400 if the upload has no file name or the name does
            not end in ``.pdf``.
    """
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")

    content = await file.read()
    temp_pdf_path = None

    try:
        # Save the uploaded PDF to a temporary file. delete=False keeps it on
        # disk after the with-block so convert_pdf can open it by path.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            temp_pdf.write(content)
            temp_pdf_path = temp_pdf.name

        # Clear previous output files
        _clear_previous_outputs()

        # Process the PDF using convert_pdf function
        md_content = convert_pdf(temp_pdf_path)

        # Output artifacts are looked up under the temporary file's base name,
        # not the name of the uploaded file.
        filename_without_ext = os.path.splitext(os.path.basename(temp_pdf_path))[0]

        return {
            "filename": file.filename,
            "status": "success",
            "markdown_content": md_content,
            "output_files": _collect_output_files(filename_without_ext)
        }

    except Exception as e:
        error_detail = str(e)
        error_trace = traceback.format_exc()

        # Log the error
        print(f"Error processing PDF: {error_detail}")
        print(error_trace)

        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": error_detail,
                "filename": file.filename
            }
        )

    finally:
        # Clean up the temporary file
        if temp_pdf_path:
            try:
                os.unlink(temp_pdf_path)
            except OSError:
                # Best-effort cleanup: a leftover (or already removed)
                # temporary file must not change the response.
                pass

@app.get("/files/{filename}", tags=["Files"])
async def get_file(filename: str):
    """
    Get a file from the output directory.

    Only regular files whose resolved path lies inside the ``output``
    directory are served. Directories and names that resolve to a location
    outside ``output`` (for example through ``..`` segments or symbolic
    links) are reported as not found.

    Parameters:
        filename: The name of the file to retrieve

    Returns:
        The requested file

    Raises:
        HTTPException: 404 if the name does not resolve to a regular file
            inside the output directory.
    """
    not_found = HTTPException(status_code=404, detail=f"File {filename} not found")

    output_root = os.path.realpath("output")
    try:
        file_path = os.path.realpath(os.path.join(output_root, filename))
    except ValueError:
        # Path functions can reject a name outright (for example one with an
        # embedded NUL byte); treat that like any other missing file.
        raise not_found from None

    # The resolved path must stay strictly below the output directory.
    root_prefix = os.path.normcase(output_root) + os.sep
    inside_output = os.path.normcase(file_path).startswith(root_prefix)

    # isfile (rather than exists) keeps directories from reaching FileResponse.
    if not inside_output or not os.path.isfile(file_path):
        raise not_found

    return FileResponse(path=file_path)

if __name__ == "__main__":
    # Start the server only when this file is executed as a script.
    import uvicorn

    # "api:app" is an import string; presumably this file is api.py — confirm.
    server_options = {"host": "0.0.0.0", "port": 7860, "reload": False}
    uvicorn.run("api:app", **server_options)