marcosremar2 committed on
Commit
f30c298
·
1 Parent(s): c1e65a1

Enhance FastAPI implementation with better documentation, error handling and examples

Browse files
Files changed (3) hide show
  1. README.md +72 -3
  2. app.py +93 -8
  3. requirements.txt +4 -3
README.md CHANGED
@@ -9,8 +9,77 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- # MinerU PDF Extractor (Docker Space)
13
 
14
- This Hugging Face Space uses `magic-pdf` to extract structured content from PDFs using FastAPI.
15
 
16
- Send a `POST` request to `/extract` with a PDF file to receive extracted results.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  pinned: false
10
  ---
11
 
12
+ # MinerU PDF Extractor API
13
 
14
+ This Hugging Face Space provides a FastAPI-based service that uses `magic-pdf` to extract structured content from PDFs. The service exposes REST endpoints to process PDF files and return extracted text and tables in a structured JSON format.
15
 
16
+ ## API Endpoints
17
+
18
+ ### Health Check
19
+
20
+ ```
21
+ GET /health
22
+ ```
23
+
24
+ Returns the service status, the current timestamp, and the service name.
25
+
26
+ ### Extract PDF Content
27
+
28
+ ```
29
+ POST /extract
30
+ ```
31
+
32
+ Upload a PDF file to extract its text content and tables.
33
+
34
+ #### Request
35
+
36
+ - Content-Type: multipart/form-data
37
+ - Body: PDF file in the 'file' field
38
+
39
+ #### Response
40
+
41
+ JSON object with a top-level `result` key containing:
42
+ - Filename
43
+ - Pages with extracted text
44
+ - Tables in Markdown format
45
+
46
+ ## Usage Examples
47
+
48
+ ### Using cURL
49
+
50
+ ```bash
51
+ curl -X POST "https://marcosremar2-docker-mineru.hf.space/extract" \
52
+ -H "Content-Type: multipart/form-data" \
53
+ -F "file=@your_document.pdf" \
54
+ --output result.json
55
+ ```
56
+
57
+ ### Using Python
58
+
59
+ ```python
60
+ import requests
61
+
62
+ url = "https://marcosremar2-docker-mineru.hf.space/extract"
63
+ files = {"file": open("your_document.pdf", "rb")}
64
+
65
+ response = requests.post(url, files=files)
66
+ data = response.json()
67
+
68
+ # Process the extracted data
69
+ print(f"Filename: {data['result']['filename']}")
70
+ print(f"Number of pages: {len(data['result']['pages'])}")
71
+ ```
72
+
73
+ ## API Documentation
74
+
75
+ Once deployed, you can access the auto-generated Swagger documentation at:
76
+
77
+ ```
78
+ https://marcosremar2-docker-mineru.hf.space/docs
79
+ ```
80
+
81
+ For ReDoc documentation:
82
+
83
+ ```
84
+ https://marcosremar2-docker-mineru.hf.space/redoc
85
+ ```
app.py CHANGED
@@ -1,15 +1,76 @@
1
- from fastapi import FastAPI, UploadFile, File
2
  from fastapi.responses import JSONResponse
 
3
  import magic_pdf
4
  import tempfile
5
  import os
6
  import json
 
 
 
 
7
 
8
- app = FastAPI()
 
 
9
 
10
- @app.post("/extract")
11
- async def extract(file: UploadFile = File(...)):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  content = await file.read()
 
 
13
  try:
14
  # Save the uploaded PDF to a temporary file
15
  with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
@@ -21,6 +82,7 @@ async def extract(file: UploadFile = File(...)):
21
 
22
  # Convert result to dictionary
23
  output = {
 
24
  "pages": []
25
  }
26
 
@@ -36,9 +98,32 @@ async def extract(file: UploadFile = File(...)):
36
 
37
  output["pages"].append(page_data)
38
 
39
- # Clean up the temporary file
40
- os.unlink(temp_pdf_path)
41
-
42
  return {"result": output}
 
43
  except Exception as e:
44
- return JSONResponse(status_code=500, content={"error": str(e)})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, File, HTTPException
2
  from fastapi.responses import JSONResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
  import magic_pdf
5
  import tempfile
6
  import os
7
  import json
8
+ import traceback
9
+ import uvicorn
10
+ from datetime import datetime
11
+ from typing import Dict, List, Any, Optional
12
 
13
+ # Application metadata
14
+ app_description = """
15
+ # MinerU PDF Processor API
16
 
17
+ This API provides PDF processing capabilities using MinerU's magic-pdf library.
18
+ It extracts text content and tables from PDF documents.
19
+
20
+ ## Features:
21
+ - PDF text extraction
22
+ - Table detection and extraction
23
+ - JSON response for easy integration
24
+ """
25
+
26
+ app = FastAPI(
27
+ title="MinerU PDF API",
28
+ description=app_description,
29
+ version="1.0.0",
30
+ contact={
31
+ "name": "PDF Converter Service",
32
+ },
33
+ )
34
+
35
+ # Add CORS middleware to allow cross-origin requests
36
+ app.add_middleware(
37
+ CORSMiddleware,
38
+ allow_origins=["*"], # Allow all origins
39
+ allow_credentials=True,
40
+ allow_methods=["*"], # Allow all methods
41
+ allow_headers=["*"], # Allow all headers
42
+ )
43
+
44
+ # Health check endpoint
45
+ @app.get("/health", tags=["Health"])
46
+ async def health_check() -> Dict[str, Any]:
47
+ """
48
+ Health check endpoint to verify the service is running.
49
+ Returns the service status and current time.
50
+ """
51
+ return {
52
+ "status": "healthy",
53
+ "timestamp": datetime.now().isoformat(),
54
+ "service": "mineru-pdf-processor"
55
+ }
56
+
57
+ @app.post("/extract", tags=["PDF Processing"])
58
+ async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
59
+ """
60
+ Extract text and tables from a PDF file.
61
+
62
+ Parameters:
63
+ file: The PDF file to process
64
+
65
+ Returns:
66
+ A JSON object containing the extracted content with pages, text blocks, and tables
67
+ """
68
+ if not file.filename or not file.filename.lower().endswith('.pdf'):
69
+ raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")
70
+
71
  content = await file.read()
72
+ temp_pdf_path = None
73
+
74
  try:
75
  # Save the uploaded PDF to a temporary file
76
  with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
 
82
 
83
  # Convert result to dictionary
84
  output = {
85
+ "filename": file.filename,
86
  "pages": []
87
  }
88
 
 
98
 
99
  output["pages"].append(page_data)
100
 
 
 
 
101
  return {"result": output}
102
+
103
  except Exception as e:
104
+ error_detail = str(e)
105
+ error_trace = traceback.format_exc()
106
+
107
+ # Log the error (would be better with a proper logger)
108
+ print(f"Error processing PDF: {error_detail}")
109
+ print(error_trace)
110
+
111
+ return JSONResponse(
112
+ status_code=500,
113
+ content={
114
+ "error": "Error processing PDF",
115
+ "detail": error_detail,
116
+ "filename": file.filename if file and hasattr(file, 'filename') else None
117
+ }
118
+ )
119
+
120
+ finally:
121
+ # Clean up the temporary file
122
+ if temp_pdf_path and os.path.exists(temp_pdf_path):
123
+ try:
124
+ os.unlink(temp_pdf_path)
125
+ except Exception:
126
+ pass
127
+
128
+ if __name__ == "__main__":
129
+ uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
- fastapi
2
- uvicorn
3
  magic-pdf[full]==1.3.10
4
- python-multipart
 
 
1
+ fastapi==0.100.0
2
+ uvicorn==0.23.2
3
  magic-pdf[full]==1.3.10
4
+ python-multipart==0.0.6
5
+ requests==2.31.0