marcosremar2 committed on
Commit
2751dee
·
1 Parent(s): a3cafa2

Update PDF to Markdown converter API with NVIDIA L4 support

Browse files
app/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (373 Bytes). View file
 
app/__pycache__/main.cpython-311.pyc ADDED
Binary file (7.45 kB). View file
 
app/main.py CHANGED
@@ -15,11 +15,16 @@ import torch
15
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
  from pdf_converter import convert_pdf_to_md
17
 
 
 
 
 
 
 
18
  # Create output directory if it doesn't exist
19
- output_dir = "/app/output"
20
- images_dir = "/app/output/images"
21
  os.makedirs(output_dir, exist_ok=True)
22
  os.makedirs(images_dir, exist_ok=True)
 
23
 
24
  # Application metadata
25
  app_description = """
@@ -48,7 +53,7 @@ app.add_middleware(
48
  )
49
 
50
  # Mount the output directory as static files
51
- app.mount("/output", StaticFiles(directory="/app/output"), name="output")
52
 
53
  # Health check endpoint
54
  @app.get("/health", tags=["Health"])
@@ -96,16 +101,20 @@ async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
96
 
97
  # Get the base name of the file
98
  filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
99
- output_md_file = f"/app/output/{filename_without_ext}.md"
 
100
 
101
  # Process the PDF using marker
102
  md_content = convert_pdf_to_md.convert_pdf(temp_pdf_path, output_md_file)
103
 
 
 
 
104
  return {
105
  "filename": file.filename,
106
  "status": "success",
107
  "markdown_content": md_content,
108
- "output_file": f"/output/{filename_without_ext}.md"
109
  }
110
 
111
  except Exception as e:
 
15
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
  from pdf_converter import convert_pdf_to_md
17
 
18
+ # --- Configuration for output directory ---
19
+ # Always use the local output directory
20
+ output_dir = "docker_mineru/output"
21
+
22
+ images_dir = os.path.join(output_dir, "images")
23
+
24
  # Create output directory if it doesn't exist
 
 
25
  os.makedirs(output_dir, exist_ok=True)
26
  os.makedirs(images_dir, exist_ok=True)
27
+ # --- End Configuration ---
28
 
29
  # Application metadata
30
  app_description = """
 
53
  )
54
 
55
  # Mount the output directory as static files
56
+ app.mount("/output", StaticFiles(directory=output_dir), name="output")
57
 
58
  # Health check endpoint
59
  @app.get("/health", tags=["Health"])
 
101
 
102
  # Get the base name of the file
103
  filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
104
+ # Use the configured output_dir
105
+ output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")
106
 
107
  # Process the PDF using marker
108
  md_content = convert_pdf_to_md.convert_pdf(temp_pdf_path, output_md_file)
109
 
110
+ # Construct the relative path for the response
111
+ relative_output_path = os.path.join("/output", f"{filename_without_ext}.md")
112
+
113
  return {
114
  "filename": file.filename,
115
  "status": "success",
116
  "markdown_content": md_content,
117
+ "output_file": relative_output_path
118
  }
119
 
120
  except Exception as e:
pdf_converter/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (301 Bytes). View file
 
pdf_converter/__pycache__/convert_pdf_to_md.cpython-311.pyc ADDED
Binary file (3.23 kB). View file
 
pdf_converter/convert_pdf_to_md.py CHANGED
@@ -22,8 +22,8 @@ def convert_pdf(pdf_input_path, output_md_path=None):
22
  print(f"Starting conversion of '{pdf_input_path}'...")
23
 
24
  try:
25
- # Create configuration
26
- config_parser = ConfigParser({})
27
 
28
  # Load models
29
  models = create_model_dict()
@@ -41,8 +41,8 @@ def convert_pdf(pdf_input_path, output_md_path=None):
41
  # Convert the PDF to markdown using marker
42
  result = converter(pdf_input_path)
43
 
44
- # The converter returns a dictionary with the markdown content
45
- markdown_text = result.get('markdown', '')
46
 
47
  # If output path is provided, save the markdown
48
  if output_md_path:
 
22
  print(f"Starting conversion of '{pdf_input_path}'...")
23
 
24
  try:
25
+ # Create configuration, explicitly setting output format
26
+ config_parser = ConfigParser({'output_format': 'markdown'})
27
 
28
  # Load models
29
  models = create_model_dict()
 
41
  # Convert the PDF to markdown using marker
42
  result = converter(pdf_input_path)
43
 
44
+ # Access the markdown content directly from the result object
45
+ markdown_text = result.markdown
46
 
47
  # If output path is provided, save the markdown
48
  if output_md_path: