Spaces:

marcosremar2
/

docker_mineru

Sleeping

marcosremar2 committed on 27 days ago

Commit

2751dee

1 Parent(s): a3cafa2

Update PDF to Markdown converter API with NVIDIA L4 support

Files changed (6) hide show

app/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (373 Bytes). View file

app/__pycache__/main.cpython-311.pyc ADDED Viewed

Binary file (7.45 kB). View file

app/main.py CHANGED Viewed

@@ -15,11 +15,16 @@ import torch
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from pdf_converter import convert_pdf_to_md
 # Create output directory if it doesn't exist
-output_dir = "/app/output"
-images_dir = "/app/output/images"
 os.makedirs(output_dir, exist_ok=True)
 os.makedirs(images_dir, exist_ok=True)
 # Application metadata
 app_description = """
@@ -48,7 +53,7 @@ app.add_middleware(
 )
 # Mount the output directory as static files
-app.mount("/output", StaticFiles(directory="/app/output"), name="output")
 # Health check endpoint
 @app.get("/health", tags=["Health"])
@@ -96,16 +101,20 @@ async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
         # Get the base name of the file
         filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
-        output_md_file = f"/app/output/{filename_without_ext}.md"
         # Process the PDF using marker
         md_content = convert_pdf_to_md.convert_pdf(temp_pdf_path, output_md_file)
         return {
             "filename": file.filename,
             "status": "success",
             "markdown_content": md_content,
-            "output_file": f"/output/{filename_without_ext}.md"
         }
     except Exception as e:

 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from pdf_converter import convert_pdf_to_md
+# --- Configuration for output directory ---
+# Always use the local output directory
+output_dir = "docker_mineru/output"
+images_dir = os.path.join(output_dir, "images")
 # Create output directory if it doesn't exist
 os.makedirs(output_dir, exist_ok=True)
 os.makedirs(images_dir, exist_ok=True)
+# --- End Configuration ---
 # Application metadata
 app_description = """
 )
 # Mount the output directory as static files
+app.mount("/output", StaticFiles(directory=output_dir), name="output")
 # Health check endpoint
 @app.get("/health", tags=["Health"])
         # Get the base name of the file
         filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
+        # Use the configured output_dir
+        output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")
         # Process the PDF using marker
         md_content = convert_pdf_to_md.convert_pdf(temp_pdf_path, output_md_file)
+        # Construct the relative path for the response
+        relative_output_path = os.path.join("/output", f"{filename_without_ext}.md")
         return {
             "filename": file.filename,
             "status": "success",
             "markdown_content": md_content,
+            "output_file": relative_output_path
         }
     except Exception as e:

pdf_converter/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (301 Bytes). View file

pdf_converter/__pycache__/convert_pdf_to_md.cpython-311.pyc ADDED Viewed

Binary file (3.23 kB). View file

pdf_converter/convert_pdf_to_md.py CHANGED Viewed

@@ -22,8 +22,8 @@ def convert_pdf(pdf_input_path, output_md_path=None):
     print(f"Starting conversion of '{pdf_input_path}'...")
     try:
-        # Create configuration
-        config_parser = ConfigParser({})
         # Load models
         models = create_model_dict()
@@ -41,8 +41,8 @@ def convert_pdf(pdf_input_path, output_md_path=None):
         # Convert the PDF to markdown using marker
         result = converter(pdf_input_path)
-        # The converter returns a dictionary with the markdown content
-        markdown_text = result.get('markdown', '')
         # If output path is provided, save the markdown
         if output_md_path:

     print(f"Starting conversion of '{pdf_input_path}'...")
     try:
+        # Create configuration, explicitly setting output format
+        config_parser = ConfigParser({'output_format': 'markdown'})
         # Load models
         models = create_model_dict()
         # Convert the PDF to markdown using marker
         result = converter(pdf_input_path)
+        # Access the markdown content directly from the result object
+        markdown_text = result.markdown
         # If output path is provided, save the markdown
         if output_md_path: