Spaces:

hellorahulk
/

docling_free

Running

App Files Community

hellorahulk committed on Jan 23

Commit

1880d31

1 Parent(s): 3aa9da6

Fix file handling for binary uploads

Browse files

Files changed (1) hide show

app.py +34 -9

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import pandas as pd
 from dockling_parser import DocumentParser
 from dockling_parser.exceptions import ParserError
 import tempfile
 TITLE = "📄 Smart Document Parser"
 DESCRIPTION = """
@@ -25,20 +26,44 @@ Made with ❤️ using Docling and Gradio
 # Initialize the document parser
 parser = DocumentParser()
-def process_document(file):
     """Process uploaded document and return structured information"""
     try:
-        # Create a temporary file to handle the upload
-        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.name)[1]) as tmp_file:
-            tmp_file.write(file.read())
             temp_path = tmp_file.name
         # Parse the document
         result = parser.parse(temp_path)
-        # Clean up temporary file
-        os.unlink(temp_path)
         # Prepare the outputs
         metadata_df = pd.DataFrame([{
             "Property": k,
@@ -79,8 +104,8 @@ def process_document(file):
             "Confidence Score: 0.0"
         )
     finally:
-        # Ensure temporary file is cleaned up
-        if 'temp_path' in locals() and os.path.exists(temp_path):
             try:
                 os.unlink(temp_path)
             except:

 from dockling_parser import DocumentParser
 from dockling_parser.exceptions import ParserError
 import tempfile
+import mimetypes
 TITLE = "📄 Smart Document Parser"
 DESCRIPTION = """
 # Initialize the document parser
 parser = DocumentParser()
+def get_file_extension(file_type):
+    """Get file extension based on MIME type"""
+    extensions = {
+        'application/pdf': '.pdf',
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
+        'text/plain': '.txt',
+        'text/html': '.html',
+        'text/markdown': '.md'
+    }
+    return extensions.get(file_type, '.tmp')
+def process_document(file_obj):
     """Process uploaded document and return structured information"""
+    temp_path = None
     try:
+        # Handle file upload based on type
+        if isinstance(file_obj, dict):
+            # Get file data and original name
+            file_data = file_obj['data']
+            original_name = file_obj.get('name', 'uploaded_file')
+            file_type = file_obj.get('mime_type', mimetypes.guess_type(original_name)[0])
+            extension = os.path.splitext(original_name)[1] or get_file_extension(file_type)
+        else:
+            # Handle binary data directly
+            file_data = file_obj
+            extension = '.pdf'  # Default to PDF for binary uploads
+        # Create temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
+            if isinstance(file_data, bytes):
+                tmp_file.write(file_data)
+            else:
+                tmp_file.write(file_data.read())
             temp_path = tmp_file.name
         # Parse the document
         result = parser.parse(temp_path)
         # Prepare the outputs
         metadata_df = pd.DataFrame([{
             "Property": k,
             "Confidence Score: 0.0"
         )
     finally:
+        # Clean up temporary file
+        if temp_path and os.path.exists(temp_path):
             try:
                 os.unlink(temp_path)
             except: