Spaces:
Running
Running
Commit
·
1880d31
1
Parent(s):
3aa9da6
Fix file handling for binary uploads
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
|
|
4 |
from dockling_parser import DocumentParser
|
5 |
from dockling_parser.exceptions import ParserError
|
6 |
import tempfile
|
|
|
7 |
|
8 |
TITLE = "📄 Smart Document Parser"
|
9 |
DESCRIPTION = """
|
@@ -25,20 +26,44 @@ Made with ❤️ using Docling and Gradio
|
|
25 |
# Initialize the document parser
|
26 |
parser = DocumentParser()
|
27 |
|
28 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
"""Process uploaded document and return structured information"""
|
|
|
30 |
try:
|
31 |
-
#
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
temp_path = tmp_file.name
|
35 |
|
36 |
# Parse the document
|
37 |
result = parser.parse(temp_path)
|
38 |
|
39 |
-
# Clean up temporary file
|
40 |
-
os.unlink(temp_path)
|
41 |
-
|
42 |
# Prepare the outputs
|
43 |
metadata_df = pd.DataFrame([{
|
44 |
"Property": k,
|
@@ -79,8 +104,8 @@ def process_document(file):
|
|
79 |
"Confidence Score: 0.0"
|
80 |
)
|
81 |
finally:
|
82 |
-
#
|
83 |
-
if
|
84 |
try:
|
85 |
os.unlink(temp_path)
|
86 |
except:
|
|
|
4 |
from dockling_parser import DocumentParser
|
5 |
from dockling_parser.exceptions import ParserError
|
6 |
import tempfile
|
7 |
+
import mimetypes
|
8 |
|
9 |
TITLE = "📄 Smart Document Parser"
|
10 |
DESCRIPTION = """
|
|
|
26 |
# Initialize the document parser
|
27 |
parser = DocumentParser()
|
28 |
|
29 |
+
def get_file_extension(file_type):
    """Return the file extension (including the leading dot) for a MIME type.

    Args:
        file_type: MIME type string such as ``'application/pdf'``. It may
            carry parameters (``'text/html; charset=utf-8'``) and may use any
            letter case. ``None`` or any unrecognised value is accepted.

    Returns:
        The matching extension (``'.pdf'``, ``'.docx'``, ``'.txt'``,
        ``'.html'`` or ``'.md'``), or ``'.tmp'`` when the type is unknown.
    """
    extensions = {
        'application/pdf': '.pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
        'text/plain': '.txt',
        'text/html': '.html',
        'text/markdown': '.md',
    }
    if isinstance(file_type, str):
        # MIME type/subtype names are case-insensitive and may be followed by
        # ";"-separated parameters; keep only the bare, lower-cased type so
        # values like "Text/HTML; charset=utf-8" still resolve.
        file_type = file_type.partition(';')[0].strip().lower()
    return extensions.get(file_type, '.tmp')
|
39 |
+
|
40 |
+
def process_document(file_obj):
|
41 |
"""Process uploaded document and return structured information"""
|
42 |
+
temp_path = None
|
43 |
try:
|
44 |
+
# Handle file upload based on type
|
45 |
+
if isinstance(file_obj, dict):
|
46 |
+
# Get file data and original name
|
47 |
+
file_data = file_obj['data']
|
48 |
+
original_name = file_obj.get('name', 'uploaded_file')
|
49 |
+
file_type = file_obj.get('mime_type', mimetypes.guess_type(original_name)[0])
|
50 |
+
extension = os.path.splitext(original_name)[1] or get_file_extension(file_type)
|
51 |
+
else:
|
52 |
+
# Handle binary data directly
|
53 |
+
file_data = file_obj
|
54 |
+
extension = '.pdf' # Default to PDF for binary uploads
|
55 |
+
|
56 |
+
# Create temporary file
|
57 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
|
58 |
+
if isinstance(file_data, bytes):
|
59 |
+
tmp_file.write(file_data)
|
60 |
+
else:
|
61 |
+
tmp_file.write(file_data.read())
|
62 |
temp_path = tmp_file.name
|
63 |
|
64 |
# Parse the document
|
65 |
result = parser.parse(temp_path)
|
66 |
|
|
|
|
|
|
|
67 |
# Prepare the outputs
|
68 |
metadata_df = pd.DataFrame([{
|
69 |
"Property": k,
|
|
|
104 |
"Confidence Score: 0.0"
|
105 |
)
|
106 |
finally:
|
107 |
+
# Clean up temporary file
|
108 |
+
if temp_path and os.path.exists(temp_path):
|
109 |
try:
|
110 |
os.unlink(temp_path)
|
111 |
except:
|