Spaces:
Running
Running
Commit
·
8c92c5f
1
Parent(s):
35d97d7
Fix file handling and MIME type detection
Browse files
- app.py +5 -29
- dockling_parser/parser.py +25 -2
app.py
CHANGED
@@ -26,9 +26,9 @@ Made with ❤️ using Docling and Gradio
|
|
26 |
# Initialize the document parser
|
27 |
parser = DocumentParser()
|
28 |
|
29 |
-
def process_document(file_obj):
|
30 |
"""Process uploaded document and return structured information"""
|
31 |
-
if
|
32 |
return (
|
33 |
"Error: No file uploaded",
|
34 |
pd.DataFrame(),
|
@@ -37,26 +37,9 @@ def process_document(file_obj):
|
|
37 |
"Confidence Score: 0.0"
|
38 |
)
|
39 |
|
40 |
-
temp_path = None
|
41 |
try:
|
42 |
-
#
|
43 |
-
|
44 |
-
extension = os.path.splitext(original_filename)[1].lower()
|
45 |
-
if not extension:
|
46 |
-
extension = '.pdf' # Default to PDF if no extension
|
47 |
-
|
48 |
-
# Create temporary file and write content
|
49 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
|
50 |
-
# Write the content
|
51 |
-
content = file_obj.read() if hasattr(file_obj, 'read') else file_obj
|
52 |
-
if isinstance(content, bytes):
|
53 |
-
tmp_file.write(content)
|
54 |
-
else:
|
55 |
-
tmp_file.write(content.encode('utf-8'))
|
56 |
-
temp_path = tmp_file.name
|
57 |
-
|
58 |
-
# Parse the document
|
59 |
-
result = parser.parse(temp_path)
|
60 |
|
61 |
# Prepare the outputs
|
62 |
metadata_df = pd.DataFrame([{
|
@@ -97,13 +80,6 @@ def process_document(file_obj):
|
|
97 |
"No entities available",
|
98 |
"Confidence Score: 0.0"
|
99 |
)
|
100 |
-
finally:
|
101 |
-
# Clean up temporary file
|
102 |
-
if temp_path and os.path.exists(temp_path):
|
103 |
-
try:
|
104 |
-
os.unlink(temp_path)
|
105 |
-
except:
|
106 |
-
pass
|
107 |
|
108 |
# Create Gradio interface
|
109 |
with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
|
@@ -115,7 +91,7 @@ with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
|
|
115 |
file_input = gr.File(
|
116 |
label="Upload Document",
|
117 |
file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
|
118 |
-
type="filepath"
|
119 |
)
|
120 |
submit_btn = gr.Button("Process Document", variant="primary")
|
121 |
|
|
|
26 |
# Initialize the document parser
|
27 |
parser = DocumentParser()
|
28 |
|
29 |
+
def process_document(file_path):
|
30 |
"""Process uploaded document and return structured information"""
|
31 |
+
if file_path is None:
|
32 |
return (
|
33 |
"Error: No file uploaded",
|
34 |
pd.DataFrame(),
|
|
|
37 |
"Confidence Score: 0.0"
|
38 |
)
|
39 |
|
|
|
40 |
try:
|
41 |
+
# Parse the document directly using the file path
|
42 |
+
result = parser.parse(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
# Prepare the outputs
|
45 |
metadata_df = pd.DataFrame([{
|
|
|
80 |
"No entities available",
|
81 |
"Confidence Score: 0.0"
|
82 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
|
84 |
# Create Gradio interface
|
85 |
with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
|
|
|
91 |
file_input = gr.File(
|
92 |
label="Upload Document",
|
93 |
file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
|
94 |
+
type="filepath"
|
95 |
)
|
96 |
submit_btn = gr.Button("Process Document", variant="primary")
|
97 |
|
dockling_parser/parser.py
CHANGED
@@ -18,7 +18,23 @@ class DocumentParser:
|
|
18 |
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
|
19 |
'text/plain': 'txt',
|
20 |
'text/html': 'html',
|
21 |
-
'text/markdown': 'md'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
}
|
23 |
|
24 |
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
@@ -43,7 +59,14 @@ class DocumentParser:
|
|
43 |
if not file_path.exists():
|
44 |
raise FileNotFoundError(f"File not found: {file_path}")
|
45 |
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
if mime_type not in self.SUPPORTED_FORMATS:
|
48 |
raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")
|
49 |
|
|
|
18 |
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
|
19 |
'text/plain': 'txt',
|
20 |
'text/html': 'html',
|
21 |
+
'text/markdown': 'md',
|
22 |
+
# Add common variations
|
23 |
+
'application/x-pdf': 'pdf',
|
24 |
+
'application/acrobat': 'pdf',
|
25 |
+
'application/msword': 'docx',
|
26 |
+
'text/x-markdown': 'md',
|
27 |
+
'text/x-html': 'html'
|
28 |
+
}
|
29 |
+
|
30 |
+
EXTENSION_TO_MIME = {
|
31 |
+
'.pdf': 'application/pdf',
|
32 |
+
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
33 |
+
'.txt': 'text/plain',
|
34 |
+
'.html': 'text/html',
|
35 |
+
'.htm': 'text/html',
|
36 |
+
'.md': 'text/markdown',
|
37 |
+
'.markdown': 'text/markdown'
|
38 |
}
|
39 |
|
40 |
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
|
|
59 |
if not file_path.exists():
|
60 |
raise FileNotFoundError(f"File not found: {file_path}")
|
61 |
|
62 |
+
# Try to determine format from extension first
|
63 |
+
extension = file_path.suffix.lower()
|
64 |
+
mime_type = self.EXTENSION_TO_MIME.get(extension)
|
65 |
+
|
66 |
+
# If extension not recognized, use magic
|
67 |
+
if not mime_type:
|
68 |
+
mime_type = magic.from_file(str(file_path), mime=True)
|
69 |
+
|
70 |
if mime_type not in self.SUPPORTED_FORMATS:
|
71 |
raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")
|
72 |
|