hellorahulk committed on
Commit
1880d31
·
1 Parent(s): 3aa9da6

Fix file handling for binary uploads

Browse files
Files changed (1) hide show
  1. app.py +34 -9
app.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
4
  from dockling_parser import DocumentParser
5
  from dockling_parser.exceptions import ParserError
6
  import tempfile
 
7
 
8
  TITLE = "📄 Smart Document Parser"
9
  DESCRIPTION = """
@@ -25,20 +26,44 @@ Made with ❤️ using Docling and Gradio
25
  # Initialize the document parser
26
  parser = DocumentParser()
27
 
28
- def process_document(file):
 
 
 
 
 
 
 
 
 
 
 
29
  """Process uploaded document and return structured information"""
 
30
  try:
31
- # Create a temporary file to handle the upload
32
- with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.name)[1]) as tmp_file:
33
- tmp_file.write(file.read())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  temp_path = tmp_file.name
35
 
36
  # Parse the document
37
  result = parser.parse(temp_path)
38
 
39
- # Clean up temporary file
40
- os.unlink(temp_path)
41
-
42
  # Prepare the outputs
43
  metadata_df = pd.DataFrame([{
44
  "Property": k,
@@ -79,8 +104,8 @@ def process_document(file):
79
  "Confidence Score: 0.0"
80
  )
81
  finally:
82
- # Ensure temporary file is cleaned up
83
- if 'temp_path' in locals() and os.path.exists(temp_path):
84
  try:
85
  os.unlink(temp_path)
86
  except:
 
4
  from dockling_parser import DocumentParser
5
  from dockling_parser.exceptions import ParserError
6
  import tempfile
7
+ import mimetypes
8
 
9
  TITLE = "📄 Smart Document Parser"
10
  DESCRIPTION = """
 
26
  # Initialize the document parser
27
  parser = DocumentParser()
28
 
29
def get_file_extension(file_type):
    """Return the file extension (with leading dot) for a MIME type.

    Args:
        file_type: MIME type string such as ``'application/pdf'``. It may
            carry parameters (``'text/html; charset=utf-8'``), use any
            letter case, or be ``None`` when the type is unknown.

    Returns:
        The extension mapped to the media type, or ``'.tmp'`` when the
        type is missing or not present in the table.
    """
    extensions = {
        'application/pdf': '.pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
        'text/plain': '.txt',
        'text/html': '.html',
        'text/markdown': '.md',
    }
    if not isinstance(file_type, str):
        # None (or any other non-string value) has no mapping.
        return '.tmp'
    # Media types compare case-insensitively and may be followed by
    # ";param=value" pairs; only the bare "type/subtype" is a table key.
    media_type = file_type.split(';', 1)[0].strip().lower()
    return extensions.get(media_type, '.tmp')
39
+
40
+ def process_document(file_obj):
41
  """Process uploaded document and return structured information"""
42
+ temp_path = None
43
  try:
44
+ # Handle file upload based on type
45
+ if isinstance(file_obj, dict):
46
+ # Get file data and original name
47
+ file_data = file_obj['data']
48
+ original_name = file_obj.get('name', 'uploaded_file')
49
+ file_type = file_obj.get('mime_type', mimetypes.guess_type(original_name)[0])
50
+ extension = os.path.splitext(original_name)[1] or get_file_extension(file_type)
51
+ else:
52
+ # Handle binary data directly
53
+ file_data = file_obj
54
+ extension = '.pdf' # Default to PDF for binary uploads
55
+
56
+ # Create temporary file
57
+ with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
58
+ if isinstance(file_data, bytes):
59
+ tmp_file.write(file_data)
60
+ else:
61
+ tmp_file.write(file_data.read())
62
  temp_path = tmp_file.name
63
 
64
  # Parse the document
65
  result = parser.parse(temp_path)
66
 
 
 
 
67
  # Prepare the outputs
68
  metadata_df = pd.DataFrame([{
69
  "Property": k,
 
104
  "Confidence Score: 0.0"
105
  )
106
  finally:
107
+ # Clean up temporary file
108
+ if temp_path and os.path.exists(temp_path):
109
  try:
110
  os.unlink(temp_path)
111
  except: