hellorahulk committed on
Commit
8c92c5f
·
1 Parent(s): 35d97d7

Fix file handling and MIME type detection

Browse files
Files changed (2) hide show
  1. app.py +5 -29
  2. dockling_parser/parser.py +25 -2
app.py CHANGED
@@ -26,9 +26,9 @@ Made with ❤️ using Docling and Gradio
26
  # Initialize the document parser
27
  parser = DocumentParser()
28
 
29
- def process_document(file_obj):
30
  """Process uploaded document and return structured information"""
31
- if file_obj is None:
32
  return (
33
  "Error: No file uploaded",
34
  pd.DataFrame(),
@@ -37,26 +37,9 @@ def process_document(file_obj):
37
  "Confidence Score: 0.0"
38
  )
39
 
40
- temp_path = None
41
  try:
42
- # Create temporary file with appropriate extension
43
- original_filename = file_obj.name if hasattr(file_obj, 'name') else "uploaded_file.pdf"
44
- extension = os.path.splitext(original_filename)[1].lower()
45
- if not extension:
46
- extension = '.pdf' # Default to PDF if no extension
47
-
48
- # Create temporary file and write content
49
- with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
50
- # Write the content
51
- content = file_obj.read() if hasattr(file_obj, 'read') else file_obj
52
- if isinstance(content, bytes):
53
- tmp_file.write(content)
54
- else:
55
- tmp_file.write(content.encode('utf-8'))
56
- temp_path = tmp_file.name
57
-
58
- # Parse the document
59
- result = parser.parse(temp_path)
60
 
61
  # Prepare the outputs
62
  metadata_df = pd.DataFrame([{
@@ -97,13 +80,6 @@ def process_document(file_obj):
97
  "No entities available",
98
  "Confidence Score: 0.0"
99
  )
100
- finally:
101
- # Clean up temporary file
102
- if temp_path and os.path.exists(temp_path):
103
- try:
104
- os.unlink(temp_path)
105
- except:
106
- pass
107
 
108
  # Create Gradio interface
109
  with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
@@ -115,7 +91,7 @@ with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
115
  file_input = gr.File(
116
  label="Upload Document",
117
  file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
118
- type="filepath" # Changed from binary to filepath
119
  )
120
  submit_btn = gr.Button("Process Document", variant="primary")
121
 
 
26
  # Initialize the document parser
27
  parser = DocumentParser()
28
 
29
+ def process_document(file_path):
30
  """Process uploaded document and return structured information"""
31
+ if file_path is None:
32
  return (
33
  "Error: No file uploaded",
34
  pd.DataFrame(),
 
37
  "Confidence Score: 0.0"
38
  )
39
 
 
40
  try:
41
+ # Parse the document directly using the file path
42
+ result = parser.parse(file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  # Prepare the outputs
45
  metadata_df = pd.DataFrame([{
 
80
  "No entities available",
81
  "Confidence Score: 0.0"
82
  )
 
 
 
 
 
 
 
83
 
84
  # Create Gradio interface
85
  with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
 
91
  file_input = gr.File(
92
  label="Upload Document",
93
  file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
94
+ type="filepath"
95
  )
96
  submit_btn = gr.Button("Process Document", variant="primary")
97
 
dockling_parser/parser.py CHANGED
@@ -18,7 +18,23 @@ class DocumentParser:
18
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
19
  'text/plain': 'txt',
20
  'text/html': 'html',
21
- 'text/markdown': 'md'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  }
23
 
24
  def __init__(self, config: Optional[Dict[str, Any]] = None):
@@ -43,7 +59,14 @@ class DocumentParser:
43
  if not file_path.exists():
44
  raise FileNotFoundError(f"File not found: {file_path}")
45
 
46
- mime_type = magic.from_file(str(file_path), mime=True)
 
 
 
 
 
 
 
47
  if mime_type not in self.SUPPORTED_FORMATS:
48
  raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")
49
 
 
18
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
19
  'text/plain': 'txt',
20
  'text/html': 'html',
21
+ 'text/markdown': 'md',
22
+ # Add common variations
23
+ 'application/x-pdf': 'pdf',
24
+ 'application/acrobat': 'pdf',
25
+ 'application/msword': 'docx',
26
+ 'text/x-markdown': 'md',
27
+ 'text/x-html': 'html'
28
+ }
29
+
30
+ EXTENSION_TO_MIME = {
31
+ '.pdf': 'application/pdf',
32
+ '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
33
+ '.txt': 'text/plain',
34
+ '.html': 'text/html',
35
+ '.htm': 'text/html',
36
+ '.md': 'text/markdown',
37
+ '.markdown': 'text/markdown'
38
  }
39
 
40
  def __init__(self, config: Optional[Dict[str, Any]] = None):
 
59
  if not file_path.exists():
60
  raise FileNotFoundError(f"File not found: {file_path}")
61
 
62
+ # Try to determine format from extension first
63
+ extension = file_path.suffix.lower()
64
+ mime_type = self.EXTENSION_TO_MIME.get(extension)
65
+
66
+ # If extension not recognized, use magic
67
+ if not mime_type:
68
+ mime_type = magic.from_file(str(file_path), mime=True)
69
+
70
  if mime_type not in self.SUPPORTED_FORMATS:
71
  raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")
72