hellorahulk committed on
Commit
fdbfd73
·
1 Parent(s): 8c92c5f

Improve error handling and file processing

Browse files
Files changed (2) hide show
  1. app.py +42 -12
  2. dockling_parser/parser.py +93 -51
app.py CHANGED
@@ -2,9 +2,10 @@ import os
2
  import gradio as gr
3
  import pandas as pd
4
  from dockling_parser import DocumentParser
5
- from dockling_parser.exceptions import ParserError
6
  import tempfile
7
  import mimetypes
 
8
 
9
  TITLE = "📄 Smart Document Parser"
10
  DESCRIPTION = """
@@ -23,19 +24,37 @@ ARTICLE = """
23
  Made with ❤️ using Docling and Gradio
24
  """
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  # Initialize the document parser
27
  parser = DocumentParser()
28
 
29
  def process_document(file_path):
30
  """Process uploaded document and return structured information"""
31
  if file_path is None:
32
- return (
33
- "Error: No file uploaded",
34
- pd.DataFrame(),
35
- "No sections available",
36
- "No entities available",
37
- "Confidence Score: 0.0"
38
- )
39
 
40
  try:
41
  # Parse the document directly using the file path
@@ -64,18 +83,29 @@ def process_document(file_path):
64
  f"Confidence Score: {result.confidence_score:.2f}" # Confidence score
65
  )
66
 
 
 
 
 
 
 
 
 
 
67
  except ParserError as e:
 
68
  return (
69
- f"Error parsing document: {str(e)}",
70
- pd.DataFrame(),
71
  "No sections available",
72
  "No entities available",
73
  "Confidence Score: 0.0"
74
  )
75
  except Exception as e:
 
76
  return (
77
- f"Unexpected error: {str(e)}",
78
- pd.DataFrame(),
79
  "No sections available",
80
  "No entities available",
81
  "Confidence Score: 0.0"
 
2
  import gradio as gr
3
  import pandas as pd
4
  from dockling_parser import DocumentParser
5
+ from dockling_parser.exceptions import ParserError, UnsupportedFormatError
6
  import tempfile
7
  import mimetypes
8
+ import traceback
9
 
10
  TITLE = "📄 Smart Document Parser"
11
  DESCRIPTION = """
 
24
  Made with ❤️ using Docling and Gradio
25
  """
26
 
27
def _error_outputs(headline, hint):
    """Build the five-value output tuple returned when no document was parsed.

    The slots are, in order: status text, metadata table, sections text,
    entities text and confidence text.

    The second slot is a ``pd.DataFrame`` with ``Property``/``Value`` columns,
    matching what the exception handlers in ``process_document`` return in that
    position. Returning a bare string there made the canned responses
    inconsistent with every other return path.

    Args:
        headline: Short status line shown to the user.
        hint: Longer guidance text, placed in the table's ``Value`` column.

    Returns:
        A 5-tuple ``(headline, DataFrame, sections, entities, confidence)``.
    """
    return (
        headline,
        pd.DataFrame([{"Property": "Error", "Value": hint}]),
        "No sections available",
        "No entities available",
        "Confidence Score: 0.0",
    )


# Canned responses for failure modes that carry no exception detail.
# The tuples are shared between calls, so callers must treat them as read-only.
ERROR_MESSAGES = {
    "no_file": _error_outputs(
        "⚠️ No file uploaded",
        "Please upload a document to process.",
    ),
    "unsupported_format": _error_outputs(
        "⚠️ Unsupported file format",
        "Please upload a file in one of the supported formats: PDF, DOCX, TXT, HTML, or MD.",
    ),
    "processing_error": _error_outputs(
        "⚠️ Error processing document",
        "An error occurred while processing the document. Please try again with a different file.",
    ),
}
50
+
51
  # Initialize the document parser
52
  parser = DocumentParser()
53
 
54
  def process_document(file_path):
55
  """Process uploaded document and return structured information"""
56
  if file_path is None:
57
+ return ERROR_MESSAGES["no_file"]
 
 
 
 
 
 
58
 
59
  try:
60
  # Parse the document directly using the file path
 
83
  f"Confidence Score: {result.confidence_score:.2f}" # Confidence score
84
  )
85
 
86
+ except UnsupportedFormatError as e:
87
+ error_msg = f"⚠️ {str(e)}"
88
+ return (
89
+ error_msg,
90
+ pd.DataFrame([{"Property": "Error", "Value": error_msg}]),
91
+ "No sections available",
92
+ "No entities available",
93
+ "Confidence Score: 0.0"
94
+ )
95
  except ParserError as e:
96
+ error_msg = f"⚠️ {str(e)}"
97
  return (
98
+ error_msg,
99
+ pd.DataFrame([{"Property": "Error", "Value": error_msg}]),
100
  "No sections available",
101
  "No entities available",
102
  "Confidence Score: 0.0"
103
  )
104
  except Exception as e:
105
+ error_msg = f"⚠️ Unexpected error: {str(e)}\n{traceback.format_exc()}"
106
  return (
107
+ error_msg,
108
+ pd.DataFrame([{"Property": "Error", "Value": error_msg}]),
109
  "No sections available",
110
  "No entities available",
111
  "Confidence Score: 0.0"
dockling_parser/parser.py CHANGED
@@ -4,6 +4,8 @@ from typing import Optional, Dict, Any, Union
4
  import magic
5
  from docling.document_converter import DocumentConverter
6
  from datetime import datetime
 
 
7
 
8
  from .types import ParsedDocument, DocumentMetadata
9
  from .exceptions import UnsupportedFormatError, ParseError
@@ -40,20 +42,17 @@ class DocumentParser:
40
  def __init__(self, config: Optional[Dict[str, Any]] = None):
41
  self.config = config or {}
42
  self.converter = DocumentConverter()
 
 
43
 
44
- def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
 
 
 
 
 
45
  """
46
- Parse a document file and return structured content
47
-
48
- Args:
49
- file_path: Path to the document file
50
-
51
- Returns:
52
- ParsedDocument object containing parsed content and metadata
53
-
54
- Raises:
55
- UnsupportedFormatError: If the file format is not supported
56
- ParseError: If parsing fails
57
  """
58
  file_path = Path(file_path)
59
  if not file_path.exists():
@@ -66,15 +65,43 @@ class DocumentParser:
66
  # If extension not recognized, use magic
67
  if not mime_type:
68
  mime_type = magic.from_file(str(file_path), mime=True)
 
 
 
 
 
 
 
69
 
70
- if mime_type not in self.SUPPORTED_FORMATS:
71
- raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")
 
 
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  try:
 
 
 
74
  # Get file metadata
75
- stats = file_path.stat()
 
 
76
  metadata = DocumentMetadata(
77
- filename=file_path.name,
78
  file_type=self.SUPPORTED_FORMATS[mime_type],
79
  size_bytes=stats.st_size,
80
  created_at=datetime.fromtimestamp(stats.st_ctime),
@@ -82,44 +109,59 @@ class DocumentParser:
82
  mime_type=mime_type
83
  )
84
 
85
- # Parse document using Docling
86
- result = self.converter.convert(str(file_path))
87
- doc = result.document
88
-
89
- # Extract content using proper methods
90
- content = doc.export_to_text()
91
-
92
- # Extract structured content
93
- structured_content = {
94
- 'sections': doc.sections if hasattr(doc, 'sections') else [],
95
- 'paragraphs': doc.paragraphs if hasattr(doc, 'paragraphs') else [],
96
- 'entities': doc.entities if hasattr(doc, 'entities') else {},
97
- 'metadata': doc.metadata if hasattr(doc, 'metadata') else {}
98
- }
99
-
100
- # Get raw text if available
101
  try:
102
- raw_text = doc.export_to_text(include_layout=True)
103
- except:
104
- raw_text = content
105
-
106
- # Update metadata with document-specific information
107
- if hasattr(doc, 'metadata') and doc.metadata:
108
- metadata.title = doc.metadata.get('title')
109
- metadata.author = doc.metadata.get('author')
110
- metadata.pages = doc.metadata.get('pages')
111
- metadata.extra.update(doc.metadata)
112
-
113
- return ParsedDocument(
114
- content=content,
115
- metadata=metadata,
116
- raw_text=raw_text,
117
- structured_content=structured_content,
118
- confidence_score=getattr(doc, 'confidence', 1.0)
119
- )
 
 
 
 
 
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  except Exception as e:
122
- raise ParseError(f"Failed to parse document: {str(e)}") from e
 
 
 
 
 
 
 
 
123
 
124
  def supports_format(self, mime_type: str) -> bool:
125
  """Check if a given MIME type is supported"""
 
4
  import magic
5
  from docling.document_converter import DocumentConverter
6
  from datetime import datetime
7
+ import shutil
8
+ import tempfile
9
 
10
  from .types import ParsedDocument, DocumentMetadata
11
  from .exceptions import UnsupportedFormatError, ParseError
 
42
  def __init__(self, config: Optional[Dict[str, Any]] = None):
43
  self.config = config or {}
44
  self.converter = DocumentConverter()
45
+ # Per-instance scratch directory (created with mkdtemp, "dockling_" prefix) for processing files; __del__ removes it
46
+ self.temp_dir = Path(tempfile.mkdtemp(prefix="dockling_"))
47
 
48
+ def __del__(self):
49
+ """Remove this parser's temporary directory, if it was created and still exists, when the object is destroyed."""
50
+ if hasattr(self, 'temp_dir') and self.temp_dir.exists():  # temp_dir is missing if __init__ raised before assigning it
51
+ shutil.rmtree(self.temp_dir, ignore_errors=True)  # best-effort: deletion failures are ignored
52
+
53
+ def _validate_and_copy_file(self, file_path: Union[str, Path]) -> Path:
54
  """
55
+ Validate file and copy to temporary location with correct extension
 
 
 
 
 
 
 
 
 
 
56
  """
57
  file_path = Path(file_path)
58
  if not file_path.exists():
 
65
  # If extension not recognized, use magic
66
  if not mime_type:
67
  mime_type = magic.from_file(str(file_path), mime=True)
68
+ if mime_type in self.SUPPORTED_FORMATS:
69
+ extension = f".{self.SUPPORTED_FORMATS[mime_type]}"
70
+ else:
71
+ raise UnsupportedFormatError(
72
+ f"Unsupported file format: {mime_type}. "
73
+ f"Supported formats are: {', '.join(set(self.SUPPORTED_FORMATS.values()))}"
74
+ )
75
 
76
+ # Copy file to temp directory with correct extension
77
+ temp_file = self.temp_dir / f"doc{extension}"
78
+ shutil.copy2(file_path, temp_file)
79
+ return temp_file
80
 
81
+ def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
82
+ """
83
+ Parse a document file and return structured content
84
+
85
+ Args:
86
+ file_path: Path to the document file
87
+
88
+ Returns:
89
+ ParsedDocument object containing parsed content and metadata
90
+
91
+ Raises:
92
+ UnsupportedFormatError: If the file format is not supported
93
+ ParseError: If parsing fails
94
+ """
95
  try:
96
+ # Validate and prepare file
97
+ temp_file = self._validate_and_copy_file(file_path)
98
+
99
  # Get file metadata
100
+ stats = temp_file.stat()
101
+ mime_type = magic.from_file(str(temp_file), mime=True)
102
+
103
  metadata = DocumentMetadata(
104
+ filename=Path(file_path).name, # Use original filename
105
  file_type=self.SUPPORTED_FORMATS[mime_type],
106
  size_bytes=stats.st_size,
107
  created_at=datetime.fromtimestamp(stats.st_ctime),
 
109
  mime_type=mime_type
110
  )
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  try:
113
+ # Parse document using Docling
114
+ result = self.converter.convert(str(temp_file))
115
+ doc = result.document
116
+
117
+ # Extract content using proper methods
118
+ try:
119
+ content = doc.export_to_text()
120
+ except Exception as e:
121
+ raise ParseError(f"Failed to extract text content: {str(e)}")
122
+
123
+ # Extract structured content
124
+ structured_content = {
125
+ 'sections': doc.sections if hasattr(doc, 'sections') else [],
126
+ 'paragraphs': doc.paragraphs if hasattr(doc, 'paragraphs') else [],
127
+ 'entities': doc.entities if hasattr(doc, 'entities') else {},
128
+ 'metadata': doc.metadata if hasattr(doc, 'metadata') else {}
129
+ }
130
+
131
+ # Get raw text if available
132
+ try:
133
+ raw_text = doc.export_to_text(include_layout=True)
134
+ except:
135
+ raw_text = content
136
 
137
+ # Update metadata with document-specific information
138
+ if hasattr(doc, 'metadata') and doc.metadata:
139
+ metadata.title = doc.metadata.get('title')
140
+ metadata.author = doc.metadata.get('author')
141
+ metadata.pages = doc.metadata.get('pages')
142
+ metadata.extra.update(doc.metadata)
143
+
144
+ return ParsedDocument(
145
+ content=content,
146
+ metadata=metadata,
147
+ raw_text=raw_text,
148
+ structured_content=structured_content,
149
+ confidence_score=getattr(doc, 'confidence', 1.0)
150
+ )
151
+
152
+ except Exception as e:
153
+ raise ParseError(f"Failed to parse document: {str(e)}")
154
+
155
  except Exception as e:
156
+ raise ParseError(str(e))
157
+
158
+ finally:
159
+ # Cleanup temporary files
160
+ if 'temp_file' in locals() and temp_file.exists():
161
+ try:
162
+ temp_file.unlink()
163
+ except:
164
+ pass
165
 
166
  def supports_format(self, mime_type: str) -> bool:
167
  """Check if a given MIME type is supported"""