hellorahulk committed on
Commit
070e4b3
·
1 Parent(s): 1880d31

Fix document text extraction using proper Docling methods

Browse files
Files changed (1) hide show
  1. dockling_parser/parser.py +11 -3
dockling_parser/parser.py CHANGED
@@ -63,8 +63,10 @@ class DocumentParser:
63
  result = self.converter.convert(str(file_path))
64
  doc = result.document
65
 
66
- # Extract content and structure
67
- content = doc.text
 
 
68
  structured_content = {
69
  'sections': doc.sections if hasattr(doc, 'sections') else [],
70
  'paragraphs': doc.paragraphs if hasattr(doc, 'paragraphs') else [],
@@ -72,6 +74,12 @@ class DocumentParser:
72
  'metadata': doc.metadata if hasattr(doc, 'metadata') else {}
73
  }
74
 
 
 
 
 
 
 
75
  # Update metadata with document-specific information
76
  if hasattr(doc, 'metadata') and doc.metadata:
77
  metadata.title = doc.metadata.get('title')
@@ -82,7 +90,7 @@ class DocumentParser:
82
  return ParsedDocument(
83
  content=content,
84
  metadata=metadata,
85
- raw_text=doc.raw_text if hasattr(doc, 'raw_text') else None,
86
  structured_content=structured_content,
87
  confidence_score=getattr(doc, 'confidence', 1.0)
88
  )
 
63
  result = self.converter.convert(str(file_path))
64
  doc = result.document
65
 
66
+ # Extract content using proper methods
67
+ content = doc.export_to_text()
68
+
69
+ # Extract structured content
70
  structured_content = {
71
  'sections': doc.sections if hasattr(doc, 'sections') else [],
72
  'paragraphs': doc.paragraphs if hasattr(doc, 'paragraphs') else [],
 
74
  'metadata': doc.metadata if hasattr(doc, 'metadata') else {}
75
  }
76
 
77
+ # Get raw text if available
78
+ try:
79
+ raw_text = doc.export_to_text(include_layout=True)
80
+ except:
81
+ raw_text = content
82
+
83
  # Update metadata with document-specific information
84
  if hasattr(doc, 'metadata') and doc.metadata:
85
  metadata.title = doc.metadata.get('title')
 
90
  return ParsedDocument(
91
  content=content,
92
  metadata=metadata,
93
+ raw_text=raw_text,
94
  structured_content=structured_content,
95
  confidence_score=getattr(doc, 'confidence', 1.0)
96
  )