hellorahulk committed on
Commit
6c30c7d
·
1 Parent(s): 6ef7758

Fix Docling import and usage

Browse files
Files changed (1) hide show
  1. dockling_parser/parser.py +9 -10
dockling_parser/parser.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  from pathlib import Path
3
  from typing import Optional, Dict, Any, Union
4
  import magic
5
- import docling as dl
6
  from datetime import datetime
7
 
8
  from .types import ParsedDocument, DocumentMetadata
@@ -23,7 +23,6 @@ class DocumentParser:
23
 
24
  def __init__(self, config: Optional[Dict[str, Any]] = None):
25
  self.config = config or {}
26
- self.docling = dl.Docling()
27
 
28
  def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
29
  """
@@ -60,19 +59,19 @@ class DocumentParser:
60
  )
61
 
62
  # Parse document using Docling
63
- doc = self.docling.parse(str(file_path))
64
 
65
  # Extract content and structure
66
  content = doc.text
67
  structured_content = {
68
- 'sections': doc.sections,
69
- 'paragraphs': doc.paragraphs,
70
- 'entities': doc.entities,
71
- 'metadata': doc.metadata
72
  }
73
 
74
  # Update metadata with document-specific information
75
- if doc.metadata:
76
  metadata.title = doc.metadata.get('title')
77
  metadata.author = doc.metadata.get('author')
78
  metadata.pages = doc.metadata.get('pages')
@@ -81,9 +80,9 @@ class DocumentParser:
81
  return ParsedDocument(
82
  content=content,
83
  metadata=metadata,
84
- raw_text=doc.raw_text,
85
  structured_content=structured_content,
86
- confidence_score=doc.confidence if hasattr(doc, 'confidence') else 1.0
87
  )
88
 
89
  except Exception as e:
 
2
  from pathlib import Path
3
  from typing import Optional, Dict, Any, Union
4
  import magic
5
+ from docling import DoclingDocument
6
  from datetime import datetime
7
 
8
  from .types import ParsedDocument, DocumentMetadata
 
23
 
24
  def __init__(self, config: Optional[Dict[str, Any]] = None):
25
  self.config = config or {}
 
26
 
27
  def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
28
  """
 
59
  )
60
 
61
  # Parse document using Docling
62
+ doc = DoclingDocument.from_file(str(file_path))
63
 
64
  # Extract content and structure
65
  content = doc.text
66
  structured_content = {
67
+ 'sections': doc.sections if hasattr(doc, 'sections') else [],
68
+ 'paragraphs': doc.paragraphs if hasattr(doc, 'paragraphs') else [],
69
+ 'entities': doc.entities if hasattr(doc, 'entities') else {},
70
+ 'metadata': doc.metadata if hasattr(doc, 'metadata') else {}
71
  }
72
 
73
  # Update metadata with document-specific information
74
+ if hasattr(doc, 'metadata') and doc.metadata:
75
  metadata.title = doc.metadata.get('title')
76
  metadata.author = doc.metadata.get('author')
77
  metadata.pages = doc.metadata.get('pages')
 
80
  return ParsedDocument(
81
  content=content,
82
  metadata=metadata,
83
+ raw_text=doc.raw_text if hasattr(doc, 'raw_text') else None,
84
  structured_content=structured_content,
85
+ confidence_score=getattr(doc, 'confidence', 1.0)
86
  )
87
 
88
  except Exception as e: