Spaces:
Running
Running
Commit · 6c30c7d
1 Parent(s): 6ef7758
Fix Docling import and usage
Browse files
- dockling_parser/parser.py +9 -10
dockling_parser/parser.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2 |
from pathlib import Path
|
3 |
from typing import Optional, Dict, Any, Union
|
4 |
import magic
|
5 |
-
|
6 |
from datetime import datetime
|
7 |
|
8 |
from .types import ParsedDocument, DocumentMetadata
|
@@ -23,7 +23,6 @@ class DocumentParser:
|
|
23 |
|
24 |
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
25 |
self.config = config or {}
|
26 |
-
self.docling = dl.Docling()
|
27 |
|
28 |
def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
|
29 |
"""
|
@@ -60,19 +59,19 @@ class DocumentParser:
|
|
60 |
)
|
61 |
|
62 |
# Parse document using Docling
|
63 |
-
doc =
|
64 |
|
65 |
# Extract content and structure
|
66 |
content = doc.text
|
67 |
structured_content = {
|
68 |
-
'sections': doc.sections,
|
69 |
-
'paragraphs': doc.paragraphs,
|
70 |
-
'entities': doc.entities,
|
71 |
-
'metadata': doc.metadata
|
72 |
}
|
73 |
|
74 |
# Update metadata with document-specific information
|
75 |
-
if doc.metadata:
|
76 |
metadata.title = doc.metadata.get('title')
|
77 |
metadata.author = doc.metadata.get('author')
|
78 |
metadata.pages = doc.metadata.get('pages')
|
@@ -81,9 +80,9 @@ class DocumentParser:
|
|
81 |
return ParsedDocument(
|
82 |
content=content,
|
83 |
metadata=metadata,
|
84 |
-
raw_text=doc.raw_text,
|
85 |
structured_content=structured_content,
|
86 |
-
confidence_score=
|
87 |
)
|
88 |
|
89 |
except Exception as e:
|
|
|
2 |
from pathlib import Path
|
3 |
from typing import Optional, Dict, Any, Union
|
4 |
import magic
|
5 |
+
from docling import DoclingDocument
|
6 |
from datetime import datetime
|
7 |
|
8 |
from .types import ParsedDocument, DocumentMetadata
|
|
|
23 |
|
24 |
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
25 |
self.config = config or {}
|
|
|
26 |
|
27 |
def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
|
28 |
"""
|
|
|
59 |
)
|
60 |
|
61 |
# Parse document using Docling
|
62 |
+
doc = DoclingDocument.from_file(str(file_path))
|
63 |
|
64 |
# Extract content and structure
|
65 |
content = doc.text
|
66 |
structured_content = {
|
67 |
+
'sections': doc.sections if hasattr(doc, 'sections') else [],
|
68 |
+
'paragraphs': doc.paragraphs if hasattr(doc, 'paragraphs') else [],
|
69 |
+
'entities': doc.entities if hasattr(doc, 'entities') else {},
|
70 |
+
'metadata': doc.metadata if hasattr(doc, 'metadata') else {}
|
71 |
}
|
72 |
|
73 |
# Update metadata with document-specific information
|
74 |
+
if hasattr(doc, 'metadata') and doc.metadata:
|
75 |
metadata.title = doc.metadata.get('title')
|
76 |
metadata.author = doc.metadata.get('author')
|
77 |
metadata.pages = doc.metadata.get('pages')
|
|
|
80 |
return ParsedDocument(
|
81 |
content=content,
|
82 |
metadata=metadata,
|
83 |
+
raw_text=doc.raw_text if hasattr(doc, 'raw_text') else None,
|
84 |
structured_content=structured_content,
|
85 |
+
confidence_score=getattr(doc, 'confidence', 1.0)
|
86 |
)
|
87 |
|
88 |
except Exception as e:
|