hellorahulk's picture
Fix document text extraction using proper Docling methods
070e4b3
raw
history blame
3.78 kB
import os
from pathlib import Path
from typing import Optional, Dict, Any, Union
import magic
from docling.document_converter import DocumentConverter
from datetime import datetime
from .types import ParsedDocument, DocumentMetadata
from .exceptions import UnsupportedFormatError, ParseError
class DocumentParser:
    """
    A multiformat document parser using Docling.

    The file's MIME type is sniffed with ``magic.from_file(..., mime=True)``
    and checked against ``SUPPORTED_FORMATS`` before the file is handed to a
    Docling ``DocumentConverter``.
    """

    # Maps each accepted MIME type to the short label stored as the parsed
    # document's ``file_type``.
    SUPPORTED_FORMATS = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'text/plain': 'txt',
        'text/html': 'html',
        'text/markdown': 'md',
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Create a parser with its own Docling converter.

        Args:
            config: Optional settings dictionary; stored on ``self.config``
                (an empty dict when omitted). It is not read anywhere else
                in this class.
        """
        self.config = config or {}
        self.converter = DocumentConverter()

    def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
        """
        Parse a document file and return structured content.

        Args:
            file_path: Path to the document file.

        Returns:
            ParsedDocument object containing parsed content and metadata.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            UnsupportedFormatError: If the detected MIME type is not a key
                of ``SUPPORTED_FORMATS``.
            ParseError: If anything fails after format detection (stat,
                conversion, export, or metadata handling); the original
                exception is chained as ``__cause__``.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        mime_type = magic.from_file(str(file_path), mime=True)
        if mime_type not in self.SUPPORTED_FORMATS:
            raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")

        try:
            # File-system level metadata.
            stats = file_path.stat()
            metadata = DocumentMetadata(
                filename=file_path.name,
                file_type=self.SUPPORTED_FORMATS[mime_type],
                size_bytes=stats.st_size,
                created_at=self._creation_time(stats),
                modified_at=datetime.fromtimestamp(stats.st_mtime),
                mime_type=mime_type,
            )

            # Parse document using Docling.
            result = self.converter.convert(str(file_path))
            doc = result.document

            content = doc.export_to_text()
            structured_content = self._structured_content(doc)
            raw_text = self._raw_text(doc, fallback=content)

            # Enrich metadata with document-specific information, when the
            # converted document carries a non-empty ``metadata`` mapping.
            doc_metadata = getattr(doc, 'metadata', None)
            if doc_metadata:
                metadata.title = doc_metadata.get('title')
                metadata.author = doc_metadata.get('author')
                metadata.pages = doc_metadata.get('pages')
                metadata.extra.update(doc_metadata)

            return ParsedDocument(
                content=content,
                metadata=metadata,
                raw_text=raw_text,
                structured_content=structured_content,
                confidence_score=getattr(doc, 'confidence', 1.0),
            )
        except Exception as e:
            raise ParseError(f"Failed to parse document: {str(e)}") from e

    @staticmethod
    def _creation_time(stats: Any) -> datetime:
        """Return the file's creation time as a naive local ``datetime``.

        ``st_birthtime`` is preferred because ``st_ctime`` is the time of the
        last metadata change on Unix rather than the creation time. Platforms
        that do not expose ``st_birthtime`` fall back to ``st_ctime``.
        """
        birth = getattr(stats, 'st_birthtime', None)
        timestamp = stats.st_ctime if birth is None else birth
        return datetime.fromtimestamp(timestamp)

    @staticmethod
    def _structured_content(doc: Any) -> Dict[str, Any]:
        """Collect the optional structural attributes of ``doc``.

        Each attribute missing from ``doc`` is replaced by an empty
        container.
        """
        return {
            'sections': getattr(doc, 'sections', []),
            'paragraphs': getattr(doc, 'paragraphs', []),
            'entities': getattr(doc, 'entities', {}),
            'metadata': getattr(doc, 'metadata', {}),
        }

    @staticmethod
    def _raw_text(doc: Any, fallback: str) -> str:
        """Return a layout-aware text export of ``doc``, or ``fallback``.

        NOTE(review): whether the installed Docling version accepts
        ``include_layout`` is not visible from this file; when the call
        fails for any ordinary reason the plain export is reused.
        """
        try:
            return doc.export_to_text(include_layout=True)
        except Exception:
            # Best-effort only. ``Exception`` (not a bare ``except``) so that
            # KeyboardInterrupt / SystemExit are not silently swallowed.
            return fallback

    def supports_format(self, mime_type: str) -> bool:
        """Check if a given MIME type is supported."""
        return mime_type in self.SUPPORTED_FORMATS