import os
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, Any, Union

import magic
from docling.document_converter import DocumentConverter

from .types import ParsedDocument, DocumentMetadata
from .exceptions import UnsupportedFormatError, ParseError


class DocumentParser:
    """
    A multiformat document parser using Docling.

    The file's MIME type is obtained from ``magic.from_file`` and checked
    against ``SUPPORTED_FORMATS`` before the file is handed to a Docling
    ``DocumentConverter``.
    """

    # Maps each accepted MIME type to the short file-type label stored in
    # DocumentMetadata.file_type.
    SUPPORTED_FORMATS = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'text/plain': 'txt',
        'text/html': 'html',
        'text/markdown': 'md',
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Create a parser.

        Args:
            config: Optional configuration mapping. It is stored on the
                instance as ``self.config`` (an empty dict when omitted);
                nothing else in this class reads it.
        """
        self.config = config or {}
        self.converter = DocumentConverter()

    def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
        """
        Parse a document file and return structured content

        Args:
            file_path: Path to the document file

        Returns:
            ParsedDocument object containing parsed content and metadata

        Raises:
            FileNotFoundError: If ``file_path`` does not exist
            UnsupportedFormatError: If the file format is not supported
            ParseError: If parsing fails
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        mime_type = magic.from_file(str(file_path), mime=True)

        if mime_type not in self.SUPPORTED_FORMATS:
            raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")

        try:
            # Get file metadata
            stats = file_path.stat()
            metadata = DocumentMetadata(
                filename=file_path.name,
                file_type=self.SUPPORTED_FORMATS[mime_type],
                size_bytes=stats.st_size,
                # NOTE(review): st_ctime is the creation time on Windows but
                # the last metadata-change time on Unix - confirm this is
                # what "created_at" is meant to carry.
                created_at=datetime.fromtimestamp(stats.st_ctime),
                modified_at=datetime.fromtimestamp(stats.st_mtime),
                mime_type=mime_type,
            )

            # Parse document using Docling
            result = self.converter.convert(str(file_path))
            doc = result.document

            # Extract content using proper methods
            content = doc.export_to_text()

            # Extract structured content; each attribute is optional on the
            # converted document, so fall back to an empty container.
            doc_metadata = getattr(doc, 'metadata', {})
            structured_content = {
                'sections': getattr(doc, 'sections', []),
                'paragraphs': getattr(doc, 'paragraphs', []),
                'entities': getattr(doc, 'entities', {}),
                'metadata': doc_metadata,
            }

            # Get raw text if available. This is best-effort: any failure of
            # the layout-aware export falls back to the plain text above.
            # NOTE(review): presumably not every Docling version accepts
            # include_layout - verify against the installed release.
            try:
                raw_text = doc.export_to_text(include_layout=True)
            except Exception:
                # Deliberately not a bare ``except:`` so KeyboardInterrupt
                # and SystemExit still propagate.
                raw_text = content

            # Update metadata with document-specific information
            if doc_metadata:
                metadata.title = doc_metadata.get('title')
                metadata.author = doc_metadata.get('author')
                metadata.pages = doc_metadata.get('pages')
                metadata.extra.update(doc_metadata)

            return ParsedDocument(
                content=content,
                metadata=metadata,
                raw_text=raw_text,
                structured_content=structured_content,
                confidence_score=getattr(doc, 'confidence', 1.0),
            )

        except Exception as e:
            # Any failure while reading or converting is reported as a
            # ParseError, with the original exception chained as its cause.
            raise ParseError(f"Failed to parse document: {str(e)}") from e

    def supports_format(self, mime_type: str) -> bool:
        """Check if a given MIME type is supported"""
        return mime_type in self.SUPPORTED_FORMATS