File size: 3,777 Bytes
15fdcff
 
 
 
5c197b6
15fdcff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c197b6
15fdcff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c197b6
 
15fdcff
070e4b3
 
 
 
15fdcff
6c30c7d
 
 
 
15fdcff
 
070e4b3
 
 
 
 
 
15fdcff
6c30c7d
15fdcff
 
 
 
 
 
 
 
070e4b3
15fdcff
6c30c7d
15fdcff
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import os
from pathlib import Path
from typing import Optional, Dict, Any, Union
import magic
from docling.document_converter import DocumentConverter
from datetime import datetime

from .types import ParsedDocument, DocumentMetadata
from .exceptions import UnsupportedFormatError, ParseError

class DocumentParser:
    """
    A multiformat document parser using Docling.

    The file type is detected from the file's content via
    ``magic.from_file`` and only MIME types listed in ``SUPPORTED_FORMATS``
    are passed on to the Docling ``DocumentConverter``.
    """

    # Maps every accepted MIME type to the short label stored in
    # ``DocumentMetadata.file_type``.
    # NOTE(review): libmagic may report Markdown files as 'text/plain', in
    # which case they get the 'txt' label -- confirm that this is intended.
    SUPPORTED_FORMATS = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'text/plain': 'txt',
        'text/html': 'html',
        'text/markdown': 'md',
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Create a parser with its own Docling converter.

        Args:
            config: Optional settings dictionary. It is stored on
                ``self.config`` (an empty dict when omitted or falsy) and is
                not read anywhere else in this class.
        """
        self.config = config or {}
        self.converter = DocumentConverter()

    def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
        """
        Parse a document file and return structured content

        Args:
            file_path: Path to the document file

        Returns:
            ParsedDocument object containing parsed content and metadata

        Raises:
            FileNotFoundError: If ``file_path`` does not exist
            UnsupportedFormatError: If the file format is not supported
            ParseError: If parsing fails
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Detection is content-based; the file extension is not consulted.
        mime_type = magic.from_file(str(file_path), mime=True)
        if mime_type not in self.SUPPORTED_FORMATS:
            raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")

        # Everything below is wrapped so that any failure (stat, conversion,
        # export, metadata handling) reaches the caller as a ParseError.
        try:
            # Get file metadata
            stats = file_path.stat()
            # On Unix st_ctime is the time of the last inode metadata change,
            # not the creation time. Use the real birth time where the
            # platform exposes it and fall back to st_ctime elsewhere.
            created_ts = getattr(stats, 'st_birthtime', stats.st_ctime)
            metadata = DocumentMetadata(
                filename=file_path.name,
                file_type=self.SUPPORTED_FORMATS[mime_type],
                size_bytes=stats.st_size,
                created_at=datetime.fromtimestamp(created_ts),
                modified_at=datetime.fromtimestamp(stats.st_mtime),
                mime_type=mime_type
            )

            # Parse document using Docling
            result = self.converter.convert(str(file_path))
            doc = result.document

            # Plain-text export of the converted document
            content = doc.export_to_text()

            # Optional attributes: each one falls back to an empty container
            # when the converted document does not provide it.
            doc_metadata = getattr(doc, 'metadata', None)
            structured_content = {
                'sections': getattr(doc, 'sections', []),
                'paragraphs': getattr(doc, 'paragraphs', []),
                'entities': getattr(doc, 'entities', {}),
                'metadata': getattr(doc, 'metadata', {})
            }

            # Best-effort layout-aware export; reuse the plain export when it
            # fails (for example if the installed Docling version does not
            # accept ``include_layout`` -- TODO confirm). Only ``Exception``
            # is caught so KeyboardInterrupt/SystemExit still propagate.
            try:
                raw_text = doc.export_to_text(include_layout=True)
            except Exception:
                raw_text = content

            # Update metadata with document-specific information
            # (assumes doc.metadata is dict-like when present -- it is read
            # with .get() and merged into metadata.extra).
            if doc_metadata:
                metadata.title = doc_metadata.get('title')
                metadata.author = doc_metadata.get('author')
                metadata.pages = doc_metadata.get('pages')
                metadata.extra.update(doc_metadata)

            return ParsedDocument(
                content=content,
                metadata=metadata,
                raw_text=raw_text,
                structured_content=structured_content,
                # Defaults to full confidence when the document carries no
                # ``confidence`` attribute.
                confidence_score=getattr(doc, 'confidence', 1.0)
            )

        except Exception as e:
            raise ParseError(f"Failed to parse document: {e}") from e

    def supports_format(self, mime_type: str) -> bool:
        """
        Check if a given MIME type is supported

        Args:
            mime_type: MIME type string, e.g. ``'application/pdf'``

        Returns:
            True when ``mime_type`` is a key of ``SUPPORTED_FORMATS``
        """
        return mime_type in self.SUPPORTED_FORMATS