# Hugging Face Space: hellorahulk/docling_free (status: Running)
# File size: 3,777 bytes
# Commits touching this file: 15fdcff, 5c197b6, 070e4b3, 6c30c7d

import os
from pathlib import Path
from typing import Optional, Dict, Any, Union
import magic
from docling.document_converter import DocumentConverter
from datetime import datetime

from .types import ParsedDocument, DocumentMetadata
from .exceptions import UnsupportedFormatError, ParseError

class DocumentParser:
    """
    A multiformat document parser using Docling.

    A file's MIME type is detected with ``magic.from_file`` and checked
    against ``SUPPORTED_FORMATS`` before the file is handed to a Docling
    ``DocumentConverter``.
    """

    # Maps each accepted MIME type to the short label stored in
    # ``DocumentMetadata.file_type``.
    SUPPORTED_FORMATS = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'text/plain': 'txt',
        'text/html': 'html',
        'text/markdown': 'md',
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Create a parser with its own ``DocumentConverter``.

        Args:
            config: Optional settings dictionary, stored on ``self.config``
                (an empty dict when omitted or falsy). It is not forwarded
                to the converter by this constructor.
        """
        self.config = config or {}
        self.converter = DocumentConverter()

    def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
        """
        Parse a document file and return structured content.

        Args:
            file_path: Path to the document file

        Returns:
            ParsedDocument object containing parsed content and metadata

        Raises:
            FileNotFoundError: If the path does not exist
            UnsupportedFormatError: If the file format is not supported
            ParseError: If anything fails after the format check; the
                original exception is chained as ``__cause__``
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        mime_type = magic.from_file(str(file_path), mime=True)
        if mime_type not in self.SUPPORTED_FORMATS:
            raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")

        try:
            metadata = self._build_file_metadata(file_path, mime_type)

            # Parse document using Docling
            result = self.converter.convert(str(file_path))
            doc = result.document

            content = doc.export_to_text()
            structured_content = self._extract_structured_content(doc)
            raw_text = self._export_raw_text(doc, content)

            # Update metadata with document-specific information
            doc_metadata = getattr(doc, 'metadata', None)
            if doc_metadata:
                metadata.title = doc_metadata.get('title')
                metadata.author = doc_metadata.get('author')
                metadata.pages = doc_metadata.get('pages')
                metadata.extra.update(doc_metadata)

            return ParsedDocument(
                content=content,
                metadata=metadata,
                raw_text=raw_text,
                structured_content=structured_content,
                # Fall back to full confidence when the document does not
                # expose a ``confidence`` attribute.
                confidence_score=getattr(doc, 'confidence', 1.0),
            )

        except Exception as e:
            raise ParseError(f"Failed to parse document: {e}") from e

    def supports_format(self, mime_type: str) -> bool:
        """Check if a given MIME type is supported"""
        return mime_type in self.SUPPORTED_FORMATS

    def _build_file_metadata(self, file_path: Path, mime_type: str) -> DocumentMetadata:
        """Build file-level metadata (name, type, size, timestamps) from ``stat()``."""
        stats = file_path.stat()
        return DocumentMetadata(
            filename=file_path.name,
            file_type=self.SUPPORTED_FORMATS[mime_type],
            size_bytes=stats.st_size,
            # NOTE(review): st_ctime is the creation time only on some
            # platforms (on Unix it is the last metadata change) - confirm
            # this is the intended meaning of ``created_at``.
            created_at=datetime.fromtimestamp(stats.st_ctime),
            modified_at=datetime.fromtimestamp(stats.st_mtime),
            mime_type=mime_type,
        )

    @staticmethod
    def _extract_structured_content(doc: Any) -> Dict[str, Any]:
        """Collect the optional structural attributes of ``doc``, with empty defaults."""
        return {
            'sections': getattr(doc, 'sections', []),
            'paragraphs': getattr(doc, 'paragraphs', []),
            'entities': getattr(doc, 'entities', {}),
            'metadata': getattr(doc, 'metadata', {}),
        }

    @staticmethod
    def _export_raw_text(doc: Any, fallback: str) -> str:
        """Return the layout-preserving text export, or ``fallback`` if that export fails."""
        try:
            return doc.export_to_text(include_layout=True)
        except Exception:
            # Best-effort: the export may not accept ``include_layout``.
            # Catch ``Exception`` rather than using a bare ``except`` so that
            # KeyboardInterrupt and SystemExit still propagate.
            return fallback