# Spaces: Running
import os | |
from pathlib import Path | |
from typing import Optional, Dict, Any, Union | |
import magic | |
from docling.document_converter import DocumentConverter | |
from datetime import datetime | |
from .types import ParsedDocument, DocumentMetadata | |
from .exceptions import UnsupportedFormatError, ParseError | |
class DocumentParser:
    """
    A multiformat document parser using Docling.

    The MIME type of a file is detected with ``magic.from_file`` and checked
    against ``SUPPORTED_FORMATS``. Accepted files are converted with a Docling
    ``DocumentConverter`` and returned as a ``ParsedDocument``.
    """

    # Maps each accepted MIME type to the short label stored in
    # ``DocumentMetadata.file_type``.
    SUPPORTED_FORMATS = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'text/plain': 'txt',
        'text/html': 'html',
        'text/markdown': 'md',
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Create a parser with its own Docling converter.

        Args:
            config: Optional settings mapping. It is stored as ``self.config``
                (an empty dict when omitted or falsy); nothing else in this
                class reads it.
        """
        self.config = config or {}
        self.converter = DocumentConverter()

    def parse(self, file_path: Union[str, Path]) -> "ParsedDocument":
        """
        Parse a document file and return structured content.

        Args:
            file_path: Path to the document file.

        Returns:
            ParsedDocument object containing parsed content and metadata.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            UnsupportedFormatError: If the detected MIME type is not a key of
                ``SUPPORTED_FORMATS``.
            ParseError: If anything fails while collecting metadata or
                converting the document; the original exception is chained.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        mime_type = magic.from_file(str(file_path), mime=True)
        if mime_type not in self.SUPPORTED_FORMATS:
            raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")

        try:
            # Get file metadata.
            # NOTE(review): st_ctime is the inode-change time on most Unix
            # systems rather than the creation time - confirm that this is
            # acceptable for ``created_at``.
            stats = file_path.stat()
            metadata = DocumentMetadata(
                filename=file_path.name,
                file_type=self.SUPPORTED_FORMATS[mime_type],
                size_bytes=stats.st_size,
                created_at=datetime.fromtimestamp(stats.st_ctime),
                modified_at=datetime.fromtimestamp(stats.st_mtime),
                mime_type=mime_type,
            )

            # Parse document using Docling.
            result = self.converter.convert(str(file_path))
            doc = result.document

            content = doc.export_to_text()

            # Each attribute is optional on the converted document; a missing
            # one is replaced by an empty container of the matching kind.
            structured_content = {
                'sections': getattr(doc, 'sections', []),
                'paragraphs': getattr(doc, 'paragraphs', []),
                'entities': getattr(doc, 'entities', {}),
                'metadata': getattr(doc, 'metadata', {}),
            }

            raw_text = self._export_raw_text(doc, content)

            # Update metadata with document-specific information.
            doc_metadata = getattr(doc, 'metadata', None)
            if doc_metadata:
                metadata.title = doc_metadata.get('title')
                metadata.author = doc_metadata.get('author')
                metadata.pages = doc_metadata.get('pages')
                metadata.extra.update(doc_metadata)

            return ParsedDocument(
                content=content,
                metadata=metadata,
                raw_text=raw_text,
                structured_content=structured_content,
                confidence_score=getattr(doc, 'confidence', 1.0),
            )
        except Exception as e:
            raise ParseError(f"Failed to parse document: {str(e)}") from e

    @staticmethod
    def _export_raw_text(doc: Any, fallback: str) -> str:
        """
        Return the layout-preserving text export of ``doc``, or ``fallback``.

        The export is best effort: any ordinary exception from
        ``doc.export_to_text(include_layout=True)`` yields ``fallback``.
        Only ``Exception`` is caught, so ``KeyboardInterrupt`` and
        ``SystemExit`` still propagate.

        NOTE(review): confirm that the installed Docling version accepts the
        ``include_layout`` keyword; if it does not, the fallback is always used.

        Args:
            doc: Converted document exposing ``export_to_text``.
            fallback: Text to return when the layout export fails.

        Returns:
            The layout export result, or ``fallback`` on failure.
        """
        try:
            return doc.export_to_text(include_layout=True)
        except Exception:
            return fallback

    def supports_format(self, mime_type: str) -> bool:
        """Check if a given MIME type is supported."""
        return mime_type in self.SUPPORTED_FORMATS