Spaces:
Running
Running
File size: 3,777 Bytes
15fdcff 5c197b6 15fdcff 5c197b6 15fdcff 5c197b6 15fdcff 070e4b3 15fdcff 6c30c7d 15fdcff 070e4b3 15fdcff 6c30c7d 15fdcff 070e4b3 15fdcff 6c30c7d 15fdcff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import os
from pathlib import Path
from typing import Optional, Dict, Any, Union
import magic
from docling.document_converter import DocumentConverter
from datetime import datetime
from .types import ParsedDocument, DocumentMetadata
from .exceptions import UnsupportedFormatError, ParseError
class DocumentParser:
    """
    A multiformat document parser using Docling.

    The MIME type of the input file is detected with ``magic``; files whose
    type is not listed in ``SUPPORTED_FORMATS`` are rejected, and everything
    else is handed to a Docling ``DocumentConverter``.
    """

    # Maps each accepted MIME type to the short label stored in
    # DocumentMetadata.file_type.
    # NOTE(review): libmagic may report Markdown files as 'text/plain', in
    # which case they would be labelled 'txt' -- confirm against real inputs.
    SUPPORTED_FORMATS = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'text/plain': 'txt',
        'text/html': 'html',
        'text/markdown': 'md',
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Create a parser with its own Docling converter.

        Args:
            config: Optional settings dictionary. It is stored on
                ``self.config`` (an empty dict when omitted or falsy); no
                other code in this class reads it.
        """
        self.config = config or {}
        self.converter = DocumentConverter()

    def parse(self, file_path: Union[str, Path]) -> "ParsedDocument":
        """
        Parse a document file and return structured content.

        Args:
            file_path: Path to the document file.

        Returns:
            ParsedDocument object containing the exported text, file and
            document metadata, a raw-text variant, a dictionary of structured
            content, and a confidence score (1.0 when the converted document
            exposes no ``confidence`` attribute).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            UnsupportedFormatError: If the detected MIME type is not in
                ``SUPPORTED_FORMATS``.
            ParseError: If anything fails while collecting metadata or
                converting the document; the original exception is chained.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Detection is content-based (libmagic), not extension-based.
        mime_type = magic.from_file(str(file_path), mime=True)
        if mime_type not in self.SUPPORTED_FORMATS:
            raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")

        try:
            # File-system metadata.
            # NOTE: st_ctime is the metadata-change time on Unix, not the
            # creation time; it is kept because callers receive it as
            # ``created_at``.
            stats = file_path.stat()
            metadata = DocumentMetadata(
                filename=file_path.name,
                file_type=self.SUPPORTED_FORMATS[mime_type],
                size_bytes=stats.st_size,
                created_at=datetime.fromtimestamp(stats.st_ctime),
                modified_at=datetime.fromtimestamp(stats.st_mtime),
                mime_type=mime_type,
            )

            # Convert with Docling and export the plain text.
            result = self.converter.convert(str(file_path))
            doc = result.document
            content = doc.export_to_text()

            # Optional attributes: fall back to empty containers when the
            # converted document does not expose them.
            structured_content = {
                'sections': getattr(doc, 'sections', []),
                'paragraphs': getattr(doc, 'paragraphs', []),
                'entities': getattr(doc, 'entities', {}),
                'metadata': getattr(doc, 'metadata', {}),
            }

            # Best-effort layout-preserving export; on any failure reuse the
            # plain text. ``Exception`` (rather than a bare ``except``) keeps
            # KeyboardInterrupt/SystemExit propagating.
            # NOTE(review): whether export_to_text accepts ``include_layout``
            # depends on the installed Docling version -- confirm.
            try:
                raw_text = doc.export_to_text(include_layout=True)
            except Exception:
                raw_text = content

            # Enrich the file metadata with document-level fields, if any.
            # presumably doc.metadata is dict-like (``.get`` is called on it);
            # verify against the Docling document model.
            doc_metadata = structured_content['metadata']
            if doc_metadata:
                metadata.title = doc_metadata.get('title')
                metadata.author = doc_metadata.get('author')
                metadata.pages = doc_metadata.get('pages')
                metadata.extra.update(doc_metadata)

            return ParsedDocument(
                content=content,
                metadata=metadata,
                raw_text=raw_text,
                structured_content=structured_content,
                confidence_score=getattr(doc, 'confidence', 1.0),
            )
        except Exception as e:
            # Single failure type for callers; the cause stays attached.
            raise ParseError(f"Failed to parse document: {str(e)}") from e

    def supports_format(self, mime_type: str) -> bool:
        """Check if a given MIME type is supported."""
        return mime_type in self.SUPPORTED_FORMATS