Spaces:

hellorahulk
/

docling_free

Running

App Files Files Community

docling_free / dockling_parser /parser.py

hellorahulk

Fix document text extraction using proper Docling methods

070e4b3 4 months ago

raw

history blame

3.78 kB

	import os
	from pathlib import Path
	from typing import Optional, Dict, Any, Union
	import magic
	from docling.document_converter import DocumentConverter
	from datetime import datetime

	from .types import ParsedDocument, DocumentMetadata
	from .exceptions import UnsupportedFormatError, ParseError

	class DocumentParser:
	    """Multi-format document parser built on Docling's ``DocumentConverter``.

	    A file's MIME type is detected with ``magic`` and checked against
	    ``SUPPORTED_FORMATS`` before the file is handed to Docling.
	    """

	    # Accepted MIME types mapped to the short label stored as
	    # ``DocumentMetadata.file_type``.
	    SUPPORTED_FORMATS = {
	        'application/pdf': 'pdf',
	        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
	        'text/plain': 'txt',
	        'text/html': 'html',
	        'text/markdown': 'md'
	    }

	    def __init__(self, config: Optional[Dict[str, Any]] = None):
	        """Create a parser with its own ``DocumentConverter``.

	        Args:
	            config: Optional settings mapping kept on ``self.config``. A falsy
	                value (``None`` or an empty dict) is replaced by a new dict.
	        """
	        self.config = config or {}
	        self.converter = DocumentConverter()

	    def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
	        """Parse a document file and return its content and metadata.

	        Args:
	            file_path: Path to the document file.

	        Returns:
	            ParsedDocument holding the exported text, file/document metadata,
	            raw text, structured content and a confidence score (1.0 when the
	            Docling document exposes no ``confidence`` attribute).

	        Raises:
	            FileNotFoundError: If ``file_path`` does not exist.
	            UnsupportedFormatError: If the detected MIME type is not in
	                ``SUPPORTED_FORMATS``.
	            ParseError: If anything fails while collecting metadata or
	                converting the document; the original exception is chained.
	        """
	        file_path = Path(file_path)
	        if not file_path.exists():
	            raise FileNotFoundError(f"File not found: {file_path}")

	        mime_type = magic.from_file(str(file_path), mime=True)
	        if mime_type not in self.SUPPORTED_FORMATS:
	            raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")

	        try:
	            metadata = self._build_metadata(file_path, mime_type)

	            # Parse document using Docling
	            result = self.converter.convert(str(file_path))
	            doc = result.document

	            content = doc.export_to_text()

	            # Attributes the converted document does not expose fall back to
	            # empty containers so the result always has the same keys.
	            structured_content = {
	                'sections': getattr(doc, 'sections', []),
	                'paragraphs': getattr(doc, 'paragraphs', []),
	                'entities': getattr(doc, 'entities', {}),
	                'metadata': getattr(doc, 'metadata', {})
	            }

	            raw_text = self._export_raw_text(doc, content)

	            # Enrich the filesystem metadata with document-level fields when
	            # the document carries a non-empty metadata mapping.
	            doc_metadata = getattr(doc, 'metadata', None)
	            if doc_metadata:
	                metadata.title = doc_metadata.get('title')
	                metadata.author = doc_metadata.get('author')
	                metadata.pages = doc_metadata.get('pages')
	                metadata.extra.update(doc_metadata)

	            return ParsedDocument(
	                content=content,
	                metadata=metadata,
	                raw_text=raw_text,
	                structured_content=structured_content,
	                confidence_score=getattr(doc, 'confidence', 1.0)
	            )

	        except Exception as e:
	            raise ParseError(f"Failed to parse document: {str(e)}") from e

	    def _build_metadata(self, file_path: Path, mime_type: str) -> DocumentMetadata:
	        """Build ``DocumentMetadata`` from the file's ``stat`` information."""
	        stats = file_path.stat()
	        return DocumentMetadata(
	            filename=file_path.name,
	            file_type=self.SUPPORTED_FORMATS[mime_type],
	            size_bytes=stats.st_size,
	            # NOTE: on Unix ``st_ctime`` is the last metadata-change time, not
	            # the creation time; it is the creation time only on Windows.
	            created_at=datetime.fromtimestamp(stats.st_ctime),
	            modified_at=datetime.fromtimestamp(stats.st_mtime),
	            mime_type=mime_type
	        )

	    @staticmethod
	    def _export_raw_text(doc: Any, fallback: str) -> str:
	        """Return the layout-aware text export, or ``fallback`` if it fails.

	        NOTE(review): confirm that the installed Docling's ``export_to_text``
	        accepts ``include_layout``; if it does not, the call raises and the
	        fallback is always used.
	        """
	        try:
	            return doc.export_to_text(include_layout=True)
	        except Exception:
	            # Best-effort only. ``Exception`` rather than a bare ``except`` so
	            # KeyboardInterrupt and SystemExit still propagate.
	            return fallback

	    def supports_format(self, mime_type: str) -> bool:
	        """Check if a given MIME type is supported"""
	        return mime_type in self.SUPPORTED_FORMATS