Spaces:

hellorahulk
/

docling_free

Running

App Files Files Community

docling_free / app.py

hellorahulk

Fix file handling with filepath type and better error handling

cca0a5d 4 months ago

raw

history blame

5.7 kB

	import os
	import gradio as gr
	import pandas as pd
	from dockling_parser import DocumentParser
	from dockling_parser.exceptions import ParserError
	import tempfile
	import mimetypes

	# Heading text: used as the Blocks page title and rendered as the top-level Markdown heading.
	TITLE = "📄 Smart Document Parser"
	# Introductory blurb rendered as Markdown directly below the heading.
	DESCRIPTION = """
	A powerful document parsing application that automatically extracts structured information from various document formats.
	Upload any document (PDF, DOCX, TXT, HTML, Markdown) and get structured information extracted automatically.
	"""

	# Closing Markdown (feature list and credits); it is the last component added to the UI.
	ARTICLE = """
	## 🚀 Features

	- Multiple Format Support: PDF, DOCX, TXT, HTML, and Markdown
	- Rich Information Extraction
	- Smart Processing with Confidence Scoring
	- Automatic Format Detection

	Made with ❤️ using Docling and Gradio
	"""

	# Single module-level DocumentParser instance; process_document calls parser.parse() on it
	# for every request, so it is created once at import time rather than per upload.
	parser = DocumentParser()

	def process_document(file_obj):
	    """Parse an uploaded document and return the five values shown in the UI.

	    Args:
	        file_obj: The upload to process. May be ``None`` (nothing uploaded),
	            a filesystem path (``str`` or ``os.PathLike``) pointing at an
	            existing file, a file-like object exposing ``read()`` (and
	            optionally ``name``), or raw ``bytes``/``str`` content.

	    Returns:
	        A 5-tuple ``(content, metadata_df, sections_text, entities_text,
	        confidence_text)``. When processing fails, the first element holds
	        the error message and the others are empty placeholders;
	        ``Exception`` subclasses are reported this way instead of raised.
	    """
	    if file_obj is None:
	        return _error_outputs("Error: No file uploaded")

	    temp_path = None
	    try:
	        if isinstance(file_obj, (str, os.PathLike)) and os.path.isfile(file_obj):
	            # The upload is already a file on disk, so parse it in place.
	            # Treating the path string as document content would make the
	            # parser read the path text instead of the document.
	            document_path = os.fspath(file_obj)
	        else:
	            # In-memory content: the parser needs a real file to open.
	            temp_path = _write_upload_to_temp_file(file_obj)
	            document_path = temp_path

	        result = parser.parse(document_path)

	        # One row per metadata field; values are stringified for display.
	        metadata_df = pd.DataFrame([
	            {"Property": key, "Value": str(value)}
	            for key, value in result.metadata.dict().items()
	        ])

	        sections = result.structured_content.get('sections', [])
	        sections_text = "\n\n".join(
	            f"Section {index}:\n{section}"
	            for index, section in enumerate(sections, start=1)
	        )

	        entities = result.structured_content.get('entities', {})
	        if entities:
	            # str() guards against non-string entity values breaking join().
	            entities_text = "\n".join(
	                f"{entity_type}: {', '.join(map(str, entity_values))}"
	                for entity_type, entity_values in entities.items()
	            )
	        else:
	            entities_text = "No entities detected"

	        return (
	            result.content,
	            metadata_df,
	            sections_text,
	            entities_text,
	            f"Confidence Score: {result.confidence_score:.2f}",
	        )
	    except ParserError as e:
	        return _error_outputs(f"Error parsing document: {e}")
	    except Exception as e:
	        # UI boundary: report the failure in the output instead of crashing.
	        return _error_outputs(f"Unexpected error: {e}")
	    finally:
	        # Only files created here are removed; a caller-supplied path is
	        # never deleted.
	        if temp_path:
	            _remove_file_quietly(temp_path)


	def _error_outputs(message):
	    """Build the 5-tuple returned when no document could be processed."""
	    return (
	        message,
	        pd.DataFrame(),
	        "No sections available",
	        "No entities available",
	        "Confidence Score: 0.0",
	    )


	def _write_upload_to_temp_file(file_obj):
	    """Write in-memory upload content to a temp file and return its path.

	    The suffix is taken from ``file_obj.name`` when that is a string with an
	    extension, and defaults to ``.pdf`` otherwise. The caller owns the
	    returned file and must delete it. Any write error is re-raised after the
	    partially written file has been removed.
	    """
	    name = getattr(file_obj, 'name', None)
	    if not isinstance(name, str):
	        name = "uploaded_file.pdf"
	    extension = os.path.splitext(name)[1].lower() or '.pdf'

	    # Read before creating the temp file so a failing read leaves nothing behind.
	    content = file_obj.read() if hasattr(file_obj, 'read') else file_obj
	    if isinstance(content, str):
	        content = content.encode('utf-8')

	    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=extension)
	    try:
	        with tmp_file:
	            tmp_file.write(content)
	    except Exception:
	        _remove_file_quietly(tmp_file.name)
	        raise
	    return tmp_file.name


	def _remove_file_quietly(path):
	    """Delete ``path`` on a best-effort basis, ignoring filesystem errors."""
	    try:
	        os.unlink(path)
	    except OSError:
	        # Already gone or not removable; cleanup must not mask the real result.
	        pass

	# Create Gradio interface: a Blocks app bound to `iface`, using TITLE as the
	# page title and the Soft theme.
	# NOTE(review): leading indentation is missing in this copy of the file, so the
	# nesting of the Row/Column/Tabs containers cannot be read off the code below —
	# confirm the layout against the original app.py.
	with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
	# Page heading and introduction, both rendered as Markdown.
	gr.Markdown(f"# {TITLE}")
	gr.Markdown(DESCRIPTION)

	with gr.Row():
	with gr.Column():
	# Upload widget restricted to the five listed extensions.
	# NOTE(review): per the Gradio File docs, type="filepath" hands the click
	# handler a path string rather than the file's bytes — confirm for the
	# installed Gradio version.
	file_input = gr.File(
	label="Upload Document",
	file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
	type="filepath" # Changed from binary to filepath
	)
	# Button whose click event is wired to process_document further down.
	submit_btn = gr.Button("Process Document", variant="primary")

	with gr.Column():
	# Receives the fifth value returned by process_document (the confidence text).
	confidence = gr.Textbox(label="Processing Confidence")

	# One tab for each of the first four values returned by process_document.
	with gr.Tabs():
	with gr.TabItem("📝 Content"):
	content_output = gr.Textbox(
	label="Extracted Content",
	lines=10,
	max_lines=30
	)

	with gr.TabItem("📊 Metadata"):
	# Column headers match the "Property"/"Value" keys used to build the metadata DataFrame.
	metadata_output = gr.Dataframe(
	label="Document Metadata",
	headers=["Property", "Value"]
	)

	with gr.TabItem("📑 Sections"):
	sections_output = gr.Textbox(
	label="Document Sections",
	lines=10,
	max_lines=30
	)

	with gr.TabItem("🏷️ Entities"):
	entities_output = gr.Textbox(
	label="Named Entities",
	lines=5,
	max_lines=15
	)

	# Handle file submission: clicking the button calls process_document with the
	# uploaded file; the order of `outputs` must match the order of the 5-tuple it returns.
	submit_btn.click(
	fn=process_document,
	inputs=[file_input],
	outputs=[
	content_output,
	metadata_output,
	sections_output,
	entities_output,
	confidence
	]
	)

	# Static list of supported formats; mirrors the file_types filter on the upload widget.
	gr.Markdown("""
	### 📌 Supported Formats
	- PDF Documents (*.pdf)
	- Word Documents (*.docx)
	- Text Files (*.txt)
	- HTML Files (*.html)
	- Markdown Files (*.md)
	""")

	gr.Markdown(ARTICLE)

	# Launch the app only when this file is run as a script, not when it is imported.
	if __name__ == "__main__":
	iface.launch()