docling_free / app.py
hellorahulk's picture
Fix file handling with filepath type and better error handling
cca0a5d
raw
history blame
5.7 kB
import os
import gradio as gr
import pandas as pd
from dockling_parser import DocumentParser
from dockling_parser.exceptions import ParserError
import tempfile
import mimetypes
# User-facing copy for the page. The emoji are stored as real Unicode
# characters; the previous text held mis-decoded UTF-8 bytes (mojibake).
TITLE = "📄 Smart Document Parser"

# Introductory blurb rendered as Markdown.
DESCRIPTION = """
A powerful document parsing application that automatically extracts structured information from various document formats.
Upload any document (PDF, DOCX, TXT, HTML, Markdown) and get structured information extracted automatically.
"""

# Footer rendered as Markdown. The blank line before "Made with" stops
# the footer from being absorbed into the last bullet as a lazy continuation
# line.
ARTICLE = """
## 🚀 Features
- Multiple Format Support: PDF, DOCX, TXT, HTML, and Markdown
- Rich Information Extraction
- Smart Processing with Confidence Scoring
- Automatic Format Detection

Made with ❤️ using Docling and Gradio
"""
# Initialize the document parser once at import time; process_document()
# uses this module-level instance for every upload it handles.
parser = DocumentParser()
def _failure_outputs(message):
    """Return the five UI outputs for a failed run, with *message* first."""
    return (
        message,
        pd.DataFrame(),
        "No sections available",
        "No entities available",
        "Confidence Score: 0.0",
    )


def _read_upload(file_obj):
    """Return ``(extension, payload)`` for an upload that is not a file path.

    The extension is taken from the object's ``name`` attribute when it has
    one and falls back to ``.pdf``. The payload is whatever ``read()``
    returns, or the object itself when it has no ``read``; text is encoded
    as UTF-8 so the result is always bytes-like.
    """
    original_filename = getattr(file_obj, "name", "uploaded_file.pdf")
    # Default to PDF if the name carries no extension.
    extension = os.path.splitext(str(original_filename))[1].lower() or ".pdf"

    content = file_obj.read() if hasattr(file_obj, "read") else file_obj
    if isinstance(content, (bytes, bytearray)):
        payload = content
    else:
        payload = content.encode("utf-8")
    return extension, payload


def process_document(file_obj):
    """Process an uploaded document and return structured information.

    Args:
        file_obj: The upload to parse. May be ``None`` (nothing uploaded), a
            path to an existing file (``str`` or ``os.PathLike``), a
            file-like object exposing ``read()``, or raw ``bytes``/``str``
            content.

    Returns:
        A 5-tuple ``(content, metadata_df, sections_text, entities_text,
        confidence_text)``. On any failure the first element carries the
        error message and the remaining elements are placeholders; this
        function does not raise.
    """
    if file_obj is None:
        return _failure_outputs("Error: No file uploaded")

    temp_path = None
    try:
        if isinstance(file_obj, (str, os.PathLike)) and os.path.isfile(file_obj):
            # A path to a file already on disk: parse it in place so the
            # parser sees the real bytes and the real extension. The file is
            # not ours, so it is never deleted here.
            document_path = os.fspath(file_obj)
        else:
            extension, payload = _read_upload(file_obj)
            with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
                # Record the path before writing so the `finally` clause
                # removes the file even when the write itself fails.
                temp_path = tmp_file.name
                tmp_file.write(payload)
            document_path = temp_path

        # Parse the document
        result = parser.parse(document_path)

        # Metadata as a two-column table
        metadata_df = pd.DataFrame(
            [
                {"Property": key, "Value": str(value)}
                for key, value in result.metadata.dict().items()
            ]
        )

        # Structured sections, numbered from 1
        sections = result.structured_content.get('sections', [])
        sections_text = "\n\n".join(
            f"Section {index}:\n{section}"
            for index, section in enumerate(sections, start=1)
        )

        # Named entities, one line per entity type; str() keeps the join from
        # failing when an entity is not already a string.
        entities = result.structured_content.get('entities', {})
        if entities:
            entities_text = "\n".join(
                f"{entity_type}: {', '.join(str(entity) for entity in entities_list)}"
                for entity_type, entities_list in entities.items()
            )
        else:
            entities_text = "No entities detected"

        return (
            result.content,
            metadata_df,
            sections_text,
            entities_text,
            f"Confidence Score: {result.confidence_score:.2f}",
        )
    except ParserError as e:
        return _failure_outputs(f"Error parsing document: {e}")
    except Exception as e:
        # UI boundary: report the failure in the content box instead of
        # letting the exception escape the callback.
        return _failure_outputs(f"Unexpected error: {e}")
    finally:
        # Best-effort removal of the temporary copy, if one was created.
        if temp_path:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
# Create Gradio interface: an upload column and a confidence read-out on top,
# one tab per output below. Tab labels and headings use real Unicode emoji
# (the previous text held mis-decoded UTF-8 bytes).
with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
    gr.Markdown(f"# {TITLE}")
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
                type="filepath"  # Changed from binary to filepath
            )
            submit_btn = gr.Button("Process Document", variant="primary")

        with gr.Column():
            confidence = gr.Textbox(label="Processing Confidence")

    with gr.Tabs():
        with gr.TabItem("📝 Content"):
            content_output = gr.Textbox(
                label="Extracted Content",
                lines=10,
                max_lines=30
            )

        with gr.TabItem("📊 Metadata"):
            metadata_output = gr.Dataframe(
                label="Document Metadata",
                headers=["Property", "Value"]
            )

        with gr.TabItem("📑 Sections"):
            sections_output = gr.Textbox(
                label="Document Sections",
                lines=10,
                max_lines=30
            )

        with gr.TabItem("🏷️ Entities"):
            entities_output = gr.Textbox(
                label="Named Entities",
                lines=5,
                max_lines=15
            )

    # Handle file submission. The outputs are listed in the same order as the
    # tuple returned by process_document: content, metadata, sections,
    # entities, confidence.
    submit_btn.click(
        fn=process_document,
        inputs=[file_input],
        outputs=[
            content_output,
            metadata_output,
            sections_output,
            entities_output,
            confidence
        ]
    )

    gr.Markdown("""
### 📌 Supported Formats
- PDF Documents (*.pdf)
- Word Documents (*.docx)
- Text Files (*.txt)
- HTML Files (*.html)
- Markdown Files (*.md)
""")
    gr.Markdown(ARTICLE)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()