"""Gradio front-end for the Smart Document Parser.

Accepts either an uploaded file or a document URL, runs it through
``DocumentParser`` and shows the extracted content, metadata, sections,
entities and a confidence score.
"""

import mimetypes
import os
import tempfile
import traceback
from typing import Optional
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
import requests

from dockling_parser import DocumentParser
from dockling_parser.exceptions import ParserError, UnsupportedFormatError

TITLE = "📄 Smart Document Parser"

DESCRIPTION = """
A powerful document parsing application that automatically extracts structured information from various document formats.
Upload a document or provide a URL (PDF, DOCX, TXT, HTML, Markdown) and get structured information automatically.
"""

ARTICLE = """
## 🚀 Features

- Multiple Format Support: PDF, DOCX, TXT, HTML, and Markdown
- Support for File Upload and URLs
- Rich Information Extraction
- Smart Processing with Confidence Scoring
- Automatic Format Detection

Made with ❤️ using Docling and Gradio
"""

# Kept at module level (not inside the indented UI block) so the Markdown
# lines carry no leading indentation.
SUPPORTED_FORMATS_MD = """
### 📌 Supported Formats
- PDF Documents (*.pdf)
- Word Documents (*.docx)
- Text Files (*.txt)
- HTML Files (*.html)
- Markdown Files (*.md)

### 🔗 Example URLs
- ArXiv PDFs: https://arxiv.org/pdf/2408.08921.pdf
- Research Papers
- Documentation
"""

# Canned error results. Each tuple is positional and mirrors the five UI
# outputs: (content, metadata detail, sections, entities, confidence).
# The metadata slot is plain text here; `_canned_error` wraps it in a
# DataFrame before it is handed to the Dataframe component.
ERROR_MESSAGES = {
    "no_input": (
        "⚠️ No input provided",
        "Please upload a document or provide a URL.",
        "No sections available",
        "No entities available",
        "Confidence Score: 0.0",
    ),
    "invalid_url": (
        "⚠️ Invalid URL",
        "Please provide a valid URL to a document.",
        "No sections available",
        "No entities available",
        "Confidence Score: 0.0",
    ),
    "download_error": (
        "⚠️ Failed to download document",
        "Could not download the document from the provided URL.",
        "No sections available",
        "No entities available",
        "Confidence Score: 0.0",
    ),
    "unsupported_format": (
        "⚠️ Unsupported file format",
        "Please provide a file in one of the supported formats: PDF, DOCX, TXT, HTML, or MD.",
        "No sections available",
        "No entities available",
        "Confidence Score: 0.0",
    ),
    "processing_error": (
        "⚠️ Error processing document",
        "An error occurred while processing the document. Please try again with a different file.",
        "No sections available",
        "No entities available",
        "Confidence Score: 0.0",
    ),
}

# Seconds to wait for the remote server before giving up on a download.
DOWNLOAD_TIMEOUT = 30.0

# Initialize the document parser
parser = DocumentParser()


class DownloadError(Exception):
    """Raised when a document cannot be fetched from a URL."""


def _is_valid_url(url: str) -> bool:
    """Return True if *url* is an absolute http(s) URL with a host."""
    try:
        parts = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. unbalanced IPv6 brackets).
        return False
    return parts.scheme in ("http", "https") and bool(parts.netloc)


def _error_frame(detail: str) -> pd.DataFrame:
    """Wrap an error description in the Property/Value table layout."""
    return pd.DataFrame([{"Property": "Error", "Value": detail}])


def _canned_error(key: str):
    """Return the five UI outputs for the predefined error named *key*.

    The metadata output is a ``gr.Dataframe``; a bare string passed to that
    component is interpreted as a CSV path rather than displayed, so the
    detail text is wrapped in a one-row DataFrame.
    """
    headline, detail, sections, entities, confidence = ERROR_MESSAGES[key]
    return (headline, _error_frame(detail), sections, entities, confidence)


def _exception_outputs(error_msg: str):
    """Return the five UI outputs for an error described by *error_msg*."""
    return (
        error_msg,
        _error_frame(error_msg),
        "No sections available",
        "No entities available",
        "Confidence Score: 0.0",
    )


def _format_result(result):
    """Convert a parser result into the five values shown in the UI.

    Reads ``content``, ``metadata``, ``structured_content`` and
    ``confidence_score`` from *result*.
    """
    metadata_df = pd.DataFrame(
        [
            {"Property": key, "Value": str(value)}
            for key, value in result.metadata.dict().items()
        ],
        # Explicit columns keep the table shape stable when metadata is empty.
        columns=["Property", "Value"],
    )

    sections = result.structured_content.get("sections", [])
    sections_text = "\n\n".join(
        f"Section {number}:\n{section}"
        for number, section in enumerate(sections, start=1)
    )

    entities = result.structured_content.get("entities", {})
    if entities:
        # str() guards against entity values that are not plain strings.
        entities_text = "\n".join(
            f"{entity_type}: {', '.join(str(item) for item in entities_list)}"
            for entity_type, entities_list in entities.items()
        )
    else:
        entities_text = "No entities detected"

    return (
        result.content,  # Main content
        metadata_df,  # Metadata as table
        sections_text,  # Structured sections
        entities_text,  # Named entities
        f"Confidence Score: {result.confidence_score:.2f}",  # Confidence score
    )


def download_file(url: str, timeout: float = DOWNLOAD_TIMEOUT) -> str:
    """Download *url* into a temporary file and return the file's path.

    The temporary file keeps the extension found in the URL path. When the
    URL has no extension, one is guessed from the response ``Content-Type``;
    when the URL has no file name at all, ``.pdf`` is the final fallback.
    The caller is responsible for deleting the returned file.

    Args:
        url: Address of the document to fetch.
        timeout: Seconds to wait for the server before aborting.

    Raises:
        DownloadError: If the request fails, returns an error status, or the
            file cannot be written. The original exception is chained.
    """
    tmp_path = None
    try:
        # Extract filename from URL
        filename = os.path.basename(urlparse(url).path)
        suffix = os.path.splitext(filename)[1]

        # Without a timeout a stalled server would block this worker forever.
        response = requests.get(url, allow_redirects=True, timeout=timeout)
        response.raise_for_status()

        if not suffix:
            content_type = response.headers.get("Content-Type", "")
            mime = content_type.split(";")[0].strip()
            suffix = (mimetypes.guess_extension(mime) if mime else None) or ""
        if not suffix and not filename:
            suffix = ".pdf"  # Default when the URL names no file at all

        # delete=False: the parser reads the file after this function returns.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_path = tmp_file.name
            tmp_file.write(response.content)
        return tmp_path
    except Exception as e:
        # Do not leave a partially written temp file behind.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
        raise DownloadError(f"Failed to download file: {e}") from e


def process_input(file_input: Optional[str], url_input: Optional[str]):
    """Parse an uploaded file or a document URL and build the UI outputs.

    A non-empty URL takes precedence over an uploaded file.

    Args:
        file_input: Path of the uploaded file, or None if nothing was uploaded.
        url_input: URL typed by the user; may be empty or None.

    Returns:
        A 5-tuple ``(content, metadata DataFrame, sections text, entities
        text, confidence text)`` matching the outputs wired to the submit
        button. Errors are reported through the same tuple shape rather than
        raised.
    """
    url = (url_input or "").strip()

    # Check if we have any input
    if file_input is None and not url:
        return _canned_error("no_input")

    # NOTE(review): the server fetches arbitrary user-supplied URLs. Only
    # http(s) is allowed here, but internal hosts are not blocked — consider
    # an allow-list if this runs inside a private network.
    if url and not _is_valid_url(url):
        return _canned_error("invalid_url")

    temp_file = None
    try:
        if url:
            # Only the download itself maps to "download_error"; parser
            # failures must reach the parser-specific handlers below.
            try:
                temp_file = download_file(url)
            except Exception:
                return _canned_error("download_error")
            document_path = temp_file
        else:
            document_path = file_input

        result = parser.parse(document_path)
        return _format_result(result)
    except (UnsupportedFormatError, ParserError) as e:
        return _exception_outputs(f"⚠️ {e}")
    except Exception as e:
        return _exception_outputs(
            f"⚠️ Unexpected error: {e}\n{traceback.format_exc()}"
        )
    finally:
        # Cleanup temporary file if it was created (best effort).
        if temp_file:
            try:
                os.unlink(temp_file)
            except OSError:
                pass


# Create Gradio interface
with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
    gr.Markdown(f"# {TITLE}")
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
                type="filepath",
            )
            url_input = gr.Textbox(
                label="Or Enter Document URL",
                placeholder="https://example.com/document.pdf",
            )
            submit_btn = gr.Button("Process Document", variant="primary")

        with gr.Column():
            confidence = gr.Textbox(label="Processing Confidence")

    with gr.Tabs():
        with gr.TabItem("📝 Content"):
            content_output = gr.Textbox(
                label="Extracted Content",
                lines=10,
                max_lines=30,
            )
        with gr.TabItem("📊 Metadata"):
            metadata_output = gr.Dataframe(
                label="Document Metadata",
                headers=["Property", "Value"],
            )
        with gr.TabItem("📑 Sections"):
            sections_output = gr.Textbox(
                label="Document Sections",
                lines=10,
                max_lines=30,
            )
        with gr.TabItem("🏷️ Entities"):
            entities_output = gr.Textbox(
                label="Named Entities",
                lines=5,
                max_lines=15,
            )

    # Handle file submission; output order must match process_input's tuple.
    submit_btn.click(
        fn=process_input,
        inputs=[file_input, url_input],
        outputs=[
            content_output,
            metadata_output,
            sections_output,
            entities_output,
            confidence,
        ],
    )

    gr.Markdown(SUPPORTED_FORMATS_MD)
    gr.Markdown(ARTICLE)

# Launch the app
if __name__ == "__main__":
    iface.launch()